blob: e0a666f70da3666b10d7edbdea7650d0aebbdfcb [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Benjamin Peterson31616ea2011-10-01 00:11:09 -04004modified by Marc-Andre Lemburg <mal@lemburg.com>.
Guido van Rossumd57fd912000-03-10 22:53:23 +00005
Thomas Wouters477c8d52006-05-27 19:21:47 +00006Major speed upgrades to the method implementations at the Reykjavik
7NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8
Guido van Rossum16b1ad92000-08-03 16:24:25 +00009Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000010
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000011--------------------------------------------------------------------
12The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000013
Benjamin Peterson29060642009-01-31 22:14:21 +000014 Copyright (c) 1999 by Secret Labs AB
15 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000016
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000017By obtaining, using, and/or copying this software and/or its
18associated documentation, you agree that you have read, understood,
19and will comply with the following terms and conditions:
20
21Permission to use, copy, modify, and distribute this software and its
22associated documentation for any purpose and without fee is hereby
23granted, provided that the above copyright notice appears in all
24copies, and that both that copyright notice and this permission notice
25appear in supporting documentation, and that the name of Secret Labs
26AB or the author not be used in advertising or publicity pertaining to
27distribution of the software without specific, written prior
28permission.
29
30SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37--------------------------------------------------------------------
38
39*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000040
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Victor Stinner45876a92020-02-12 22:32:34 +010043#include "pycore_bytes_methods.h"
Victor Stinner9fc57a32018-11-07 00:44:03 +010044#include "pycore_fileutils.h"
Victor Stinner61691d82019-10-02 23:51:20 +020045#include "pycore_initconfig.h"
Victor Stinnerbcda8f12018-11-21 22:27:47 +010046#include "pycore_object.h"
Victor Stinner61691d82019-10-02 23:51:20 +020047#include "pycore_pathconfig.h"
Victor Stinner43fc3bb2019-05-02 11:54:20 -040048#include "pycore_pylifecycle.h"
Victor Stinner621cebe2018-11-12 16:53:38 +010049#include "pycore_pystate.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000050#include "ucnhash.h"
Raymond Hettingerac2ef652015-07-04 16:04:44 -070051#include "stringlib/eq.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000052
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000053#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000054#include <windows.h>
55#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000056
/* Uncomment to display statistics on interned strings at exit when
   using Valgrind or Insure++. */
59/* #define INTERNED_STATS 1 */
60
61
Larry Hastings61272b72014-01-07 12:41:53 -080062/*[clinic input]
INADA Naoki15f94592017-01-16 21:49:13 +090063class str "PyObject *" "&PyUnicode_Type"
Larry Hastings61272b72014-01-07 12:41:53 -080064[clinic start generated code]*/
INADA Naoki3ae20562017-01-16 20:41:20 +090065/*[clinic end generated code: output=da39a3ee5e6b4b0d input=4884c934de622cf6]*/
66
67/*[python input]
68class Py_UCS4_converter(CConverter):
69 type = 'Py_UCS4'
70 converter = 'convert_uc'
71
72 def converter_init(self):
73 if self.default is not unspecified:
74 self.c_default = ascii(self.default)
75 if len(self.c_default) > 4 or self.c_default[0] != "'":
76 self.c_default = hex(ord(self.default))
77
78[python start generated code]*/
79/*[python end generated code: output=da39a3ee5e6b4b0d input=88f5dd06cd8e7a61]*/
Larry Hastings44e2eaa2013-11-23 15:37:55 -080080
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000081/* --- Globals ------------------------------------------------------------
82
Serhiy Storchaka05997252013-01-26 12:14:02 +020083NOTE: In the interpreter's initialization phase, some globals are currently
84 initialized dynamically as needed. In the process Unicode objects may
85 be created before the Unicode type is ready.
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000086
87*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000088
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000089
90#ifdef __cplusplus
91extern "C" {
92#endif
93
/* Largest code point defined by Unicode 6.0: U+10FFFF (1,114,111). */
#define MAX_UNICODE 0x10ffff

/* Object validation used by the assert()s in this file.  Release builds
   only do the type check; debug builds call the consistency checker with
   check_content == 0. */
#ifndef Py_DEBUG
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#else
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
#endif
Victor Stinnerfb5f5f22011-09-28 21:39:49 +0200102
/* Field accessors for unicode objects.  Macros whose expansion contains an
   assert() validate the object first; the others are a plain cast followed
   by a member access. */

/* utf8 member of the PyCompactUnicodeObject header. */
#define _PyUnicode_UTF8(op) \
    (((PyCompactUnicodeObject *)(op))->utf8)

/* UTF-8 bytes of a ready string: a compact ASCII string uses the bytes that
   directly follow its PyASCIIObject header, any other string its utf8
   member. */
#define PyUnicode_UTF8(op) \
    (assert(_PyUnicode_CHECK(op)), \
     assert(PyUnicode_IS_READY(op)), \
     PyUnicode_IS_COMPACT_ASCII(op) \
         ? ((char *)((PyASCIIObject *)(op) + 1)) \
         : _PyUnicode_UTF8(op))

/* utf8_length member of the PyCompactUnicodeObject header. */
#define _PyUnicode_UTF8_LENGTH(op) \
    (((PyCompactUnicodeObject *)(op))->utf8_length)

/* Length of the UTF-8 form of a ready string: the string length for
   compact ASCII strings, the utf8_length member otherwise. */
#define PyUnicode_UTF8_LENGTH(op) \
    (assert(_PyUnicode_CHECK(op)), \
     assert(PyUnicode_IS_READY(op)), \
     PyUnicode_IS_COMPACT_ASCII(op) \
         ? ((PyASCIIObject *)(op))->length \
         : _PyUnicode_UTF8_LENGTH(op))

/* Unchecked member access. */
#define _PyUnicode_WSTR(op) \
    (((PyASCIIObject *)(op))->wstr)
#define _PyUnicode_WSTR_LENGTH(op) \
    (((PyCompactUnicodeObject *)(op))->wstr_length)
#define _PyUnicode_LENGTH(op) \
    (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op) \
    (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op) \
    (((PyASCIIObject *)(op))->hash)

/* Member access preceded by an object check. */
#define _PyUnicode_KIND(op) \
    (assert(_PyUnicode_CHECK(op)), \
     ((PyASCIIObject *)(op))->state.kind)
#define _PyUnicode_GET_LENGTH(op) \
    (assert(_PyUnicode_CHECK(op)), \
     ((PyASCIIObject *)(op))->length)

/* data.any member of a PyUnicodeObject (unchecked). */
#define _PyUnicode_DATA_ANY(op) \
    (((PyUnicodeObject *)(op))->data.any)

/* Local replacement for PyUnicode_READY: evaluates to 0 when the string is
   already ready, otherwise to the result of _PyUnicode_Ready(). */
#undef PyUnicode_READY
#define PyUnicode_READY(op) \
    (assert(_PyUnicode_CHECK(op)), \
     (!PyUnicode_IS_READY(op) ? _PyUnicode_Ready(op) : 0))
Victor Stinner910337b2011-10-03 03:20:16 +0200144
/* True when the utf8 pointer of `op` is the same address as its character
   data.  Must not be used on compact ASCII strings (asserted). */
#define _PyUnicode_SHARE_UTF8(op) \
    (assert(_PyUnicode_CHECK(op)), \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)), \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))

/* True when the wstr pointer of `op` is the same address as its character
   data.  Both sides of the comparison use the macro argument `op`, so the
   macro works regardless of how the caller names its variable. */
#define _PyUnicode_SHARE_WSTR(op) \
    (assert(_PyUnicode_CHECK(op)), \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
152
/* True if the Unicode object has its own UTF-8 memory block: it is not a
   compact ASCII string, its utf8 pointer is set, and that pointer is not
   the character data itself. */
#define _PyUnicode_HAS_UTF8_MEMORY(op) \
    (!PyUnicode_IS_COMPACT_ASCII(op) \
     && _PyUnicode_UTF8(op) != NULL \
     && _PyUnicode_UTF8(op) != PyUnicode_DATA(op))

/* True if the Unicode object has its own wstr memory block: the wstr
   pointer is set and either the string is not ready yet or the pointer is
   not the character data itself. */
#define _PyUnicode_HAS_WSTR_MEMORY(op) \
    (_PyUnicode_WSTR(op) != NULL \
     && (!PyUnicode_IS_READY(op) \
         || _PyUnicode_WSTR(op) != PyUnicode_DATA(op)))
166
/* Copy a run of characters while converting the element type.

   from_type and to_type must be valid type names.  begin and end delimit
   the source characters and should be of type "from_type *".  to should be
   of type "to_type *" and points to the buffer receiving the converted
   characters.

   The bulk of the run is copied four elements per iteration; the remaining
   0-3 elements are copied one at a time. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                             \
        to_type *_to = (to_type *)(to);                              \
        const from_type *_iter = (const from_type *)(begin);         \
        const from_type *_end = (const from_type *)(end);            \
        Py_ssize_t _count = _end - _iter;                            \
        const from_type *_unrolled_end =                             \
            _iter + _Py_SIZE_ROUND_DOWN(_count, 4);                  \
        for (; _iter < _unrolled_end; _iter += 4, _to += 4) {        \
            _to[0] = (to_type) _iter[0];                             \
            _to[1] = (to_type) _iter[1];                             \
            _to[2] = (to_type) _iter[2];                             \
            _to[3] = (to_type) _iter[3];                             \
        }                                                            \
        for (; _iter < _end; ++_iter, ++_to) {                       \
            *_to = (to_type) *_iter;                                 \
        }                                                            \
    } while (0)
Victor Stinner829c0ad2011-10-03 01:08:02 +0200190
/* Divisor for the overallocation amount (2 corresponds to 50%, 4 to 25%). */
#ifndef MS_WINDOWS
   /* On Linux, overallocating by 25% is the best factor */
#  define OVERALLOCATE_FACTOR 4
#else
   /* On Windows, overallocating by 50% is the best factor */
#  define OVERALLOCATE_FACTOR 2
#endif
198
Walter Dörwald16807132007-05-25 13:52:07 +0000199/* This dictionary holds all interned unicode strings. Note that references
200 to strings in this dictionary are *not* counted in the string's ob_refcnt.
201 When the interned string reaches a refcnt of 0 the string deallocation
202 function will delete the reference from this dictionary.
203
204 Another way to look at this is that to say that the actual reference
Guido van Rossum98297ee2007-11-06 21:34:58 +0000205 count of a string is: s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000206*/
Serhiy Storchaka05997252013-01-26 12:14:02 +0200207static PyObject *interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +0000208
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000209/* The empty Unicode object is shared to improve performance. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200210static PyObject *unicode_empty = NULL;
Serhiy Storchaka05997252013-01-26 12:14:02 +0200211
Serhiy Storchaka678db842013-01-26 12:16:36 +0200212#define _Py_INCREF_UNICODE_EMPTY() \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200213 do { \
214 if (unicode_empty != NULL) \
215 Py_INCREF(unicode_empty); \
216 else { \
Serhiy Storchaka678db842013-01-26 12:16:36 +0200217 unicode_empty = PyUnicode_New(0, 0); \
218 if (unicode_empty != NULL) { \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200219 Py_INCREF(unicode_empty); \
Serhiy Storchaka678db842013-01-26 12:16:36 +0200220 assert(_PyUnicode_CheckConsistency(unicode_empty, 1)); \
221 } \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200222 } \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200223 } while (0)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000224
Serhiy Storchaka678db842013-01-26 12:16:36 +0200225#define _Py_RETURN_UNICODE_EMPTY() \
226 do { \
227 _Py_INCREF_UNICODE_EMPTY(); \
228 return unicode_empty; \
229 } while (0)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000230
Victor Stinner59423e32018-11-26 13:40:01 +0100231static inline void
232unicode_fill(enum PyUnicode_Kind kind, void *data, Py_UCS4 value,
233 Py_ssize_t start, Py_ssize_t length)
234{
235 assert(0 <= start);
236 assert(kind != PyUnicode_WCHAR_KIND);
237 switch (kind) {
238 case PyUnicode_1BYTE_KIND: {
Victor Stinner163403a2018-11-27 12:41:17 +0100239 assert(value <= 0xff);
Victor Stinner59423e32018-11-26 13:40:01 +0100240 Py_UCS1 ch = (unsigned char)value;
241 Py_UCS1 *to = (Py_UCS1 *)data + start;
242 memset(to, ch, length);
243 break;
244 }
245 case PyUnicode_2BYTE_KIND: {
Victor Stinner163403a2018-11-27 12:41:17 +0100246 assert(value <= 0xffff);
Victor Stinner59423e32018-11-26 13:40:01 +0100247 Py_UCS2 ch = (Py_UCS2)value;
248 Py_UCS2 *to = (Py_UCS2 *)data + start;
249 const Py_UCS2 *end = to + length;
250 for (; to < end; ++to) *to = ch;
251 break;
252 }
253 case PyUnicode_4BYTE_KIND: {
Victor Stinner163403a2018-11-27 12:41:17 +0100254 assert(value <= MAX_UNICODE);
Victor Stinner59423e32018-11-26 13:40:01 +0100255 Py_UCS4 ch = value;
256 Py_UCS4 * to = (Py_UCS4 *)data + start;
257 const Py_UCS4 *end = to + length;
258 for (; to < end; ++to) *to = ch;
259 break;
260 }
261 default: Py_UNREACHABLE();
262 }
263}
264
265
Victor Stinner8a1a6cf2013-04-14 02:35:33 +0200266/* Forward declaration */
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700267static inline int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +0200268_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch);
Inada Naoki770847a2019-06-24 12:30:24 +0900269static inline void
270_PyUnicodeWriter_InitWithBuffer(_PyUnicodeWriter *writer, PyObject *buffer);
Victor Stinner709d23d2019-05-02 14:56:30 -0400271static PyObject *
272unicode_encode_utf8(PyObject *unicode, _Py_error_handler error_handler,
273 const char *errors);
274static PyObject *
275unicode_decode_utf8(const char *s, Py_ssize_t size,
276 _Py_error_handler error_handler, const char *errors,
277 Py_ssize_t *consumed);
Victor Stinner8a1a6cf2013-04-14 02:35:33 +0200278
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200279/* List of static strings. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200280static _Py_Identifier *static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200281
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000282/* Single character Unicode strings in the Latin-1 range are being
283 shared as well. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200284static PyObject *unicode_latin1[256] = {NULL};
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000285
/* Fast detection of the most frequent whitespace characters.
   One entry per ASCII code point (128 entries); an entry is 1 for:
     0x09 CHARACTER TABULATION   0x0A LINE FEED
     0x0B LINE TABULATION        0x0C FORM FEED
     0x0D CARRIAGE RETURN
     0x1C FILE SEPARATOR         0x1D GROUP SEPARATOR
     0x1E RECORD SEPARATOR       0x1F UNIT SEPARATOR
     0x20 SPACE
   and 0 for everything else. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00-0x0F */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10-0x1F */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20-0x2F */ 1, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30-0x3F */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x40-0x4F */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50-0x5F */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60-0x6F */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70-0x7F */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0
};
316
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200317/* forward */
Victor Stinnerfe226c02011-10-03 03:52:20 +0200318static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200319static PyObject* get_latin1_char(unsigned char ch);
Victor Stinner488fa492011-12-12 00:01:39 +0100320static int unicode_modifiable(PyObject *unicode);
321
Victor Stinnerfe226c02011-10-03 03:52:20 +0200322
Alexander Belopolsky40018472011-02-26 01:02:56 +0000323static PyObject *
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100324_PyUnicode_FromUCS1(const Py_UCS1 *s, Py_ssize_t size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200325static PyObject *
326_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
327static PyObject *
328_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
329
330static PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +0000331unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000332 PyObject **errorHandler,const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +0100333 PyObject *unicode, PyObject **exceptionObject,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000334 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
335
Alexander Belopolsky40018472011-02-26 01:02:56 +0000336static void
337raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300338 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +0100339 PyObject *unicode,
340 Py_ssize_t startpos, Py_ssize_t endpos,
341 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000342
/* Fast detection of line break characters.
   One entry per ASCII code point (128 entries); an entry is 1 for:
     0x0A LINE FEED              0x0B LINE TABULATION
     0x0C FORM FEED              0x0D CARRIAGE RETURN
     0x1C FILE SEPARATOR         0x1D GROUP SEPARATOR
     0x1E RECORD SEPARATOR
   and 0 for everything else. */
static const unsigned char ascii_linebreak[] = {
    /* 0x00-0x0F */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 1, 1, 1, 1, 0, 0,
    /* 0x10-0x1F */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20-0x2F */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30-0x3F */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x40-0x4F */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50-0x5F */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60-0x6F */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70-0x7F */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0
};
370
INADA Naoki3ae20562017-01-16 20:41:20 +0900371static int convert_uc(PyObject *obj, void *addr);
372
Serhiy Storchaka1009bf12015-04-03 23:53:51 +0300373#include "clinic/unicodeobject.c.h"
374
Victor Stinner3d4226a2018-08-29 22:21:32 +0200375_Py_error_handler
376_Py_GetErrorHandler(const char *errors)
Victor Stinner50149202015-09-22 00:26:54 +0200377{
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200378 if (errors == NULL || strcmp(errors, "strict") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200379 return _Py_ERROR_STRICT;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200380 }
381 if (strcmp(errors, "surrogateescape") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200382 return _Py_ERROR_SURROGATEESCAPE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200383 }
384 if (strcmp(errors, "replace") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200385 return _Py_ERROR_REPLACE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200386 }
387 if (strcmp(errors, "ignore") == 0) {
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200388 return _Py_ERROR_IGNORE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200389 }
390 if (strcmp(errors, "backslashreplace") == 0) {
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200391 return _Py_ERROR_BACKSLASHREPLACE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200392 }
393 if (strcmp(errors, "surrogatepass") == 0) {
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200394 return _Py_ERROR_SURROGATEPASS;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200395 }
396 if (strcmp(errors, "xmlcharrefreplace") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200397 return _Py_ERROR_XMLCHARREFREPLACE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200398 }
Victor Stinner50149202015-09-22 00:26:54 +0200399 return _Py_ERROR_OTHER;
400}
401
Victor Stinner709d23d2019-05-02 14:56:30 -0400402
403static _Py_error_handler
404get_error_handler_wide(const wchar_t *errors)
405{
406 if (errors == NULL || wcscmp(errors, L"strict") == 0) {
407 return _Py_ERROR_STRICT;
408 }
409 if (wcscmp(errors, L"surrogateescape") == 0) {
410 return _Py_ERROR_SURROGATEESCAPE;
411 }
412 if (wcscmp(errors, L"replace") == 0) {
413 return _Py_ERROR_REPLACE;
414 }
415 if (wcscmp(errors, L"ignore") == 0) {
416 return _Py_ERROR_IGNORE;
417 }
418 if (wcscmp(errors, L"backslashreplace") == 0) {
419 return _Py_ERROR_BACKSLASHREPLACE;
420 }
421 if (wcscmp(errors, L"surrogatepass") == 0) {
422 return _Py_ERROR_SURROGATEPASS;
423 }
424 if (wcscmp(errors, L"xmlcharrefreplace") == 0) {
425 return _Py_ERROR_XMLCHARREFREPLACE;
426 }
427 return _Py_ERROR_OTHER;
428}
429
430
Victor Stinner22eb6892019-06-26 00:51:05 +0200431static inline int
432unicode_check_encoding_errors(const char *encoding, const char *errors)
433{
434 if (encoding == NULL && errors == NULL) {
435 return 0;
436 }
437
438 PyInterpreterState *interp = _PyInterpreterState_GET_UNSAFE();
439#ifndef Py_DEBUG
440 /* In release mode, only check in development mode (-X dev) */
441 if (!interp->config.dev_mode) {
442 return 0;
443 }
444#else
445 /* Always check in debug mode */
446#endif
447
448 /* Avoid calling _PyCodec_Lookup() and PyCodec_LookupError() before the
449 codec registry is ready: before_PyUnicode_InitEncodings() is called. */
450 if (!interp->fs_codec.encoding) {
451 return 0;
452 }
453
454 if (encoding != NULL) {
455 PyObject *handler = _PyCodec_Lookup(encoding);
456 if (handler == NULL) {
457 return -1;
458 }
459 Py_DECREF(handler);
460 }
461
462 if (errors != NULL) {
463 PyObject *handler = PyCodec_LookupError(errors);
464 if (handler == NULL) {
465 return -1;
466 }
467 Py_DECREF(handler);
468 }
469 return 0;
470}
471
472
Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300473/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
474 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000475Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000476PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000477{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000478#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000479 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000480#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000481 /* This is actually an illegal character, so it should
482 not be passed to unichr. */
483 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000484#endif
485}
486
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +0200487int
Victor Stinner7931d9a2011-11-04 00:22:48 +0100488_PyUnicode_CheckConsistency(PyObject *op, int check_content)
Victor Stinner910337b2011-10-03 03:20:16 +0200489{
Victor Stinner68762572019-10-07 18:42:01 +0200490#define CHECK(expr) \
491 do { if (!(expr)) { _PyObject_ASSERT_FAILED_MSG(op, Py_STRINGIFY(expr)); } } while (0)
492
Victor Stinner910337b2011-10-03 03:20:16 +0200493 PyASCIIObject *ascii;
494 unsigned int kind;
495
Victor Stinner68762572019-10-07 18:42:01 +0200496 assert(op != NULL);
497 CHECK(PyUnicode_Check(op));
Victor Stinner910337b2011-10-03 03:20:16 +0200498
499 ascii = (PyASCIIObject *)op;
500 kind = ascii->state.kind;
501
Victor Stinnera3b334d2011-10-03 13:53:37 +0200502 if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
Victor Stinner68762572019-10-07 18:42:01 +0200503 CHECK(kind == PyUnicode_1BYTE_KIND);
504 CHECK(ascii->state.ready == 1);
Victor Stinner910337b2011-10-03 03:20:16 +0200505 }
Victor Stinnera41463c2011-10-04 01:05:08 +0200506 else {
Victor Stinner85041a52011-10-03 14:42:39 +0200507 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
Victor Stinner7f11ad42011-10-04 00:00:20 +0200508 void *data;
Victor Stinner910337b2011-10-03 03:20:16 +0200509
Victor Stinnera41463c2011-10-04 01:05:08 +0200510 if (ascii->state.compact == 1) {
511 data = compact + 1;
Victor Stinner68762572019-10-07 18:42:01 +0200512 CHECK(kind == PyUnicode_1BYTE_KIND
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200513 || kind == PyUnicode_2BYTE_KIND
514 || kind == PyUnicode_4BYTE_KIND);
Victor Stinner68762572019-10-07 18:42:01 +0200515 CHECK(ascii->state.ascii == 0);
516 CHECK(ascii->state.ready == 1);
517 CHECK(compact->utf8 != data);
Victor Stinnere30c0a12011-11-04 20:54:05 +0100518 }
519 else {
Victor Stinnera41463c2011-10-04 01:05:08 +0200520 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
521
522 data = unicode->data.any;
523 if (kind == PyUnicode_WCHAR_KIND) {
Victor Stinner68762572019-10-07 18:42:01 +0200524 CHECK(ascii->length == 0);
525 CHECK(ascii->hash == -1);
526 CHECK(ascii->state.compact == 0);
527 CHECK(ascii->state.ascii == 0);
528 CHECK(ascii->state.ready == 0);
529 CHECK(ascii->state.interned == SSTATE_NOT_INTERNED);
530 CHECK(ascii->wstr != NULL);
531 CHECK(data == NULL);
532 CHECK(compact->utf8 == NULL);
Victor Stinnera41463c2011-10-04 01:05:08 +0200533 }
534 else {
Victor Stinner68762572019-10-07 18:42:01 +0200535 CHECK(kind == PyUnicode_1BYTE_KIND
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200536 || kind == PyUnicode_2BYTE_KIND
537 || kind == PyUnicode_4BYTE_KIND);
Victor Stinner68762572019-10-07 18:42:01 +0200538 CHECK(ascii->state.compact == 0);
539 CHECK(ascii->state.ready == 1);
540 CHECK(data != NULL);
Victor Stinnera41463c2011-10-04 01:05:08 +0200541 if (ascii->state.ascii) {
Victor Stinner68762572019-10-07 18:42:01 +0200542 CHECK(compact->utf8 == data);
543 CHECK(compact->utf8_length == ascii->length);
Victor Stinnera41463c2011-10-04 01:05:08 +0200544 }
545 else
Victor Stinner68762572019-10-07 18:42:01 +0200546 CHECK(compact->utf8 != data);
Victor Stinnera41463c2011-10-04 01:05:08 +0200547 }
548 }
549 if (kind != PyUnicode_WCHAR_KIND) {
Victor Stinner7f11ad42011-10-04 00:00:20 +0200550 if (
551#if SIZEOF_WCHAR_T == 2
552 kind == PyUnicode_2BYTE_KIND
553#else
554 kind == PyUnicode_4BYTE_KIND
555#endif
556 )
Victor Stinnera41463c2011-10-04 01:05:08 +0200557 {
Victor Stinner68762572019-10-07 18:42:01 +0200558 CHECK(ascii->wstr == data);
559 CHECK(compact->wstr_length == ascii->length);
Victor Stinnera41463c2011-10-04 01:05:08 +0200560 } else
Victor Stinner68762572019-10-07 18:42:01 +0200561 CHECK(ascii->wstr != data);
Victor Stinner910337b2011-10-03 03:20:16 +0200562 }
Victor Stinnera41463c2011-10-04 01:05:08 +0200563
564 if (compact->utf8 == NULL)
Victor Stinner68762572019-10-07 18:42:01 +0200565 CHECK(compact->utf8_length == 0);
Victor Stinnera41463c2011-10-04 01:05:08 +0200566 if (ascii->wstr == NULL)
Victor Stinner68762572019-10-07 18:42:01 +0200567 CHECK(compact->wstr_length == 0);
Victor Stinner910337b2011-10-03 03:20:16 +0200568 }
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200569
570 /* check that the best kind is used: O(n) operation */
571 if (check_content && kind != PyUnicode_WCHAR_KIND) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200572 Py_ssize_t i;
573 Py_UCS4 maxchar = 0;
Victor Stinner718fbf02012-04-26 00:39:37 +0200574 void *data;
575 Py_UCS4 ch;
576
577 data = PyUnicode_DATA(ascii);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200578 for (i=0; i < ascii->length; i++)
579 {
Victor Stinner718fbf02012-04-26 00:39:37 +0200580 ch = PyUnicode_READ(kind, data, i);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200581 if (ch > maxchar)
582 maxchar = ch;
583 }
584 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinner77faf692011-11-20 18:56:05 +0100585 if (ascii->state.ascii == 0) {
Victor Stinner68762572019-10-07 18:42:01 +0200586 CHECK(maxchar >= 128);
587 CHECK(maxchar <= 255);
Victor Stinner77faf692011-11-20 18:56:05 +0100588 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200589 else
Victor Stinner68762572019-10-07 18:42:01 +0200590 CHECK(maxchar < 128);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200591 }
Victor Stinner77faf692011-11-20 18:56:05 +0100592 else if (kind == PyUnicode_2BYTE_KIND) {
Victor Stinner68762572019-10-07 18:42:01 +0200593 CHECK(maxchar >= 0x100);
594 CHECK(maxchar <= 0xFFFF);
Victor Stinner77faf692011-11-20 18:56:05 +0100595 }
596 else {
Victor Stinner68762572019-10-07 18:42:01 +0200597 CHECK(maxchar >= 0x10000);
598 CHECK(maxchar <= MAX_UNICODE);
Victor Stinner77faf692011-11-20 18:56:05 +0100599 }
Victor Stinner68762572019-10-07 18:42:01 +0200600 CHECK(PyUnicode_READ(kind, data, ascii->length) == 0);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200601 }
Benjamin Petersonccc51c12011-10-03 19:34:12 -0400602 return 1;
Victor Stinner68762572019-10-07 18:42:01 +0200603
604#undef CHECK
Benjamin Petersonccc51c12011-10-03 19:34:12 -0400605}
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200606
Victor Stinner910337b2011-10-03 03:20:16 +0200607
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100608static PyObject*
609unicode_result_wchar(PyObject *unicode)
610{
611#ifndef Py_DEBUG
612 Py_ssize_t len;
613
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100614 len = _PyUnicode_WSTR_LENGTH(unicode);
615 if (len == 0) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100616 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200617 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100618 }
619
620 if (len == 1) {
621 wchar_t ch = _PyUnicode_WSTR(unicode)[0];
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100622 if ((Py_UCS4)ch < 256) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100623 PyObject *latin1_char = get_latin1_char((unsigned char)ch);
624 Py_DECREF(unicode);
625 return latin1_char;
626 }
627 }
628
629 if (_PyUnicode_Ready(unicode) < 0) {
Victor Stinneraa771272012-10-04 02:32:58 +0200630 Py_DECREF(unicode);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100631 return NULL;
632 }
633#else
Victor Stinneraa771272012-10-04 02:32:58 +0200634 assert(Py_REFCNT(unicode) == 1);
635
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100636 /* don't make the result ready in debug mode to ensure that the caller
637 makes the string ready before using it */
638 assert(_PyUnicode_CheckConsistency(unicode, 1));
639#endif
640 return unicode;
641}
642
643static PyObject*
644unicode_result_ready(PyObject *unicode)
645{
646 Py_ssize_t length;
647
648 length = PyUnicode_GET_LENGTH(unicode);
649 if (length == 0) {
650 if (unicode != unicode_empty) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100651 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200652 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100653 }
654 return unicode_empty;
655 }
656
657 if (length == 1) {
Victor Stinner69ed0f42013-04-09 21:48:24 +0200658 void *data = PyUnicode_DATA(unicode);
659 int kind = PyUnicode_KIND(unicode);
660 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100661 if (ch < 256) {
662 PyObject *latin1_char = unicode_latin1[ch];
663 if (latin1_char != NULL) {
664 if (unicode != latin1_char) {
665 Py_INCREF(latin1_char);
666 Py_DECREF(unicode);
667 }
668 return latin1_char;
669 }
670 else {
671 assert(_PyUnicode_CheckConsistency(unicode, 1));
672 Py_INCREF(unicode);
673 unicode_latin1[ch] = unicode;
674 return unicode;
675 }
676 }
677 }
678
679 assert(_PyUnicode_CheckConsistency(unicode, 1));
680 return unicode;
681}
682
683static PyObject*
684unicode_result(PyObject *unicode)
685{
686 assert(_PyUnicode_CHECK(unicode));
687 if (PyUnicode_IS_READY(unicode))
688 return unicode_result_ready(unicode);
689 else
690 return unicode_result_wchar(unicode);
691}
692
Victor Stinnerc4b49542011-12-11 22:44:26 +0100693static PyObject*
694unicode_result_unchanged(PyObject *unicode)
695{
696 if (PyUnicode_CheckExact(unicode)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -0500697 if (PyUnicode_READY(unicode) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +0100698 return NULL;
699 Py_INCREF(unicode);
700 return unicode;
701 }
702 else
703 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +0100704 return _PyUnicode_Copy(unicode);
Victor Stinnerc4b49542011-12-11 22:44:26 +0100705}
706
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200707/* Implementation of the "backslashreplace" error handler for 8-bit encodings:
708 ASCII, Latin1, UTF-8, etc. */
709static char*
Victor Stinnerad771582015-10-09 12:38:53 +0200710backslashreplace(_PyBytesWriter *writer, char *str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200711 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
712{
Victor Stinnerad771582015-10-09 12:38:53 +0200713 Py_ssize_t size, i;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200714 Py_UCS4 ch;
715 enum PyUnicode_Kind kind;
716 void *data;
717
718 assert(PyUnicode_IS_READY(unicode));
719 kind = PyUnicode_KIND(unicode);
720 data = PyUnicode_DATA(unicode);
721
722 size = 0;
723 /* determine replacement size */
724 for (i = collstart; i < collend; ++i) {
725 Py_ssize_t incr;
726
727 ch = PyUnicode_READ(kind, data, i);
728 if (ch < 0x100)
729 incr = 2+2;
730 else if (ch < 0x10000)
731 incr = 2+4;
732 else {
733 assert(ch <= MAX_UNICODE);
Victor Stinner3fa36ff2015-10-09 03:37:11 +0200734 incr = 2+8;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200735 }
736 if (size > PY_SSIZE_T_MAX - incr) {
737 PyErr_SetString(PyExc_OverflowError,
738 "encoded result is too long for a Python string");
739 return NULL;
740 }
741 size += incr;
742 }
743
Victor Stinnerad771582015-10-09 12:38:53 +0200744 str = _PyBytesWriter_Prepare(writer, str, size);
745 if (str == NULL)
746 return NULL;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200747
748 /* generate replacement */
749 for (i = collstart; i < collend; ++i) {
750 ch = PyUnicode_READ(kind, data, i);
Victor Stinner797485e2015-10-09 03:17:30 +0200751 *str++ = '\\';
752 if (ch >= 0x00010000) {
753 *str++ = 'U';
754 *str++ = Py_hexdigits[(ch>>28)&0xf];
755 *str++ = Py_hexdigits[(ch>>24)&0xf];
756 *str++ = Py_hexdigits[(ch>>20)&0xf];
757 *str++ = Py_hexdigits[(ch>>16)&0xf];
758 *str++ = Py_hexdigits[(ch>>12)&0xf];
759 *str++ = Py_hexdigits[(ch>>8)&0xf];
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200760 }
Victor Stinner797485e2015-10-09 03:17:30 +0200761 else if (ch >= 0x100) {
762 *str++ = 'u';
763 *str++ = Py_hexdigits[(ch>>12)&0xf];
764 *str++ = Py_hexdigits[(ch>>8)&0xf];
765 }
766 else
767 *str++ = 'x';
768 *str++ = Py_hexdigits[(ch>>4)&0xf];
769 *str++ = Py_hexdigits[ch&0xf];
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200770 }
771 return str;
772}
773
774/* Implementation of the "xmlcharrefreplace" error handler for 8-bit encodings:
775 ASCII, Latin1, UTF-8, etc. */
776static char*
Victor Stinnerad771582015-10-09 12:38:53 +0200777xmlcharrefreplace(_PyBytesWriter *writer, char *str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200778 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
779{
Victor Stinnerad771582015-10-09 12:38:53 +0200780 Py_ssize_t size, i;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200781 Py_UCS4 ch;
782 enum PyUnicode_Kind kind;
783 void *data;
784
785 assert(PyUnicode_IS_READY(unicode));
786 kind = PyUnicode_KIND(unicode);
787 data = PyUnicode_DATA(unicode);
788
789 size = 0;
790 /* determine replacement size */
791 for (i = collstart; i < collend; ++i) {
792 Py_ssize_t incr;
793
794 ch = PyUnicode_READ(kind, data, i);
795 if (ch < 10)
796 incr = 2+1+1;
797 else if (ch < 100)
798 incr = 2+2+1;
799 else if (ch < 1000)
800 incr = 2+3+1;
801 else if (ch < 10000)
802 incr = 2+4+1;
803 else if (ch < 100000)
804 incr = 2+5+1;
805 else if (ch < 1000000)
806 incr = 2+6+1;
807 else {
808 assert(ch <= MAX_UNICODE);
809 incr = 2+7+1;
810 }
811 if (size > PY_SSIZE_T_MAX - incr) {
812 PyErr_SetString(PyExc_OverflowError,
813 "encoded result is too long for a Python string");
814 return NULL;
815 }
816 size += incr;
817 }
818
Victor Stinnerad771582015-10-09 12:38:53 +0200819 str = _PyBytesWriter_Prepare(writer, str, size);
820 if (str == NULL)
821 return NULL;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200822
823 /* generate replacement */
824 for (i = collstart; i < collend; ++i) {
825 str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
826 }
827 return str;
828}
829
Thomas Wouters477c8d52006-05-27 19:21:47 +0000830/* --- Bloom Filters ----------------------------------------------------- */
831
832/* stuff to implement simple "bloom filters" for Unicode characters.
833 to keep things simple, we use a single bitmask, using the least 5
834 bits from each unicode characters as the bit index. */
835
836/* the linebreak mask is set up by Unicode_Init below */
837
Antoine Pitrouf068f942010-01-13 14:19:12 +0000838#if LONG_BIT >= 128
839#define BLOOM_WIDTH 128
840#elif LONG_BIT >= 64
841#define BLOOM_WIDTH 64
842#elif LONG_BIT >= 32
843#define BLOOM_WIDTH 32
844#else
845#error "LONG_BIT is smaller than 32"
846#endif
847
Thomas Wouters477c8d52006-05-27 19:21:47 +0000848#define BLOOM_MASK unsigned long
849
Serhiy Storchaka05997252013-01-26 12:14:02 +0200850static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000851
Antoine Pitrouf068f942010-01-13 14:19:12 +0000852#define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000853
Benjamin Peterson29060642009-01-31 22:14:21 +0000854#define BLOOM_LINEBREAK(ch) \
855 ((ch) < 128U ? ascii_linebreak[(ch)] : \
856 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000857
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700858static inline BLOOM_MASK
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200859make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000860{
Victor Stinnera85af502013-04-09 21:53:54 +0200861#define BLOOM_UPDATE(TYPE, MASK, PTR, LEN) \
862 do { \
863 TYPE *data = (TYPE *)PTR; \
864 TYPE *end = data + LEN; \
865 Py_UCS4 ch; \
866 for (; data != end; data++) { \
867 ch = *data; \
868 MASK |= (1UL << (ch & (BLOOM_WIDTH - 1))); \
869 } \
870 break; \
871 } while (0)
872
Thomas Wouters477c8d52006-05-27 19:21:47 +0000873 /* calculate simple bloom-style bitmask for a given unicode string */
874
Antoine Pitrouf068f942010-01-13 14:19:12 +0000875 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000876
877 mask = 0;
Victor Stinnera85af502013-04-09 21:53:54 +0200878 switch (kind) {
879 case PyUnicode_1BYTE_KIND:
880 BLOOM_UPDATE(Py_UCS1, mask, ptr, len);
881 break;
882 case PyUnicode_2BYTE_KIND:
883 BLOOM_UPDATE(Py_UCS2, mask, ptr, len);
884 break;
885 case PyUnicode_4BYTE_KIND:
886 BLOOM_UPDATE(Py_UCS4, mask, ptr, len);
887 break;
888 default:
Barry Warsawb2e57942017-09-14 18:13:16 -0700889 Py_UNREACHABLE();
Victor Stinnera85af502013-04-09 21:53:54 +0200890 }
Thomas Wouters477c8d52006-05-27 19:21:47 +0000891 return mask;
Victor Stinnera85af502013-04-09 21:53:54 +0200892
893#undef BLOOM_UPDATE
Thomas Wouters477c8d52006-05-27 19:21:47 +0000894}
895
Serhiy Storchaka21a663e2016-04-13 15:37:23 +0300896static int
897ensure_unicode(PyObject *obj)
898{
899 if (!PyUnicode_Check(obj)) {
900 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +0200901 "must be str, not %.100s",
902 Py_TYPE(obj)->tp_name);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +0300903 return -1;
904 }
905 return PyUnicode_READY(obj);
906}
907
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200908/* Compilation of templated routines */
909
910#include "stringlib/asciilib.h"
911#include "stringlib/fastsearch.h"
912#include "stringlib/partition.h"
913#include "stringlib/split.h"
914#include "stringlib/count.h"
915#include "stringlib/find.h"
916#include "stringlib/find_max_char.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200917#include "stringlib/undef.h"
918
919#include "stringlib/ucs1lib.h"
920#include "stringlib/fastsearch.h"
921#include "stringlib/partition.h"
922#include "stringlib/split.h"
923#include "stringlib/count.h"
924#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300925#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200926#include "stringlib/find_max_char.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200927#include "stringlib/undef.h"
928
929#include "stringlib/ucs2lib.h"
930#include "stringlib/fastsearch.h"
931#include "stringlib/partition.h"
932#include "stringlib/split.h"
933#include "stringlib/count.h"
934#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300935#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200936#include "stringlib/find_max_char.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200937#include "stringlib/undef.h"
938
939#include "stringlib/ucs4lib.h"
940#include "stringlib/fastsearch.h"
941#include "stringlib/partition.h"
942#include "stringlib/split.h"
943#include "stringlib/count.h"
944#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300945#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200946#include "stringlib/find_max_char.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200947#include "stringlib/undef.h"
948
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200949#include "stringlib/unicodedefs.h"
950#include "stringlib/fastsearch.h"
951#include "stringlib/count.h"
952#include "stringlib/find.h"
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100953#include "stringlib/undef.h"
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200954
Guido van Rossumd57fd912000-03-10 22:53:23 +0000955/* --- Unicode Object ----------------------------------------------------- */
956
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700957static inline Py_ssize_t
958findchar(const void *s, int kind,
959 Py_ssize_t size, Py_UCS4 ch,
960 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200961{
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200962 switch (kind) {
963 case PyUnicode_1BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200964 if ((Py_UCS1) ch != ch)
965 return -1;
966 if (direction > 0)
Andy Lestere6be9b52020-02-11 20:28:35 -0600967 return ucs1lib_find_char((const Py_UCS1 *) s, size, (Py_UCS1) ch);
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200968 else
Andy Lestere6be9b52020-02-11 20:28:35 -0600969 return ucs1lib_rfind_char((const Py_UCS1 *) s, size, (Py_UCS1) ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200970 case PyUnicode_2BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200971 if ((Py_UCS2) ch != ch)
972 return -1;
973 if (direction > 0)
Andy Lestere6be9b52020-02-11 20:28:35 -0600974 return ucs2lib_find_char((const Py_UCS2 *) s, size, (Py_UCS2) ch);
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200975 else
Andy Lestere6be9b52020-02-11 20:28:35 -0600976 return ucs2lib_rfind_char((const Py_UCS2 *) s, size, (Py_UCS2) ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200977 case PyUnicode_4BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200978 if (direction > 0)
Andy Lestere6be9b52020-02-11 20:28:35 -0600979 return ucs4lib_find_char((const Py_UCS4 *) s, size, ch);
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200980 else
Andy Lestere6be9b52020-02-11 20:28:35 -0600981 return ucs4lib_rfind_char((const Py_UCS4 *) s, size, ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200982 default:
Barry Warsawb2e57942017-09-14 18:13:16 -0700983 Py_UNREACHABLE();
Victor Stinner9e7a1bc2011-10-13 00:18:12 +0200984 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200985}
986
#ifdef Py_DEBUG
/* Debug helper: overwrite the characters in [old_length, current length)
   with 0xff bytes so that use of uninitialized data is detected earlier.
   Nothing is written when the string did not grow.

   _PyUnicode_CheckConsistency(str, 1) detects invalid characters, at least for
   ASCII and UCS-4 strings. U+00FF is invalid in ASCII and U+FFFFFFFF is an
   invalid character in Unicode 6.0. */
static void
unicode_fill_invalid(PyObject *unicode, Py_ssize_t old_length)
{
    /* the kind value is used as the number of bytes per character */
    int char_size = PyUnicode_KIND(unicode);
    Py_UCS1 *bytes = PyUnicode_1BYTE_DATA(unicode);
    Py_ssize_t new_length = _PyUnicode_LENGTH(unicode);

    if (new_length > old_length) {
        memset(bytes + old_length * char_size, 0xff,
               (new_length - old_length) * char_size);
    }
}
#endif
1005
Victor Stinnerfe226c02011-10-03 03:52:20 +02001006static PyObject*
1007resize_compact(PyObject *unicode, Py_ssize_t length)
1008{
1009 Py_ssize_t char_size;
1010 Py_ssize_t struct_size;
1011 Py_ssize_t new_size;
1012 int share_wstr;
Victor Stinner84def372011-12-11 20:04:56 +01001013 PyObject *new_unicode;
Victor Stinnerafffce42012-10-03 23:03:17 +02001014#ifdef Py_DEBUG
1015 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
1016#endif
1017
Victor Stinner79891572012-05-03 13:43:07 +02001018 assert(unicode_modifiable(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001019 assert(PyUnicode_IS_READY(unicode));
Victor Stinner488fa492011-12-12 00:01:39 +01001020 assert(PyUnicode_IS_COMPACT(unicode));
1021
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001022 char_size = PyUnicode_KIND(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +01001023 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerfe226c02011-10-03 03:52:20 +02001024 struct_size = sizeof(PyASCIIObject);
1025 else
1026 struct_size = sizeof(PyCompactUnicodeObject);
Victor Stinnerc379ead2011-10-03 12:52:27 +02001027 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001028
Victor Stinnerfe226c02011-10-03 03:52:20 +02001029 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
1030 PyErr_NoMemory();
1031 return NULL;
1032 }
1033 new_size = (struct_size + (length + 1) * char_size);
1034
Serhiy Storchaka7aa69082015-12-03 01:02:03 +02001035 if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) {
1036 PyObject_DEL(_PyUnicode_UTF8(unicode));
1037 _PyUnicode_UTF8(unicode) = NULL;
1038 _PyUnicode_UTF8_LENGTH(unicode) = 0;
1039 }
Victor Stinner49932fe2020-02-03 17:55:05 +01001040#ifdef Py_REF_DEBUG
1041 _Py_RefTotal--;
1042#endif
1043#ifdef Py_TRACE_REFS
Victor Stinner84def372011-12-11 20:04:56 +01001044 _Py_ForgetReference(unicode);
Victor Stinner49932fe2020-02-03 17:55:05 +01001045#endif
Victor Stinner84def372011-12-11 20:04:56 +01001046
Serhiy Storchaka20b39b22014-09-28 11:27:24 +03001047 new_unicode = (PyObject *)PyObject_REALLOC(unicode, new_size);
Victor Stinner84def372011-12-11 20:04:56 +01001048 if (new_unicode == NULL) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001049 _Py_NewReference(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001050 PyErr_NoMemory();
1051 return NULL;
1052 }
Victor Stinner84def372011-12-11 20:04:56 +01001053 unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001054 _Py_NewReference(unicode);
Victor Stinner84def372011-12-11 20:04:56 +01001055
Victor Stinnerfe226c02011-10-03 03:52:20 +02001056 _PyUnicode_LENGTH(unicode) = length;
Victor Stinnerc379ead2011-10-03 12:52:27 +02001057 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001058 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +01001059 if (!PyUnicode_IS_ASCII(unicode))
Victor Stinnerc379ead2011-10-03 12:52:27 +02001060 _PyUnicode_WSTR_LENGTH(unicode) = length;
1061 }
Victor Stinnerbbbac2e2013-02-07 23:12:46 +01001062 else if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) {
1063 PyObject_DEL(_PyUnicode_WSTR(unicode));
1064 _PyUnicode_WSTR(unicode) = NULL;
Victor Stinner5bc03a62016-01-27 16:56:53 +01001065 if (!PyUnicode_IS_ASCII(unicode))
1066 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Victor Stinnerbbbac2e2013-02-07 23:12:46 +01001067 }
Victor Stinnerafffce42012-10-03 23:03:17 +02001068#ifdef Py_DEBUG
1069 unicode_fill_invalid(unicode, old_length);
1070#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001071 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
1072 length, 0);
Victor Stinner79891572012-05-03 13:43:07 +02001073 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001074 return unicode;
1075}
1076
Alexander Belopolsky40018472011-02-26 01:02:56 +00001077static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001078resize_inplace(PyObject *unicode, Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001079{
Victor Stinner95663112011-10-04 01:03:50 +02001080 wchar_t *wstr;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001081 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001082 assert(!PyUnicode_IS_COMPACT(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001083 assert(Py_REFCNT(unicode) == 1);
Tim Petersced69f82003-09-16 20:30:58 +00001084
Victor Stinnerfe226c02011-10-03 03:52:20 +02001085 if (PyUnicode_IS_READY(unicode)) {
1086 Py_ssize_t char_size;
Victor Stinner1c8d0c72011-10-03 12:11:00 +02001087 int share_wstr, share_utf8;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001088 void *data;
Victor Stinnerafffce42012-10-03 23:03:17 +02001089#ifdef Py_DEBUG
1090 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
1091#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001092
1093 data = _PyUnicode_DATA_ANY(unicode);
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001094 char_size = PyUnicode_KIND(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +02001095 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
1096 share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001097
1098 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
1099 PyErr_NoMemory();
1100 return -1;
1101 }
1102 new_size = (length + 1) * char_size;
1103
Victor Stinner7a9105a2011-12-12 00:13:42 +01001104 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
1105 {
1106 PyObject_DEL(_PyUnicode_UTF8(unicode));
1107 _PyUnicode_UTF8(unicode) = NULL;
1108 _PyUnicode_UTF8_LENGTH(unicode) = 0;
1109 }
1110
Victor Stinnerfe226c02011-10-03 03:52:20 +02001111 data = (PyObject *)PyObject_REALLOC(data, new_size);
1112 if (data == NULL) {
1113 PyErr_NoMemory();
1114 return -1;
1115 }
1116 _PyUnicode_DATA_ANY(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +02001117 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001118 _PyUnicode_WSTR(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +02001119 _PyUnicode_WSTR_LENGTH(unicode) = length;
1120 }
1121 if (share_utf8) {
Victor Stinner1c8d0c72011-10-03 12:11:00 +02001122 _PyUnicode_UTF8(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +02001123 _PyUnicode_UTF8_LENGTH(unicode) = length;
1124 }
Victor Stinnerfe226c02011-10-03 03:52:20 +02001125 _PyUnicode_LENGTH(unicode) = length;
1126 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
Victor Stinnerafffce42012-10-03 23:03:17 +02001127#ifdef Py_DEBUG
1128 unicode_fill_invalid(unicode, old_length);
1129#endif
Victor Stinner95663112011-10-04 01:03:50 +02001130 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001131 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001132 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001133 }
Victor Stinnerfe226c02011-10-03 03:52:20 +02001134 }
Victor Stinner95663112011-10-04 01:03:50 +02001135 assert(_PyUnicode_WSTR(unicode) != NULL);
1136
1137 /* check for integer overflow */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -07001138 if (length > PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) - 1) {
Victor Stinner95663112011-10-04 01:03:50 +02001139 PyErr_NoMemory();
1140 return -1;
1141 }
Victor Stinner7a9105a2011-12-12 00:13:42 +01001142 new_size = sizeof(wchar_t) * (length + 1);
Victor Stinner95663112011-10-04 01:03:50 +02001143 wstr = _PyUnicode_WSTR(unicode);
Victor Stinner7a9105a2011-12-12 00:13:42 +01001144 wstr = PyObject_REALLOC(wstr, new_size);
Victor Stinner95663112011-10-04 01:03:50 +02001145 if (!wstr) {
1146 PyErr_NoMemory();
1147 return -1;
1148 }
1149 _PyUnicode_WSTR(unicode) = wstr;
1150 _PyUnicode_WSTR(unicode)[length] = 0;
1151 _PyUnicode_WSTR_LENGTH(unicode) = length;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001152 assert(_PyUnicode_CheckConsistency(unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001153 return 0;
1154}
1155
Victor Stinnerfe226c02011-10-03 03:52:20 +02001156static PyObject*
1157resize_copy(PyObject *unicode, Py_ssize_t length)
1158{
1159 Py_ssize_t copy_length;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001160 if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001161 PyObject *copy;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001162
Serhiy Storchaka2e58f1a2016-10-09 23:44:48 +03001163 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001164
1165 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
1166 if (copy == NULL)
1167 return NULL;
1168
1169 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
Victor Stinnerd3f08822012-05-29 12:57:52 +02001170 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, copy_length);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001171 return copy;
Victor Stinner8cfcbed2011-10-03 23:19:21 +02001172 }
1173 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001174 PyObject *w;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001175
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001176 w = (PyObject*)_PyUnicode_New(length);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001177 if (w == NULL)
1178 return NULL;
1179 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
1180 copy_length = Py_MIN(copy_length, length);
Christian Heimesf051e432016-09-13 20:22:02 +02001181 memcpy(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
Victor Stinnerc6cf1ba2012-10-23 02:54:47 +02001182 copy_length * sizeof(wchar_t));
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001183 return w;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001184 }
1185}
1186
Guido van Rossumd57fd912000-03-10 22:53:23 +00001187/* We allocate one more byte to make sure the string is
Martin v. Löwis47383402007-08-15 07:32:56 +00001188 Ux0000 terminated; some code (e.g. new_identifier)
1189 relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +00001190
1191 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +00001192 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +00001193
1194*/
1195
Alexander Belopolsky40018472011-02-26 01:02:56 +00001196static PyUnicodeObject *
1197_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001198{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001199 PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001200 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001201
Thomas Wouters477c8d52006-05-27 19:21:47 +00001202 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001203 if (length == 0 && unicode_empty != NULL) {
1204 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001205 return (PyUnicodeObject*)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001206 }
1207
Neal Norwitz3ce5d922008-08-24 07:08:55 +00001208 /* Ensure we won't overflow the size. */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -07001209 if (length > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00001210 return (PyUnicodeObject *)PyErr_NoMemory();
1211 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001212 if (length < 0) {
1213 PyErr_SetString(PyExc_SystemError,
1214 "Negative size passed to _PyUnicode_New");
1215 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001216 }
1217
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001218 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
1219 if (unicode == NULL)
1220 return NULL;
1221 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
Victor Stinner68b674c2013-10-29 19:31:43 +01001222
1223 _PyUnicode_WSTR_LENGTH(unicode) = length;
1224 _PyUnicode_HASH(unicode) = -1;
1225 _PyUnicode_STATE(unicode).interned = 0;
1226 _PyUnicode_STATE(unicode).kind = 0;
1227 _PyUnicode_STATE(unicode).compact = 0;
1228 _PyUnicode_STATE(unicode).ready = 0;
1229 _PyUnicode_STATE(unicode).ascii = 0;
1230 _PyUnicode_DATA_ANY(unicode) = NULL;
1231 _PyUnicode_LENGTH(unicode) = 0;
1232 _PyUnicode_UTF8(unicode) = NULL;
1233 _PyUnicode_UTF8_LENGTH(unicode) = 0;
1234
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001235 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
1236 if (!_PyUnicode_WSTR(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001237 Py_DECREF(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00001238 PyErr_NoMemory();
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001239 return NULL;
Guido van Rossum3c1bb802000-04-27 20:13:50 +00001240 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001241
Jeremy Hyltond8082792003-09-16 19:41:39 +00001242 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +00001243 * the caller fails before initializing str -- unicode_resize()
1244 * reads str[0], and the Keep-Alive optimization can keep memory
1245 * allocated for str alive across a call to unicode_dealloc(unicode).
1246 * We don't want unicode_resize to read uninitialized memory in
1247 * that case.
1248 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001249 _PyUnicode_WSTR(unicode)[0] = 0;
1250 _PyUnicode_WSTR(unicode)[length] = 0;
Victor Stinner68b674c2013-10-29 19:31:43 +01001251
Victor Stinner7931d9a2011-11-04 00:22:48 +01001252 assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001253 return unicode;
1254}
1255
Victor Stinnerf42dc442011-10-02 23:33:16 +02001256static const char*
1257unicode_kind_name(PyObject *unicode)
1258{
Victor Stinner42dfd712011-10-03 14:41:45 +02001259 /* don't check consistency: unicode_kind_name() is called from
1260 _PyUnicode_Dump() */
Victor Stinnerf42dc442011-10-02 23:33:16 +02001261 if (!PyUnicode_IS_COMPACT(unicode))
1262 {
1263 if (!PyUnicode_IS_READY(unicode))
1264 return "wstr";
Benjamin Petersonead6b532011-12-20 17:23:42 -06001265 switch (PyUnicode_KIND(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001266 {
1267 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001268 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001269 return "legacy ascii";
1270 else
1271 return "legacy latin1";
1272 case PyUnicode_2BYTE_KIND:
1273 return "legacy UCS2";
1274 case PyUnicode_4BYTE_KIND:
1275 return "legacy UCS4";
1276 default:
1277 return "<legacy invalid kind>";
1278 }
1279 }
1280 assert(PyUnicode_IS_READY(unicode));
Benjamin Petersonead6b532011-12-20 17:23:42 -06001281 switch (PyUnicode_KIND(unicode)) {
Victor Stinnerf42dc442011-10-02 23:33:16 +02001282 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001283 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001284 return "ascii";
1285 else
Victor Stinnera3b334d2011-10-03 13:53:37 +02001286 return "latin1";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001287 case PyUnicode_2BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001288 return "UCS2";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001289 case PyUnicode_4BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001290 return "UCS4";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001291 default:
1292 return "<invalid compact kind>";
1293 }
1294}
1295
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001296#ifdef Py_DEBUG
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001297/* Functions wrapping macros for use in debugger */
Victor Stinnera42de742018-11-22 10:25:22 +01001298char *_PyUnicode_utf8(void *unicode_raw){
1299 PyObject *unicode = _PyObject_CAST(unicode_raw);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001300 return PyUnicode_UTF8(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001301}
1302
Victor Stinnera42de742018-11-22 10:25:22 +01001303void *_PyUnicode_compact_data(void *unicode_raw) {
1304 PyObject *unicode = _PyObject_CAST(unicode_raw);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001305 return _PyUnicode_COMPACT_DATA(unicode);
1306}
Victor Stinnera42de742018-11-22 10:25:22 +01001307void *_PyUnicode_data(void *unicode_raw) {
1308 PyObject *unicode = _PyObject_CAST(unicode_raw);
Zackery Spytz1a2252e2019-05-06 10:56:51 -06001309 printf("obj %p\n", (void*)unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001310 printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
1311 printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
1312 printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
1313 printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
1314 printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
1315 return PyUnicode_DATA(unicode);
1316}
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001317
1318void
1319_PyUnicode_Dump(PyObject *op)
1320{
1321 PyASCIIObject *ascii = (PyASCIIObject *)op;
Victor Stinnera849a4b2011-10-03 12:12:11 +02001322 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
1323 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
1324 void *data;
Victor Stinner0d60e872011-10-23 19:47:19 +02001325
Victor Stinnera849a4b2011-10-03 12:12:11 +02001326 if (ascii->state.compact)
Victor Stinner0d60e872011-10-23 19:47:19 +02001327 {
1328 if (ascii->state.ascii)
1329 data = (ascii + 1);
1330 else
1331 data = (compact + 1);
1332 }
Victor Stinnera849a4b2011-10-03 12:12:11 +02001333 else
1334 data = unicode->data.any;
Victor Stinner293f3f52014-07-01 08:57:10 +02001335 printf("%s: len=%" PY_FORMAT_SIZE_T "u, ",
1336 unicode_kind_name(op), ascii->length);
Victor Stinner0d60e872011-10-23 19:47:19 +02001337
Victor Stinnera849a4b2011-10-03 12:12:11 +02001338 if (ascii->wstr == data)
1339 printf("shared ");
Zackery Spytz1a2252e2019-05-06 10:56:51 -06001340 printf("wstr=%p", (void *)ascii->wstr);
Victor Stinner0d60e872011-10-23 19:47:19 +02001341
Victor Stinnera3b334d2011-10-03 13:53:37 +02001342 if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
Victor Stinner293f3f52014-07-01 08:57:10 +02001343 printf(" (%" PY_FORMAT_SIZE_T "u), ", compact->wstr_length);
Victor Stinnera849a4b2011-10-03 12:12:11 +02001344 if (!ascii->state.compact && compact->utf8 == unicode->data.any)
1345 printf("shared ");
Victor Stinner293f3f52014-07-01 08:57:10 +02001346 printf("utf8=%p (%" PY_FORMAT_SIZE_T "u)",
Zackery Spytz1a2252e2019-05-06 10:56:51 -06001347 (void *)compact->utf8, compact->utf8_length);
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001348 }
Victor Stinnera849a4b2011-10-03 12:12:11 +02001349 printf(", data=%p\n", data);
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001350}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001351#endif
1352
1353PyObject *
1354PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
1355{
1356 PyObject *obj;
1357 PyCompactUnicodeObject *unicode;
1358 void *data;
Victor Stinner8f825062012-04-27 13:55:39 +02001359 enum PyUnicode_Kind kind;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001360 int is_sharing, is_ascii;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001361 Py_ssize_t char_size;
1362 Py_ssize_t struct_size;
1363
1364 /* Optimization for empty strings */
1365 if (size == 0 && unicode_empty != NULL) {
1366 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001367 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001368 }
1369
Victor Stinner9e9d6892011-10-04 01:02:02 +02001370 is_ascii = 0;
1371 is_sharing = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001372 struct_size = sizeof(PyCompactUnicodeObject);
1373 if (maxchar < 128) {
Victor Stinner8f825062012-04-27 13:55:39 +02001374 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001375 char_size = 1;
1376 is_ascii = 1;
1377 struct_size = sizeof(PyASCIIObject);
1378 }
1379 else if (maxchar < 256) {
Victor Stinner8f825062012-04-27 13:55:39 +02001380 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001381 char_size = 1;
1382 }
1383 else if (maxchar < 65536) {
Victor Stinner8f825062012-04-27 13:55:39 +02001384 kind = PyUnicode_2BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001385 char_size = 2;
1386 if (sizeof(wchar_t) == 2)
1387 is_sharing = 1;
1388 }
1389 else {
Victor Stinnerc9590ad2012-03-04 01:34:37 +01001390 if (maxchar > MAX_UNICODE) {
1391 PyErr_SetString(PyExc_SystemError,
1392 "invalid maximum character passed to PyUnicode_New");
1393 return NULL;
1394 }
Victor Stinner8f825062012-04-27 13:55:39 +02001395 kind = PyUnicode_4BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001396 char_size = 4;
1397 if (sizeof(wchar_t) == 4)
1398 is_sharing = 1;
1399 }
1400
1401 /* Ensure we won't overflow the size. */
1402 if (size < 0) {
1403 PyErr_SetString(PyExc_SystemError,
1404 "Negative size passed to PyUnicode_New");
1405 return NULL;
1406 }
1407 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1408 return PyErr_NoMemory();
1409
1410 /* Duplicated allocation code from _PyObject_New() instead of a call to
1411 * PyObject_New() so we are able to allocate space for the object and
1412 * it's data buffer.
1413 */
1414 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
1415 if (obj == NULL)
1416 return PyErr_NoMemory();
1417 obj = PyObject_INIT(obj, &PyUnicode_Type);
1418 if (obj == NULL)
1419 return NULL;
1420
1421 unicode = (PyCompactUnicodeObject *)obj;
1422 if (is_ascii)
1423 data = ((PyASCIIObject*)obj) + 1;
1424 else
1425 data = unicode + 1;
1426 _PyUnicode_LENGTH(unicode) = size;
1427 _PyUnicode_HASH(unicode) = -1;
1428 _PyUnicode_STATE(unicode).interned = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001429 _PyUnicode_STATE(unicode).kind = kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001430 _PyUnicode_STATE(unicode).compact = 1;
1431 _PyUnicode_STATE(unicode).ready = 1;
1432 _PyUnicode_STATE(unicode).ascii = is_ascii;
1433 if (is_ascii) {
1434 ((char*)data)[size] = 0;
1435 _PyUnicode_WSTR(unicode) = NULL;
1436 }
Victor Stinner8f825062012-04-27 13:55:39 +02001437 else if (kind == PyUnicode_1BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001438 ((char*)data)[size] = 0;
1439 _PyUnicode_WSTR(unicode) = NULL;
1440 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001441 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001442 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001443 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001444 else {
1445 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001446 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001447 if (kind == PyUnicode_2BYTE_KIND)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001448 ((Py_UCS2*)data)[size] = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001449 else /* kind == PyUnicode_4BYTE_KIND */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001450 ((Py_UCS4*)data)[size] = 0;
1451 if (is_sharing) {
1452 _PyUnicode_WSTR_LENGTH(unicode) = size;
1453 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
1454 }
1455 else {
1456 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1457 _PyUnicode_WSTR(unicode) = NULL;
1458 }
1459 }
Victor Stinner8f825062012-04-27 13:55:39 +02001460#ifdef Py_DEBUG
Victor Stinnerafffce42012-10-03 23:03:17 +02001461 unicode_fill_invalid((PyObject*)unicode, 0);
Victor Stinner8f825062012-04-27 13:55:39 +02001462#endif
Victor Stinner7931d9a2011-11-04 00:22:48 +01001463 assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001464 return obj;
1465}
1466
#if SIZEOF_WCHAR_T == 2
/* Helper converting a 16-bit wchar_t buffer [begin, end) into the UCS4
   buffer of unicode.  A high surrogate immediately followed by a low
   surrogate is joined into a single code point; every other unit (lone
   surrogates included) is copied unchanged.  The other conversions are
   implemented as macros for efficiency.

   This function assumes that unicode can hold one more code point than wstr
   characters for a terminating null character. */
static void
unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
                              PyObject *unicode)
{
    const wchar_t *src = begin;
    Py_UCS4 *dst;

    assert(unicode != NULL);
    assert(_PyUnicode_CHECK(unicode));
    assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
    dst = PyUnicode_4BYTE_DATA(unicode);

    while (src < end) {
        assert(dst < (PyUnicode_4BYTE_DATA(unicode) +
                      _PyUnicode_GET_LENGTH(unicode)));
        /* src[1] is only read once it is known to lie inside the buffer */
        if ((src + 1) < end
            && Py_UNICODE_IS_HIGH_SURROGATE(src[0])
            && Py_UNICODE_IS_LOW_SURROGATE(src[1]))
        {
            *dst++ = Py_UNICODE_JOIN_SURROGATES(src[0], src[1]);
            src += 2;
        }
        else {
            *dst++ = *src++;
        }
    }
    /* the output must have been filled exactly */
    assert(dst == (PyUnicode_4BYTE_DATA(unicode) +
                   _PyUnicode_GET_LENGTH(unicode)));
}
#endif
1506
Victor Stinnercd9950f2011-10-02 00:34:53 +02001507static int
Victor Stinner488fa492011-12-12 00:01:39 +01001508unicode_check_modifiable(PyObject *unicode)
Victor Stinnercd9950f2011-10-02 00:34:53 +02001509{
Victor Stinner488fa492011-12-12 00:01:39 +01001510 if (!unicode_modifiable(unicode)) {
Victor Stinner01698042011-10-04 00:04:26 +02001511 PyErr_SetString(PyExc_SystemError,
Victor Stinner488fa492011-12-12 00:01:39 +01001512 "Cannot modify a string currently used");
Victor Stinnercd9950f2011-10-02 00:34:53 +02001513 return -1;
1514 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02001515 return 0;
1516}
1517
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001518static int
1519_copy_characters(PyObject *to, Py_ssize_t to_start,
1520 PyObject *from, Py_ssize_t from_start,
1521 Py_ssize_t how_many, int check_maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001522{
Victor Stinnera0702ab2011-09-29 14:14:38 +02001523 unsigned int from_kind, to_kind;
1524 void *from_data, *to_data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001525
Victor Stinneree4544c2012-05-09 22:24:08 +02001526 assert(0 <= how_many);
1527 assert(0 <= from_start);
1528 assert(0 <= to_start);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001529 assert(PyUnicode_Check(from));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001530 assert(PyUnicode_IS_READY(from));
Victor Stinneree4544c2012-05-09 22:24:08 +02001531 assert(from_start + how_many <= PyUnicode_GET_LENGTH(from));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001532
Victor Stinnerd3f08822012-05-29 12:57:52 +02001533 assert(PyUnicode_Check(to));
1534 assert(PyUnicode_IS_READY(to));
1535 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1536
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001537 if (how_many == 0)
1538 return 0;
1539
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001540 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001541 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001542 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001543 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001544
Victor Stinnerf1852262012-06-16 16:38:26 +02001545#ifdef Py_DEBUG
1546 if (!check_maxchar
1547 && PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to))
1548 {
1549 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1550 Py_UCS4 ch;
1551 Py_ssize_t i;
1552 for (i=0; i < how_many; i++) {
1553 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1554 assert(ch <= to_maxchar);
1555 }
1556 }
1557#endif
1558
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001559 if (from_kind == to_kind) {
Victor Stinnerf1852262012-06-16 16:38:26 +02001560 if (check_maxchar
1561 && !PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))
1562 {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001563 /* Writing Latin-1 characters into an ASCII string requires to
1564 check that all written characters are pure ASCII */
Victor Stinnerf1852262012-06-16 16:38:26 +02001565 Py_UCS4 max_char;
1566 max_char = ucs1lib_find_max_char(from_data,
1567 (Py_UCS1*)from_data + how_many);
1568 if (max_char >= 128)
1569 return -1;
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001570 }
Christian Heimesf051e432016-09-13 20:22:02 +02001571 memcpy((char*)to_data + to_kind * to_start,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001572 (char*)from_data + from_kind * from_start,
1573 to_kind * how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001574 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001575 else if (from_kind == PyUnicode_1BYTE_KIND
1576 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001577 {
1578 _PyUnicode_CONVERT_BYTES(
1579 Py_UCS1, Py_UCS2,
1580 PyUnicode_1BYTE_DATA(from) + from_start,
1581 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1582 PyUnicode_2BYTE_DATA(to) + to_start
1583 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001584 }
Victor Stinner157f83f2011-09-28 21:41:31 +02001585 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001586 && to_kind == PyUnicode_4BYTE_KIND)
1587 {
1588 _PyUnicode_CONVERT_BYTES(
1589 Py_UCS1, Py_UCS4,
1590 PyUnicode_1BYTE_DATA(from) + from_start,
1591 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1592 PyUnicode_4BYTE_DATA(to) + to_start
1593 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001594 }
1595 else if (from_kind == PyUnicode_2BYTE_KIND
1596 && to_kind == PyUnicode_4BYTE_KIND)
1597 {
1598 _PyUnicode_CONVERT_BYTES(
1599 Py_UCS2, Py_UCS4,
1600 PyUnicode_2BYTE_DATA(from) + from_start,
1601 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1602 PyUnicode_4BYTE_DATA(to) + to_start
1603 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001604 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001605 else {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001606 assert (PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to));
1607
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001608 if (!check_maxchar) {
1609 if (from_kind == PyUnicode_2BYTE_KIND
1610 && to_kind == PyUnicode_1BYTE_KIND)
1611 {
1612 _PyUnicode_CONVERT_BYTES(
1613 Py_UCS2, Py_UCS1,
1614 PyUnicode_2BYTE_DATA(from) + from_start,
1615 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1616 PyUnicode_1BYTE_DATA(to) + to_start
1617 );
1618 }
1619 else if (from_kind == PyUnicode_4BYTE_KIND
1620 && to_kind == PyUnicode_1BYTE_KIND)
1621 {
1622 _PyUnicode_CONVERT_BYTES(
1623 Py_UCS4, Py_UCS1,
1624 PyUnicode_4BYTE_DATA(from) + from_start,
1625 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1626 PyUnicode_1BYTE_DATA(to) + to_start
1627 );
1628 }
1629 else if (from_kind == PyUnicode_4BYTE_KIND
1630 && to_kind == PyUnicode_2BYTE_KIND)
1631 {
1632 _PyUnicode_CONVERT_BYTES(
1633 Py_UCS4, Py_UCS2,
1634 PyUnicode_4BYTE_DATA(from) + from_start,
1635 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1636 PyUnicode_2BYTE_DATA(to) + to_start
1637 );
1638 }
1639 else {
Barry Warsawb2e57942017-09-14 18:13:16 -07001640 Py_UNREACHABLE();
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001641 }
1642 }
Victor Stinnerf1852262012-06-16 16:38:26 +02001643 else {
Victor Stinnera0702ab2011-09-29 14:14:38 +02001644 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001645 Py_UCS4 ch;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001646 Py_ssize_t i;
1647
Victor Stinnera0702ab2011-09-29 14:14:38 +02001648 for (i=0; i < how_many; i++) {
1649 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001650 if (ch > to_maxchar)
1651 return -1;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001652 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1653 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001654 }
1655 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001656 return 0;
1657}
1658
Victor Stinnerd3f08822012-05-29 12:57:52 +02001659void
1660_PyUnicode_FastCopyCharacters(
1661 PyObject *to, Py_ssize_t to_start,
1662 PyObject *from, Py_ssize_t from_start, Py_ssize_t how_many)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001663{
1664 (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1665}
1666
1667Py_ssize_t
1668PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1669 PyObject *from, Py_ssize_t from_start,
1670 Py_ssize_t how_many)
1671{
1672 int err;
1673
1674 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1675 PyErr_BadInternalCall();
1676 return -1;
1677 }
1678
Benjamin Petersonbac79492012-01-14 13:34:47 -05001679 if (PyUnicode_READY(from) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001680 return -1;
Benjamin Petersonbac79492012-01-14 13:34:47 -05001681 if (PyUnicode_READY(to) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001682 return -1;
1683
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001684 if ((size_t)from_start > (size_t)PyUnicode_GET_LENGTH(from)) {
Victor Stinnerd3f08822012-05-29 12:57:52 +02001685 PyErr_SetString(PyExc_IndexError, "string index out of range");
1686 return -1;
1687 }
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001688 if ((size_t)to_start > (size_t)PyUnicode_GET_LENGTH(to)) {
Victor Stinnerd3f08822012-05-29 12:57:52 +02001689 PyErr_SetString(PyExc_IndexError, "string index out of range");
1690 return -1;
1691 }
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001692 if (how_many < 0) {
1693 PyErr_SetString(PyExc_SystemError, "how_many cannot be negative");
1694 return -1;
1695 }
1696 how_many = Py_MIN(PyUnicode_GET_LENGTH(from)-from_start, how_many);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001697 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1698 PyErr_Format(PyExc_SystemError,
Victor Stinnera33bce02014-07-04 22:47:46 +02001699 "Cannot write %zi characters at %zi "
1700 "in a string of %zi characters",
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001701 how_many, to_start, PyUnicode_GET_LENGTH(to));
1702 return -1;
1703 }
1704
1705 if (how_many == 0)
1706 return 0;
1707
Victor Stinner488fa492011-12-12 00:01:39 +01001708 if (unicode_check_modifiable(to))
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001709 return -1;
1710
1711 err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1712 if (err) {
1713 PyErr_Format(PyExc_SystemError,
1714 "Cannot copy %s characters "
1715 "into a string of %s characters",
1716 unicode_kind_name(from),
1717 unicode_kind_name(to));
1718 return -1;
1719 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001720 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001721}
1722
Victor Stinner17222162011-09-28 22:15:37 +02001723/* Find the maximum code point and count the number of surrogate pairs so a
1724 correct string length can be computed before converting a string to UCS4.
1725 This function counts single surrogates as a character and not as a pair.
1726
1727 Return 0 on success, or -1 on error. */
1728static int
1729find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1730 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001731{
1732 const wchar_t *iter;
Victor Stinner8faf8212011-12-08 22:14:11 +01001733 Py_UCS4 ch;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001734
Victor Stinnerc53be962011-10-02 21:33:54 +02001735 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001736 *num_surrogates = 0;
1737 *maxchar = 0;
1738
1739 for (iter = begin; iter < end; ) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001740#if SIZEOF_WCHAR_T == 2
Victor Stinnercf77da92013-03-06 01:09:24 +01001741 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1742 && (iter+1) < end
1743 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
1744 {
1745 ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
1746 ++(*num_surrogates);
1747 iter += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001748 }
1749 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001750#endif
Victor Stinner8faf8212011-12-08 22:14:11 +01001751 {
1752 ch = *iter;
1753 iter++;
1754 }
1755 if (ch > *maxchar) {
1756 *maxchar = ch;
1757 if (*maxchar > MAX_UNICODE) {
1758 PyErr_Format(PyExc_ValueError,
1759 "character U+%x is not in range [U+0000; U+10ffff]",
1760 ch);
1761 return -1;
1762 }
1763 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001764 }
1765 return 0;
1766}
1767
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001768int
1769_PyUnicode_Ready(PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001770{
1771 wchar_t *end;
1772 Py_UCS4 maxchar = 0;
1773 Py_ssize_t num_surrogates;
1774#if SIZEOF_WCHAR_T == 2
1775 Py_ssize_t length_wo_surrogates;
1776#endif
1777
Georg Brandl7597add2011-10-05 16:36:47 +02001778 /* _PyUnicode_Ready() is only intended for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +02001779 strings were created using _PyObject_New() and where no canonical
1780 representation (the str field) has been set yet aka strings
1781 which are not yet ready. */
Victor Stinner910337b2011-10-03 03:20:16 +02001782 assert(_PyUnicode_CHECK(unicode));
1783 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001784 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerc3c74152011-10-02 20:39:55 +02001785 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001786 assert(_PyUnicode_UTF8(unicode) == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +02001787 /* Actually, it should neither be interned nor be anything else: */
1788 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001789
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001790 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +02001791 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +02001792 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001793 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001794
1795 if (maxchar < 256) {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001796 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1797 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001798 PyErr_NoMemory();
1799 return -1;
1800 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001801 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001802 _PyUnicode_WSTR(unicode), end,
1803 PyUnicode_1BYTE_DATA(unicode));
1804 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1805 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1806 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1807 if (maxchar < 128) {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001808 _PyUnicode_STATE(unicode).ascii = 1;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001809 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001810 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001811 }
1812 else {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001813 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001814 _PyUnicode_UTF8(unicode) = NULL;
1815 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001816 }
1817 PyObject_FREE(_PyUnicode_WSTR(unicode));
1818 _PyUnicode_WSTR(unicode) = NULL;
1819 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1820 }
1821 /* In this case we might have to convert down from 4-byte native
1822 wchar_t to 2-byte unicode. */
1823 else if (maxchar < 65536) {
1824 assert(num_surrogates == 0 &&
1825 "FindMaxCharAndNumSurrogatePairs() messed up");
1826
Victor Stinner506f5922011-09-28 22:34:18 +02001827#if SIZEOF_WCHAR_T == 2
1828 /* We can share representations and are done. */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001829 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Victor Stinner506f5922011-09-28 22:34:18 +02001830 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1831 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1832 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001833 _PyUnicode_UTF8(unicode) = NULL;
1834 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001835#else
1836 /* sizeof(wchar_t) == 4 */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001837 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
Victor Stinner506f5922011-09-28 22:34:18 +02001838 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
Victor Stinnerc3c74152011-10-02 20:39:55 +02001839 if (!_PyUnicode_DATA_ANY(unicode)) {
Victor Stinner506f5922011-09-28 22:34:18 +02001840 PyErr_NoMemory();
1841 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001842 }
Victor Stinner506f5922011-09-28 22:34:18 +02001843 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1844 _PyUnicode_WSTR(unicode), end,
1845 PyUnicode_2BYTE_DATA(unicode));
1846 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1847 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1848 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001849 _PyUnicode_UTF8(unicode) = NULL;
1850 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001851 PyObject_FREE(_PyUnicode_WSTR(unicode));
1852 _PyUnicode_WSTR(unicode) = NULL;
1853 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1854#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001855 }
1856 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
1857 else {
1858#if SIZEOF_WCHAR_T == 2
1859 /* in case the native representation is 2-bytes, we need to allocate a
1860 new normalized 4-byte version. */
1861 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
Serhiy Storchakae55181f2015-02-20 21:34:06 +02001862 if (length_wo_surrogates > PY_SSIZE_T_MAX / 4 - 1) {
1863 PyErr_NoMemory();
1864 return -1;
1865 }
Victor Stinnerc3c74152011-10-02 20:39:55 +02001866 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1867 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001868 PyErr_NoMemory();
1869 return -1;
1870 }
1871 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1872 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001873 _PyUnicode_UTF8(unicode) = NULL;
1874 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner126c5592011-10-03 04:17:10 +02001875 /* unicode_convert_wchar_to_ucs4() requires a ready string */
1876 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerc53be962011-10-02 21:33:54 +02001877 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001878 PyObject_FREE(_PyUnicode_WSTR(unicode));
1879 _PyUnicode_WSTR(unicode) = NULL;
1880 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1881#else
1882 assert(num_surrogates == 0);
1883
Victor Stinnerc3c74152011-10-02 20:39:55 +02001884 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001885 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001886 _PyUnicode_UTF8(unicode) = NULL;
1887 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001888 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1889#endif
1890 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1891 }
1892 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001893 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001894 return 0;
1895}
1896
Alexander Belopolsky40018472011-02-26 01:02:56 +00001897static void
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001898unicode_dealloc(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001899{
Walter Dörwald16807132007-05-25 13:52:07 +00001900 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001901 case SSTATE_NOT_INTERNED:
1902 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001903
Benjamin Peterson29060642009-01-31 22:14:21 +00001904 case SSTATE_INTERNED_MORTAL:
1905 /* revive dead object temporarily for DelItem */
Victor Stinnerc86a1122020-02-07 01:24:29 +01001906 Py_SET_REFCNT(unicode, 3);
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001907 if (PyDict_DelItem(interned, unicode) != 0) {
1908 _PyErr_WriteUnraisableMsg("deletion of interned string failed",
1909 NULL);
1910 }
Benjamin Peterson29060642009-01-31 22:14:21 +00001911 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001912
Benjamin Peterson29060642009-01-31 22:14:21 +00001913 case SSTATE_INTERNED_IMMORTAL:
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001914 _PyObject_ASSERT_FAILED_MSG(unicode, "Immortal interned string died");
1915 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001916
Benjamin Peterson29060642009-01-31 22:14:21 +00001917 default:
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001918 Py_UNREACHABLE();
Walter Dörwald16807132007-05-25 13:52:07 +00001919 }
1920
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001921 if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001922 PyObject_DEL(_PyUnicode_WSTR(unicode));
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001923 }
1924 if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) {
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001925 PyObject_DEL(_PyUnicode_UTF8(unicode));
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001926 }
1927 if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001928 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001929 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001930
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001931 Py_TYPE(unicode)->tp_free(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001932}
1933
#ifdef Py_DEBUG
/* Return 1 if unicode is the empty string singleton or one of the cached
   single-character strings of the unicode_latin1 table, 0 otherwise. */
static int
unicode_is_singleton(PyObject *unicode)
{
    PyASCIIObject *ascii = (PyASCIIObject *)unicode;
    Py_UCS4 ch;

    if (unicode == unicode_empty) {
        return 1;
    }
    /* only a ready string of exactly one character can be in the table */
    if (ascii->state.kind == PyUnicode_WCHAR_KIND || ascii->length != 1) {
        return 0;
    }
    ch = PyUnicode_READ_CHAR(unicode, 0);
    return (ch < 256 && unicode_latin1[ch] == unicode) ? 1 : 0;
}
#endif
1950
Alexander Belopolsky40018472011-02-26 01:02:56 +00001951static int
Victor Stinner488fa492011-12-12 00:01:39 +01001952unicode_modifiable(PyObject *unicode)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001953{
Victor Stinner488fa492011-12-12 00:01:39 +01001954 assert(_PyUnicode_CHECK(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001955 if (Py_REFCNT(unicode) != 1)
1956 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001957 if (_PyUnicode_HASH(unicode) != -1)
1958 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001959 if (PyUnicode_CHECK_INTERNED(unicode))
1960 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001961 if (!PyUnicode_CheckExact(unicode))
1962 return 0;
Victor Stinner77bb47b2011-10-03 20:06:05 +02001963#ifdef Py_DEBUG
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001964 /* singleton refcount is greater than 1 */
1965 assert(!unicode_is_singleton(unicode));
Victor Stinner77bb47b2011-10-03 20:06:05 +02001966#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001967 return 1;
1968}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001969
Victor Stinnerfe226c02011-10-03 03:52:20 +02001970static int
1971unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1972{
1973 PyObject *unicode;
1974 Py_ssize_t old_length;
1975
1976 assert(p_unicode != NULL);
1977 unicode = *p_unicode;
1978
1979 assert(unicode != NULL);
1980 assert(PyUnicode_Check(unicode));
1981 assert(0 <= length);
1982
Victor Stinner910337b2011-10-03 03:20:16 +02001983 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001984 old_length = PyUnicode_WSTR_LENGTH(unicode);
1985 else
1986 old_length = PyUnicode_GET_LENGTH(unicode);
1987 if (old_length == length)
1988 return 0;
1989
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001990 if (length == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +02001991 _Py_INCREF_UNICODE_EMPTY();
1992 if (!unicode_empty)
Benjamin Peterson29060642009-01-31 22:14:21 +00001993 return -1;
Serhiy Storchaka57a01d32016-04-10 18:05:40 +03001994 Py_SETREF(*p_unicode, unicode_empty);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001995 return 0;
1996 }
1997
Victor Stinner488fa492011-12-12 00:01:39 +01001998 if (!unicode_modifiable(unicode)) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001999 PyObject *copy = resize_copy(unicode, length);
2000 if (copy == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002001 return -1;
Serhiy Storchaka57a01d32016-04-10 18:05:40 +03002002 Py_SETREF(*p_unicode, copy);
Benjamin Peterson29060642009-01-31 22:14:21 +00002003 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002004 }
2005
Victor Stinnerfe226c02011-10-03 03:52:20 +02002006 if (PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01002007 PyObject *new_unicode = resize_compact(unicode, length);
2008 if (new_unicode == NULL)
Victor Stinnerfe226c02011-10-03 03:52:20 +02002009 return -1;
Victor Stinnerb0a82a62011-12-12 13:08:33 +01002010 *p_unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +02002011 return 0;
Benjamin Peterson4bfce8f2011-10-03 19:35:07 -04002012 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002013 return resize_inplace(unicode, length);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002014}
2015
Alexander Belopolsky40018472011-02-26 01:02:56 +00002016int
Victor Stinnerfe226c02011-10-03 03:52:20 +02002017PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00002018{
Victor Stinnerfe226c02011-10-03 03:52:20 +02002019 PyObject *unicode;
2020 if (p_unicode == NULL) {
2021 PyErr_BadInternalCall();
2022 return -1;
2023 }
2024 unicode = *p_unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01002025 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
Victor Stinnerfe226c02011-10-03 03:52:20 +02002026 {
2027 PyErr_BadInternalCall();
2028 return -1;
2029 }
2030 return unicode_resize(p_unicode, length);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00002031}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002032
Serhiy Storchakad65c9492015-11-02 14:10:23 +02002033/* Copy an ASCII or latin1 char* string into a Python Unicode string.
Victor Stinnerc5166102012-02-22 13:55:02 +01002034
Victor Stinnerb429d3b2012-02-22 21:22:20 +01002035 WARNING: The function doesn't copy the terminating null character and
2036 doesn't check the maximum character (may write a latin1 character in an
2037 ASCII string). */
Victor Stinner184252a2012-06-16 02:57:41 +02002038static void
2039unicode_write_cstr(PyObject *unicode, Py_ssize_t index,
2040 const char *str, Py_ssize_t len)
Victor Stinnerc5166102012-02-22 13:55:02 +01002041{
2042 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
2043 void *data = PyUnicode_DATA(unicode);
Victor Stinner184252a2012-06-16 02:57:41 +02002044 const char *end = str + len;
Victor Stinnerc5166102012-02-22 13:55:02 +01002045
2046 switch (kind) {
2047 case PyUnicode_1BYTE_KIND: {
Victor Stinnerc5166102012-02-22 13:55:02 +01002048 assert(index + len <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner8c6db452012-10-06 00:40:45 +02002049#ifdef Py_DEBUG
2050 if (PyUnicode_IS_ASCII(unicode)) {
2051 Py_UCS4 maxchar = ucs1lib_find_max_char(
2052 (const Py_UCS1*)str,
2053 (const Py_UCS1*)str + len);
2054 assert(maxchar < 128);
2055 }
2056#endif
Antoine Pitrouba6bafc2012-02-22 16:41:50 +01002057 memcpy((char *) data + index, str, len);
Victor Stinner184252a2012-06-16 02:57:41 +02002058 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01002059 }
2060 case PyUnicode_2BYTE_KIND: {
2061 Py_UCS2 *start = (Py_UCS2 *)data + index;
2062 Py_UCS2 *ucs2 = start;
2063 assert(index <= PyUnicode_GET_LENGTH(unicode));
2064
Victor Stinner184252a2012-06-16 02:57:41 +02002065 for (; str < end; ++ucs2, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01002066 *ucs2 = (Py_UCS2)*str;
2067
2068 assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner184252a2012-06-16 02:57:41 +02002069 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01002070 }
2071 default: {
2072 Py_UCS4 *start = (Py_UCS4 *)data + index;
2073 Py_UCS4 *ucs4 = start;
2074 assert(kind == PyUnicode_4BYTE_KIND);
2075 assert(index <= PyUnicode_GET_LENGTH(unicode));
2076
Victor Stinner184252a2012-06-16 02:57:41 +02002077 for (; str < end; ++ucs4, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01002078 *ucs4 = (Py_UCS4)*str;
2079
2080 assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinnerc5166102012-02-22 13:55:02 +01002081 }
2082 }
2083}
2084
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002085static PyObject*
2086get_latin1_char(unsigned char ch)
2087{
Victor Stinnera464fc12011-10-02 20:39:30 +02002088 PyObject *unicode = unicode_latin1[ch];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002089 if (!unicode) {
Victor Stinnera464fc12011-10-02 20:39:30 +02002090 unicode = PyUnicode_New(1, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002091 if (!unicode)
2092 return NULL;
2093 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002094 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002095 unicode_latin1[ch] = unicode;
2096 }
2097 Py_INCREF(unicode);
Victor Stinnera464fc12011-10-02 20:39:30 +02002098 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002099}
2100
Victor Stinner985a82a2014-01-03 12:53:47 +01002101static PyObject*
2102unicode_char(Py_UCS4 ch)
2103{
2104 PyObject *unicode;
2105
2106 assert(ch <= MAX_UNICODE);
2107
Victor Stinnerf3b46b42014-01-03 13:16:00 +01002108 if (ch < 256)
2109 return get_latin1_char(ch);
2110
Victor Stinner985a82a2014-01-03 12:53:47 +01002111 unicode = PyUnicode_New(1, ch);
2112 if (unicode == NULL)
2113 return NULL;
Serhiy Storchaka2e58f1a2016-10-09 23:44:48 +03002114
2115 assert(PyUnicode_KIND(unicode) != PyUnicode_1BYTE_KIND);
2116 if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
Victor Stinner985a82a2014-01-03 12:53:47 +01002117 PyUnicode_2BYTE_DATA(unicode)[0] = (Py_UCS2)ch;
Serhiy Storchaka2e58f1a2016-10-09 23:44:48 +03002118 } else {
Victor Stinner985a82a2014-01-03 12:53:47 +01002119 assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
2120 PyUnicode_4BYTE_DATA(unicode)[0] = ch;
2121 }
2122 assert(_PyUnicode_CheckConsistency(unicode, 1));
2123 return unicode;
2124}
2125
Alexander Belopolsky40018472011-02-26 01:02:56 +00002126PyObject *
2127PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002128{
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02002129 if (u == NULL)
2130 return (PyObject*)_PyUnicode_New(size);
2131
2132 if (size < 0) {
2133 PyErr_BadInternalCall();
2134 return NULL;
2135 }
2136
2137 return PyUnicode_FromWideChar(u, size);
2138}
2139
2140PyObject *
2141PyUnicode_FromWideChar(const wchar_t *u, Py_ssize_t size)
2142{
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002143 PyObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002144 Py_UCS4 maxchar = 0;
2145 Py_ssize_t num_surrogates;
2146
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02002147 if (u == NULL && size != 0) {
2148 PyErr_BadInternalCall();
2149 return NULL;
2150 }
2151
2152 if (size == -1) {
2153 size = wcslen(u);
2154 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002155
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002156 /* If the Unicode data is known at construction time, we can apply
2157 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002158
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002159 /* Optimization for empty strings */
Serhiy Storchaka678db842013-01-26 12:16:36 +02002160 if (size == 0)
2161 _Py_RETURN_UNICODE_EMPTY();
Tim Petersced69f82003-09-16 20:30:58 +00002162
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002163 /* Single character Unicode objects in the Latin-1 range are
2164 shared when using this constructor */
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002165 if (size == 1 && (Py_UCS4)*u < 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002166 return get_latin1_char((unsigned char)*u);
2167
2168 /* If not empty and not single character, copy the Unicode data
2169 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02002170 if (find_maxchar_surrogates(u, u + size,
2171 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002172 return NULL;
2173
Victor Stinner8faf8212011-12-08 22:14:11 +01002174 unicode = PyUnicode_New(size - num_surrogates, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002175 if (!unicode)
2176 return NULL;
2177
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002178 switch (PyUnicode_KIND(unicode)) {
2179 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02002180 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002181 u, u + size, PyUnicode_1BYTE_DATA(unicode));
2182 break;
2183 case PyUnicode_2BYTE_KIND:
2184#if Py_UNICODE_SIZE == 2
Christian Heimesf051e432016-09-13 20:22:02 +02002185 memcpy(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002186#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02002187 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002188 u, u + size, PyUnicode_2BYTE_DATA(unicode));
2189#endif
2190 break;
2191 case PyUnicode_4BYTE_KIND:
2192#if SIZEOF_WCHAR_T == 2
2193 /* This is the only case which has to process surrogates, thus
2194 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02002195 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002196#else
2197 assert(num_surrogates == 0);
Christian Heimesf051e432016-09-13 20:22:02 +02002198 memcpy(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002199#endif
2200 break;
2201 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07002202 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002203 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002204
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002205 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002206}
2207
Alexander Belopolsky40018472011-02-26 01:02:56 +00002208PyObject *
2209PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00002210{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002211 if (size < 0) {
2212 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00002213 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00002214 return NULL;
2215 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002216 if (u != NULL)
2217 return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
2218 else
2219 return (PyObject *)_PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00002220}
2221
Alexander Belopolsky40018472011-02-26 01:02:56 +00002222PyObject *
2223PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00002224{
2225 size_t size = strlen(u);
2226 if (size > PY_SSIZE_T_MAX) {
2227 PyErr_SetString(PyExc_OverflowError, "input too long");
2228 return NULL;
2229 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002230 return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002231}
2232
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002233PyObject *
2234_PyUnicode_FromId(_Py_Identifier *id)
2235{
2236 if (!id->object) {
Victor Stinnerd1cd99b2012-02-07 23:05:55 +01002237 id->object = PyUnicode_DecodeUTF8Stateful(id->string,
2238 strlen(id->string),
2239 NULL, NULL);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002240 if (!id->object)
2241 return NULL;
2242 PyUnicode_InternInPlace(&id->object);
2243 assert(!id->next);
2244 id->next = static_strings;
2245 static_strings = id;
2246 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002247 return id->object;
2248}
2249
2250void
2251_PyUnicode_ClearStaticStrings()
2252{
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002253 _Py_Identifier *tmp, *s = static_strings;
2254 while (s) {
Serhiy Storchaka505ff752014-02-09 13:33:53 +02002255 Py_CLEAR(s->object);
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002256 tmp = s->next;
2257 s->next = NULL;
2258 s = tmp;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002259 }
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002260 static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002261}
2262
Benjamin Peterson0df54292012-03-26 14:50:32 -04002263/* Internal function, doesn't check maximum character */
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002264
Victor Stinnerd3f08822012-05-29 12:57:52 +02002265PyObject*
2266_PyUnicode_FromASCII(const char *buffer, Py_ssize_t size)
Victor Stinner702c7342011-10-05 13:50:52 +02002267{
Victor Stinnerd3f08822012-05-29 12:57:52 +02002268 const unsigned char *s = (const unsigned char *)buffer;
Victor Stinner785938e2011-12-11 20:09:03 +01002269 PyObject *unicode;
Victor Stinnere6b2d442011-12-11 21:54:30 +01002270 if (size == 1) {
Victor Stinner0617b6e2011-10-05 23:26:01 +02002271#ifdef Py_DEBUG
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002272 assert((unsigned char)s[0] < 128);
Victor Stinner0617b6e2011-10-05 23:26:01 +02002273#endif
Antoine Pitrou7c46da72011-10-06 22:07:51 +02002274 return get_latin1_char(s[0]);
Victor Stinnere6b2d442011-12-11 21:54:30 +01002275 }
Victor Stinner785938e2011-12-11 20:09:03 +01002276 unicode = PyUnicode_New(size, 127);
2277 if (!unicode)
Victor Stinner702c7342011-10-05 13:50:52 +02002278 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01002279 memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
2280 assert(_PyUnicode_CheckConsistency(unicode, 1));
2281 return unicode;
Victor Stinner702c7342011-10-05 13:50:52 +02002282}
2283
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002284static Py_UCS4
2285kind_maxchar_limit(unsigned int kind)
2286{
Benjamin Petersonead6b532011-12-20 17:23:42 -06002287 switch (kind) {
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002288 case PyUnicode_1BYTE_KIND:
2289 return 0x80;
2290 case PyUnicode_2BYTE_KIND:
2291 return 0x100;
2292 case PyUnicode_4BYTE_KIND:
2293 return 0x10000;
2294 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07002295 Py_UNREACHABLE();
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002296 }
2297}
2298
Victor Stinner702c7342011-10-05 13:50:52 +02002299static PyObject*
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002300_PyUnicode_FromUCS1(const Py_UCS1* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00002301{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002302 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002303 unsigned char max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002304
Serhiy Storchaka678db842013-01-26 12:16:36 +02002305 if (size == 0)
2306 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002307 assert(size > 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02002308 if (size == 1)
2309 return get_latin1_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002310
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002311 max_char = ucs1lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002312 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002313 if (!res)
2314 return NULL;
2315 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002316 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002317 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00002318}
2319
Victor Stinnere57b1c02011-09-28 22:20:48 +02002320static PyObject*
2321_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002322{
2323 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002324 Py_UCS2 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002325
Serhiy Storchaka678db842013-01-26 12:16:36 +02002326 if (size == 0)
2327 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002328 assert(size > 0);
Victor Stinner985a82a2014-01-03 12:53:47 +01002329 if (size == 1)
2330 return unicode_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002331
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002332 max_char = ucs2lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002333 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002334 if (!res)
2335 return NULL;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002336 if (max_char >= 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002337 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002338 else {
2339 _PyUnicode_CONVERT_BYTES(
2340 Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
2341 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002342 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002343 return res;
2344}
2345
Victor Stinnere57b1c02011-09-28 22:20:48 +02002346static PyObject*
2347_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002348{
2349 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002350 Py_UCS4 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002351
Serhiy Storchaka678db842013-01-26 12:16:36 +02002352 if (size == 0)
2353 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002354 assert(size > 0);
Victor Stinner985a82a2014-01-03 12:53:47 +01002355 if (size == 1)
2356 return unicode_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002357
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002358 max_char = ucs4lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002359 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002360 if (!res)
2361 return NULL;
Antoine Pitrou950468e2011-10-11 22:45:48 +02002362 if (max_char < 256)
2363 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
2364 PyUnicode_1BYTE_DATA(res));
2365 else if (max_char < 0x10000)
2366 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
2367 PyUnicode_2BYTE_DATA(res));
2368 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002369 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002370 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002371 return res;
2372}
2373
2374PyObject*
2375PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
2376{
Victor Stinnercfed46e2011-11-22 01:29:14 +01002377 if (size < 0) {
2378 PyErr_SetString(PyExc_ValueError, "size must be positive");
2379 return NULL;
2380 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002381 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002382 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002383 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002384 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002385 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002386 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002387 return _PyUnicode_FromUCS4(buffer, size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002388 default:
Victor Stinnerb9275c12011-10-05 14:01:42 +02002389 PyErr_SetString(PyExc_SystemError, "invalid kind");
2390 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002391 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002392}
2393
Victor Stinnerece58de2012-04-23 23:36:38 +02002394Py_UCS4
2395_PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end)
2396{
2397 enum PyUnicode_Kind kind;
2398 void *startptr, *endptr;
2399
2400 assert(PyUnicode_IS_READY(unicode));
2401 assert(0 <= start);
2402 assert(end <= PyUnicode_GET_LENGTH(unicode));
2403 assert(start <= end);
2404
2405 if (start == 0 && end == PyUnicode_GET_LENGTH(unicode))
2406 return PyUnicode_MAX_CHAR_VALUE(unicode);
2407
2408 if (start == end)
2409 return 127;
2410
Victor Stinner94d558b2012-04-27 22:26:58 +02002411 if (PyUnicode_IS_ASCII(unicode))
2412 return 127;
2413
Victor Stinnerece58de2012-04-23 23:36:38 +02002414 kind = PyUnicode_KIND(unicode);
Benjamin Petersonf3b7d862012-04-23 18:07:01 -04002415 startptr = PyUnicode_DATA(unicode);
Benjamin Petersonb9f4c9d2012-04-23 21:45:40 -04002416 endptr = (char *)startptr + end * kind;
2417 startptr = (char *)startptr + start * kind;
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002418 switch(kind) {
2419 case PyUnicode_1BYTE_KIND:
2420 return ucs1lib_find_max_char(startptr, endptr);
2421 case PyUnicode_2BYTE_KIND:
2422 return ucs2lib_find_max_char(startptr, endptr);
2423 case PyUnicode_4BYTE_KIND:
2424 return ucs4lib_find_max_char(startptr, endptr);
Victor Stinnerece58de2012-04-23 23:36:38 +02002425 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07002426 Py_UNREACHABLE();
Victor Stinnerece58de2012-04-23 23:36:38 +02002427 }
2428}
2429
Victor Stinner25a4b292011-10-06 12:31:55 +02002430/* Ensure that a string uses the most efficient storage, if it is not the
2431 case: create a new string with of the right kind. Write NULL into *p_unicode
2432 on error. */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02002433static void
Victor Stinner25a4b292011-10-06 12:31:55 +02002434unicode_adjust_maxchar(PyObject **p_unicode)
2435{
2436 PyObject *unicode, *copy;
2437 Py_UCS4 max_char;
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002438 Py_ssize_t len;
Victor Stinner25a4b292011-10-06 12:31:55 +02002439 unsigned int kind;
2440
2441 assert(p_unicode != NULL);
2442 unicode = *p_unicode;
2443 assert(PyUnicode_IS_READY(unicode));
2444 if (PyUnicode_IS_ASCII(unicode))
2445 return;
2446
2447 len = PyUnicode_GET_LENGTH(unicode);
2448 kind = PyUnicode_KIND(unicode);
2449 if (kind == PyUnicode_1BYTE_KIND) {
2450 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002451 max_char = ucs1lib_find_max_char(u, u + len);
2452 if (max_char >= 128)
2453 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002454 }
2455 else if (kind == PyUnicode_2BYTE_KIND) {
2456 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002457 max_char = ucs2lib_find_max_char(u, u + len);
2458 if (max_char >= 256)
2459 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002460 }
2461 else {
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002462 const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
Victor Stinner25a4b292011-10-06 12:31:55 +02002463 assert(kind == PyUnicode_4BYTE_KIND);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002464 max_char = ucs4lib_find_max_char(u, u + len);
2465 if (max_char >= 0x10000)
2466 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002467 }
Victor Stinner25a4b292011-10-06 12:31:55 +02002468 copy = PyUnicode_New(len, max_char);
Victor Stinnerca439ee2012-06-16 03:17:34 +02002469 if (copy != NULL)
2470 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, len);
Victor Stinner25a4b292011-10-06 12:31:55 +02002471 Py_DECREF(unicode);
2472 *p_unicode = copy;
2473}
2474
Victor Stinner034f6cf2011-09-30 02:26:44 +02002475PyObject*
Victor Stinnerbf6e5602011-12-12 01:53:47 +01002476_PyUnicode_Copy(PyObject *unicode)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002477{
Victor Stinner87af4f22011-11-21 23:03:47 +01002478 Py_ssize_t length;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002479 PyObject *copy;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002480
Victor Stinner034f6cf2011-09-30 02:26:44 +02002481 if (!PyUnicode_Check(unicode)) {
2482 PyErr_BadInternalCall();
2483 return NULL;
2484 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05002485 if (PyUnicode_READY(unicode) == -1)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002486 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002487
Victor Stinner87af4f22011-11-21 23:03:47 +01002488 length = PyUnicode_GET_LENGTH(unicode);
2489 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002490 if (!copy)
2491 return NULL;
2492 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
2493
Christian Heimesf051e432016-09-13 20:22:02 +02002494 memcpy(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
Victor Stinner87af4f22011-11-21 23:03:47 +01002495 length * PyUnicode_KIND(unicode));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002496 assert(_PyUnicode_CheckConsistency(copy, 1));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002497 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02002498}
2499
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002500
Victor Stinnerbc603d12011-10-02 01:00:40 +02002501/* Widen Unicode objects to larger buffers. Don't write terminating null
2502 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002503
2504void*
2505_PyUnicode_AsKind(PyObject *s, unsigned int kind)
2506{
Victor Stinnerbc603d12011-10-02 01:00:40 +02002507 Py_ssize_t len;
2508 void *result;
2509 unsigned int skind;
2510
Benjamin Petersonbac79492012-01-14 13:34:47 -05002511 if (PyUnicode_READY(s) == -1)
Victor Stinnerbc603d12011-10-02 01:00:40 +02002512 return NULL;
2513
2514 len = PyUnicode_GET_LENGTH(s);
2515 skind = PyUnicode_KIND(s);
2516 if (skind >= kind) {
Victor Stinner01698042011-10-04 00:04:26 +02002517 PyErr_SetString(PyExc_SystemError, "invalid widening attempt");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002518 return NULL;
2519 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002520 switch (kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02002521 case PyUnicode_2BYTE_KIND:
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002522 result = PyMem_New(Py_UCS2, len);
Victor Stinnerbc603d12011-10-02 01:00:40 +02002523 if (!result)
2524 return PyErr_NoMemory();
2525 assert(skind == PyUnicode_1BYTE_KIND);
2526 _PyUnicode_CONVERT_BYTES(
2527 Py_UCS1, Py_UCS2,
2528 PyUnicode_1BYTE_DATA(s),
2529 PyUnicode_1BYTE_DATA(s) + len,
2530 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002531 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002532 case PyUnicode_4BYTE_KIND:
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002533 result = PyMem_New(Py_UCS4, len);
Victor Stinnerbc603d12011-10-02 01:00:40 +02002534 if (!result)
2535 return PyErr_NoMemory();
2536 if (skind == PyUnicode_2BYTE_KIND) {
2537 _PyUnicode_CONVERT_BYTES(
2538 Py_UCS2, Py_UCS4,
2539 PyUnicode_2BYTE_DATA(s),
2540 PyUnicode_2BYTE_DATA(s) + len,
2541 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002542 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02002543 else {
2544 assert(skind == PyUnicode_1BYTE_KIND);
2545 _PyUnicode_CONVERT_BYTES(
2546 Py_UCS1, Py_UCS4,
2547 PyUnicode_1BYTE_DATA(s),
2548 PyUnicode_1BYTE_DATA(s) + len,
2549 result);
2550 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002551 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002552 default:
2553 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002554 }
Victor Stinner01698042011-10-04 00:04:26 +02002555 PyErr_SetString(PyExc_SystemError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002556 return NULL;
2557}
2558
2559static Py_UCS4*
2560as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2561 int copy_null)
2562{
2563 int kind;
2564 void *data;
2565 Py_ssize_t len, targetlen;
2566 if (PyUnicode_READY(string) == -1)
2567 return NULL;
2568 kind = PyUnicode_KIND(string);
2569 data = PyUnicode_DATA(string);
2570 len = PyUnicode_GET_LENGTH(string);
2571 targetlen = len;
2572 if (copy_null)
2573 targetlen++;
2574 if (!target) {
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002575 target = PyMem_New(Py_UCS4, targetlen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002576 if (!target) {
2577 PyErr_NoMemory();
2578 return NULL;
2579 }
2580 }
2581 else {
2582 if (targetsize < targetlen) {
2583 PyErr_Format(PyExc_SystemError,
2584 "string is longer than the buffer");
2585 if (copy_null && 0 < targetsize)
2586 target[0] = 0;
2587 return NULL;
2588 }
2589 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002590 if (kind == PyUnicode_1BYTE_KIND) {
2591 Py_UCS1 *start = (Py_UCS1 *) data;
2592 _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002593 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002594 else if (kind == PyUnicode_2BYTE_KIND) {
2595 Py_UCS2 *start = (Py_UCS2 *) data;
2596 _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2597 }
2598 else {
2599 assert(kind == PyUnicode_4BYTE_KIND);
Christian Heimesf051e432016-09-13 20:22:02 +02002600 memcpy(target, data, len * sizeof(Py_UCS4));
Antoine Pitrou950468e2011-10-11 22:45:48 +02002601 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002602 if (copy_null)
2603 target[len] = 0;
2604 return target;
2605}
2606
2607Py_UCS4*
2608PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2609 int copy_null)
2610{
Antoine Pitroude20b0b2011-11-10 21:47:38 +01002611 if (target == NULL || targetsize < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002612 PyErr_BadInternalCall();
2613 return NULL;
2614 }
2615 return as_ucs4(string, target, targetsize, copy_null);
2616}
2617
2618Py_UCS4*
2619PyUnicode_AsUCS4Copy(PyObject *string)
2620{
2621 return as_ucs4(string, NULL, 0, 1);
2622}
2623
/* Maximum number of characters required for output of %lld or %p.
   At most ceil(log10(256) * SIZEOF_LONG_LONG) digits are needed, plus 1 for
   the sign.  53/22 is used as an upper bound for log10(256). */
#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG * 53 - 1) / 22)
Victor Stinner96865452011-03-01 23:44:09 +00002628
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002629static int
2630unicode_fromformat_write_str(_PyUnicodeWriter *writer, PyObject *str,
2631 Py_ssize_t width, Py_ssize_t precision)
2632{
2633 Py_ssize_t length, fill, arglen;
2634 Py_UCS4 maxchar;
2635
2636 if (PyUnicode_READY(str) == -1)
2637 return -1;
2638
2639 length = PyUnicode_GET_LENGTH(str);
2640 if ((precision == -1 || precision >= length)
2641 && width <= length)
2642 return _PyUnicodeWriter_WriteStr(writer, str);
2643
2644 if (precision != -1)
2645 length = Py_MIN(precision, length);
2646
2647 arglen = Py_MAX(length, width);
2648 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
2649 maxchar = _PyUnicode_FindMaxChar(str, 0, length);
2650 else
2651 maxchar = writer->maxchar;
2652
2653 if (_PyUnicodeWriter_Prepare(writer, arglen, maxchar) == -1)
2654 return -1;
2655
2656 if (width > length) {
2657 fill = width - length;
2658 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, ' ') == -1)
2659 return -1;
2660 writer->pos += fill;
2661 }
2662
2663 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
2664 str, 0, length);
2665 writer->pos += length;
2666 return 0;
2667}
2668
2669static int
Victor Stinner998b8062018-09-12 00:23:25 +02002670unicode_fromformat_write_cstr(_PyUnicodeWriter *writer, const char *str,
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002671 Py_ssize_t width, Py_ssize_t precision)
2672{
2673 /* UTF-8 */
2674 Py_ssize_t length;
2675 PyObject *unicode;
2676 int res;
2677
Serhiy Storchakad586ccb2019-01-12 10:30:35 +02002678 if (precision == -1) {
2679 length = strlen(str);
2680 }
2681 else {
2682 length = 0;
2683 while (length < precision && str[length]) {
2684 length++;
2685 }
2686 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002687 unicode = PyUnicode_DecodeUTF8Stateful(str, length, "replace", NULL);
2688 if (unicode == NULL)
2689 return -1;
2690
2691 res = unicode_fromformat_write_str(writer, unicode, width, -1);
2692 Py_DECREF(unicode);
2693 return res;
2694}
2695
Victor Stinner96865452011-03-01 23:44:09 +00002696static const char*
Victor Stinnere215d962012-10-06 23:03:36 +02002697unicode_fromformat_arg(_PyUnicodeWriter *writer,
2698 const char *f, va_list *vargs)
Victor Stinner96865452011-03-01 23:44:09 +00002699{
Victor Stinnere215d962012-10-06 23:03:36 +02002700 const char *p;
2701 Py_ssize_t len;
2702 int zeropad;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002703 Py_ssize_t width;
2704 Py_ssize_t precision;
Victor Stinnere215d962012-10-06 23:03:36 +02002705 int longflag;
2706 int longlongflag;
2707 int size_tflag;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002708 Py_ssize_t fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002709
2710 p = f;
2711 f++;
Victor Stinner4c63a972012-10-06 23:55:33 +02002712 zeropad = 0;
2713 if (*f == '0') {
2714 zeropad = 1;
2715 f++;
2716 }
Victor Stinner96865452011-03-01 23:44:09 +00002717
2718 /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002719 width = -1;
2720 if (Py_ISDIGIT((unsigned)*f)) {
2721 width = *f - '0';
Victor Stinner96865452011-03-01 23:44:09 +00002722 f++;
Victor Stinnere215d962012-10-06 23:03:36 +02002723 while (Py_ISDIGIT((unsigned)*f)) {
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002724 if (width > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
Victor Stinner3921e902012-10-06 23:05:00 +02002725 PyErr_SetString(PyExc_ValueError,
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002726 "width too big");
Victor Stinner3921e902012-10-06 23:05:00 +02002727 return NULL;
2728 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002729 width = (width * 10) + (*f - '0');
Victor Stinnere215d962012-10-06 23:03:36 +02002730 f++;
2731 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002732 }
2733 precision = -1;
2734 if (*f == '.') {
2735 f++;
2736 if (Py_ISDIGIT((unsigned)*f)) {
2737 precision = (*f - '0');
2738 f++;
2739 while (Py_ISDIGIT((unsigned)*f)) {
2740 if (precision > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
2741 PyErr_SetString(PyExc_ValueError,
2742 "precision too big");
2743 return NULL;
2744 }
2745 precision = (precision * 10) + (*f - '0');
2746 f++;
2747 }
2748 }
Victor Stinner96865452011-03-01 23:44:09 +00002749 if (*f == '%') {
2750 /* "%.3%s" => f points to "3" */
2751 f--;
2752 }
2753 }
2754 if (*f == '\0') {
Victor Stinnere215d962012-10-06 23:03:36 +02002755 /* bogus format "%.123" => go backward, f points to "3" */
Victor Stinner96865452011-03-01 23:44:09 +00002756 f--;
2757 }
Victor Stinner96865452011-03-01 23:44:09 +00002758
2759 /* Handle %ld, %lu, %lld and %llu. */
2760 longflag = 0;
2761 longlongflag = 0;
Victor Stinnere7faec12011-03-02 00:01:53 +00002762 size_tflag = 0;
Victor Stinner96865452011-03-01 23:44:09 +00002763 if (*f == 'l') {
Victor Stinner6d970f42011-03-02 00:04:25 +00002764 if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
Victor Stinner96865452011-03-01 23:44:09 +00002765 longflag = 1;
2766 ++f;
2767 }
Victor Stinner96865452011-03-01 23:44:09 +00002768 else if (f[1] == 'l' &&
Victor Stinner6d970f42011-03-02 00:04:25 +00002769 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002770 longlongflag = 1;
2771 f += 2;
2772 }
Victor Stinner96865452011-03-01 23:44:09 +00002773 }
2774 /* handle the size_t flag. */
Victor Stinner6d970f42011-03-02 00:04:25 +00002775 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002776 size_tflag = 1;
2777 ++f;
2778 }
Victor Stinnere215d962012-10-06 23:03:36 +02002779
2780 if (f[1] == '\0')
2781 writer->overallocate = 0;
2782
2783 switch (*f) {
2784 case 'c':
2785 {
2786 int ordinal = va_arg(*vargs, int);
Victor Stinnerff5a8482012-10-06 23:05:45 +02002787 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Serhiy Storchakac89533f2013-06-23 20:21:16 +03002788 PyErr_SetString(PyExc_OverflowError,
Victor Stinnerff5a8482012-10-06 23:05:45 +02002789 "character argument not in range(0x110000)");
2790 return NULL;
2791 }
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02002792 if (_PyUnicodeWriter_WriteCharInline(writer, ordinal) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002793 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002794 break;
2795 }
2796
2797 case 'i':
2798 case 'd':
2799 case 'u':
2800 case 'x':
2801 {
2802 /* used by sprintf */
Victor Stinner15a11362012-10-06 23:48:20 +02002803 char buffer[MAX_LONG_LONG_CHARS];
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002804 Py_ssize_t arglen;
Victor Stinnere215d962012-10-06 23:03:36 +02002805
2806 if (*f == 'u') {
Victor Stinnere215d962012-10-06 23:03:36 +02002807 if (longflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002808 len = sprintf(buffer, "%lu",
Victor Stinnere215d962012-10-06 23:03:36 +02002809 va_arg(*vargs, unsigned long));
Victor Stinnere215d962012-10-06 23:03:36 +02002810 else if (longlongflag)
Benjamin Peterson47ff0732016-09-08 09:15:54 -07002811 len = sprintf(buffer, "%llu",
Benjamin Petersonaf580df2016-09-06 10:46:49 -07002812 va_arg(*vargs, unsigned long long));
Victor Stinnere215d962012-10-06 23:03:36 +02002813 else if (size_tflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002814 len = sprintf(buffer, "%" PY_FORMAT_SIZE_T "u",
Victor Stinnere215d962012-10-06 23:03:36 +02002815 va_arg(*vargs, size_t));
2816 else
Victor Stinner3aa979e2014-11-18 21:40:51 +01002817 len = sprintf(buffer, "%u",
Victor Stinnere215d962012-10-06 23:03:36 +02002818 va_arg(*vargs, unsigned int));
2819 }
2820 else if (*f == 'x') {
Victor Stinner3aa979e2014-11-18 21:40:51 +01002821 len = sprintf(buffer, "%x", va_arg(*vargs, int));
Victor Stinnere215d962012-10-06 23:03:36 +02002822 }
2823 else {
Victor Stinnere215d962012-10-06 23:03:36 +02002824 if (longflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002825 len = sprintf(buffer, "%li",
Victor Stinnere215d962012-10-06 23:03:36 +02002826 va_arg(*vargs, long));
Victor Stinnere215d962012-10-06 23:03:36 +02002827 else if (longlongflag)
Benjamin Peterson47ff0732016-09-08 09:15:54 -07002828 len = sprintf(buffer, "%lli",
Benjamin Petersonaf580df2016-09-06 10:46:49 -07002829 va_arg(*vargs, long long));
Victor Stinnere215d962012-10-06 23:03:36 +02002830 else if (size_tflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002831 len = sprintf(buffer, "%" PY_FORMAT_SIZE_T "i",
Victor Stinnere215d962012-10-06 23:03:36 +02002832 va_arg(*vargs, Py_ssize_t));
2833 else
Victor Stinner3aa979e2014-11-18 21:40:51 +01002834 len = sprintf(buffer, "%i",
Victor Stinnere215d962012-10-06 23:03:36 +02002835 va_arg(*vargs, int));
2836 }
2837 assert(len >= 0);
2838
Victor Stinnere215d962012-10-06 23:03:36 +02002839 if (precision < len)
2840 precision = len;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002841
2842 arglen = Py_MAX(precision, width);
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002843 if (_PyUnicodeWriter_Prepare(writer, arglen, 127) == -1)
2844 return NULL;
2845
Victor Stinnere215d962012-10-06 23:03:36 +02002846 if (width > precision) {
2847 Py_UCS4 fillchar;
2848 fill = width - precision;
2849 fillchar = zeropad?'0':' ';
Victor Stinner15a11362012-10-06 23:48:20 +02002850 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, fillchar) == -1)
2851 return NULL;
2852 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002853 }
Victor Stinner15a11362012-10-06 23:48:20 +02002854 if (precision > len) {
Victor Stinnere215d962012-10-06 23:03:36 +02002855 fill = precision - len;
Victor Stinner15a11362012-10-06 23:48:20 +02002856 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, '0') == -1)
2857 return NULL;
2858 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002859 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002860
Victor Stinner4a587072013-11-19 12:54:53 +01002861 if (_PyUnicodeWriter_WriteASCIIString(writer, buffer, len) < 0)
2862 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002863 break;
2864 }
2865
2866 case 'p':
2867 {
2868 char number[MAX_LONG_LONG_CHARS];
2869
2870 len = sprintf(number, "%p", va_arg(*vargs, void*));
2871 assert(len >= 0);
2872
2873 /* %p is ill-defined: ensure leading 0x. */
2874 if (number[1] == 'X')
2875 number[1] = 'x';
2876 else if (number[1] != 'x') {
2877 memmove(number + 2, number,
2878 strlen(number) + 1);
2879 number[0] = '0';
2880 number[1] = 'x';
2881 len += 2;
2882 }
2883
Victor Stinner4a587072013-11-19 12:54:53 +01002884 if (_PyUnicodeWriter_WriteASCIIString(writer, number, len) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002885 return NULL;
2886 break;
2887 }
2888
2889 case 's':
2890 {
2891 /* UTF-8 */
2892 const char *s = va_arg(*vargs, const char*);
Victor Stinner998b8062018-09-12 00:23:25 +02002893 if (unicode_fromformat_write_cstr(writer, s, width, precision) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002894 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002895 break;
2896 }
2897
2898 case 'U':
2899 {
2900 PyObject *obj = va_arg(*vargs, PyObject *);
2901 assert(obj && _PyUnicode_CHECK(obj));
2902
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002903 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002904 return NULL;
2905 break;
2906 }
2907
2908 case 'V':
2909 {
2910 PyObject *obj = va_arg(*vargs, PyObject *);
2911 const char *str = va_arg(*vargs, const char *);
Victor Stinnere215d962012-10-06 23:03:36 +02002912 if (obj) {
2913 assert(_PyUnicode_CHECK(obj));
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002914 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002915 return NULL;
2916 }
2917 else {
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002918 assert(str != NULL);
Victor Stinner998b8062018-09-12 00:23:25 +02002919 if (unicode_fromformat_write_cstr(writer, str, width, precision) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002920 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002921 }
2922 break;
2923 }
2924
2925 case 'S':
2926 {
2927 PyObject *obj = va_arg(*vargs, PyObject *);
2928 PyObject *str;
2929 assert(obj);
2930 str = PyObject_Str(obj);
2931 if (!str)
2932 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002933 if (unicode_fromformat_write_str(writer, str, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002934 Py_DECREF(str);
2935 return NULL;
2936 }
2937 Py_DECREF(str);
2938 break;
2939 }
2940
2941 case 'R':
2942 {
2943 PyObject *obj = va_arg(*vargs, PyObject *);
2944 PyObject *repr;
2945 assert(obj);
2946 repr = PyObject_Repr(obj);
2947 if (!repr)
2948 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002949 if (unicode_fromformat_write_str(writer, repr, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002950 Py_DECREF(repr);
2951 return NULL;
2952 }
2953 Py_DECREF(repr);
2954 break;
2955 }
2956
2957 case 'A':
2958 {
2959 PyObject *obj = va_arg(*vargs, PyObject *);
2960 PyObject *ascii;
2961 assert(obj);
2962 ascii = PyObject_ASCII(obj);
2963 if (!ascii)
2964 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002965 if (unicode_fromformat_write_str(writer, ascii, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002966 Py_DECREF(ascii);
2967 return NULL;
2968 }
2969 Py_DECREF(ascii);
2970 break;
2971 }
2972
2973 case '%':
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02002974 if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002975 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002976 break;
2977
2978 default:
2979 /* if we stumble upon an unknown formatting code, copy the rest
2980 of the format string to the output string. (we cannot just
2981 skip the code, since there's no way to know what's in the
2982 argument list) */
2983 len = strlen(p);
Victor Stinner4a587072013-11-19 12:54:53 +01002984 if (_PyUnicodeWriter_WriteLatin1String(writer, p, len) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002985 return NULL;
2986 f = p+len;
2987 return f;
2988 }
2989
2990 f++;
Victor Stinner96865452011-03-01 23:44:09 +00002991 return f;
2992}
2993
Walter Dörwaldd2034312007-05-18 16:29:38 +00002994PyObject *
2995PyUnicode_FromFormatV(const char *format, va_list vargs)
2996{
Victor Stinnere215d962012-10-06 23:03:36 +02002997 va_list vargs2;
2998 const char *f;
2999 _PyUnicodeWriter writer;
Walter Dörwaldd2034312007-05-18 16:29:38 +00003000
Victor Stinner8f674cc2013-04-17 23:02:17 +02003001 _PyUnicodeWriter_Init(&writer);
3002 writer.min_length = strlen(format) + 100;
3003 writer.overallocate = 1;
Victor Stinnere215d962012-10-06 23:03:36 +02003004
Benjamin Peterson0c212142016-09-20 20:39:33 -07003005 // Copy varags to be able to pass a reference to a subfunction.
3006 va_copy(vargs2, vargs);
Victor Stinnere215d962012-10-06 23:03:36 +02003007
3008 for (f = format; *f; ) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00003009 if (*f == '%') {
Victor Stinnere215d962012-10-06 23:03:36 +02003010 f = unicode_fromformat_arg(&writer, f, &vargs2);
3011 if (f == NULL)
3012 goto fail;
Victor Stinner1205f272010-09-11 00:54:47 +00003013 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003014 else {
Victor Stinnere215d962012-10-06 23:03:36 +02003015 const char *p;
3016 Py_ssize_t len;
Walter Dörwaldd2034312007-05-18 16:29:38 +00003017
Victor Stinnere215d962012-10-06 23:03:36 +02003018 p = f;
3019 do
3020 {
3021 if ((unsigned char)*p > 127) {
3022 PyErr_Format(PyExc_ValueError,
3023 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
3024 "string, got a non-ASCII byte: 0x%02x",
3025 (unsigned char)*p);
Victor Stinner1ddf53d2016-09-21 14:13:14 +02003026 goto fail;
Victor Stinnere215d962012-10-06 23:03:36 +02003027 }
3028 p++;
3029 }
3030 while (*p != '\0' && *p != '%');
3031 len = p - f;
3032
3033 if (*p == '\0')
3034 writer.overallocate = 0;
Victor Stinner4a587072013-11-19 12:54:53 +01003035
3036 if (_PyUnicodeWriter_WriteASCIIString(&writer, f, len) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02003037 goto fail;
Victor Stinnere215d962012-10-06 23:03:36 +02003038
3039 f = p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00003040 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00003041 }
Christian Heimes2f2fee12016-09-21 11:37:27 +02003042 va_end(vargs2);
Victor Stinnere215d962012-10-06 23:03:36 +02003043 return _PyUnicodeWriter_Finish(&writer);
3044
3045 fail:
Christian Heimes2f2fee12016-09-21 11:37:27 +02003046 va_end(vargs2);
Victor Stinnere215d962012-10-06 23:03:36 +02003047 _PyUnicodeWriter_Dealloc(&writer);
Benjamin Peterson14339b62009-01-31 16:36:08 +00003048 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00003049}
3050
Walter Dörwaldd2034312007-05-18 16:29:38 +00003051PyObject *
3052PyUnicode_FromFormat(const char *format, ...)
3053{
Benjamin Peterson14339b62009-01-31 16:36:08 +00003054 PyObject* ret;
3055 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00003056
3057#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00003058 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00003059#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00003060 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00003061#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00003062 ret = PyUnicode_FromFormatV(format, vargs);
3063 va_end(vargs);
3064 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00003065}
3066
Serhiy Storchakac46db922018-10-23 22:58:24 +03003067static Py_ssize_t
3068unicode_get_widechar_size(PyObject *unicode)
3069{
3070 Py_ssize_t res;
3071
3072 assert(unicode != NULL);
3073 assert(_PyUnicode_CHECK(unicode));
3074
3075 if (_PyUnicode_WSTR(unicode) != NULL) {
3076 return PyUnicode_WSTR_LENGTH(unicode);
3077 }
3078 assert(PyUnicode_IS_READY(unicode));
3079
3080 res = _PyUnicode_LENGTH(unicode);
3081#if SIZEOF_WCHAR_T == 2
3082 if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
3083 const Py_UCS4 *s = PyUnicode_4BYTE_DATA(unicode);
3084 const Py_UCS4 *end = s + res;
3085 for (; s < end; ++s) {
3086 if (*s > 0xFFFF) {
3087 ++res;
3088 }
3089 }
3090 }
3091#endif
3092 return res;
3093}
3094
3095static void
3096unicode_copy_as_widechar(PyObject *unicode, wchar_t *w, Py_ssize_t size)
3097{
3098 const wchar_t *wstr;
3099
3100 assert(unicode != NULL);
3101 assert(_PyUnicode_CHECK(unicode));
3102
3103 wstr = _PyUnicode_WSTR(unicode);
3104 if (wstr != NULL) {
3105 memcpy(w, wstr, size * sizeof(wchar_t));
3106 return;
3107 }
3108 assert(PyUnicode_IS_READY(unicode));
3109
3110 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
3111 const Py_UCS1 *s = PyUnicode_1BYTE_DATA(unicode);
3112 for (; size--; ++s, ++w) {
3113 *w = *s;
3114 }
3115 }
3116 else {
3117#if SIZEOF_WCHAR_T == 4
3118 assert(PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND);
3119 const Py_UCS2 *s = PyUnicode_2BYTE_DATA(unicode);
3120 for (; size--; ++s, ++w) {
3121 *w = *s;
3122 }
3123#else
3124 assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
3125 const Py_UCS4 *s = PyUnicode_4BYTE_DATA(unicode);
3126 for (; size--; ++s, ++w) {
3127 Py_UCS4 ch = *s;
3128 if (ch > 0xFFFF) {
3129 assert(ch <= MAX_UNICODE);
3130 /* encode surrogate pair in this case */
3131 *w++ = Py_UNICODE_HIGH_SURROGATE(ch);
3132 if (!size--)
3133 break;
3134 *w = Py_UNICODE_LOW_SURROGATE(ch);
3135 }
3136 else {
3137 *w = ch;
3138 }
3139 }
3140#endif
3141 }
3142}
3143
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003144#ifdef HAVE_WCHAR_H
3145
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003146/* Convert a Unicode object to a wide character string.
Victor Stinner5593d8a2010-10-02 11:11:27 +00003147
Victor Stinnerd88d9832011-09-06 02:00:05 +02003148 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00003149 character) required to convert the unicode object. Ignore size argument.
3150
Victor Stinnerd88d9832011-09-06 02:00:05 +02003151 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00003152 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02003153 the null character). */
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003154Py_ssize_t
3155PyUnicode_AsWideChar(PyObject *unicode,
3156 wchar_t *w,
3157 Py_ssize_t size)
Victor Stinner137c34c2010-09-29 10:25:54 +00003158{
Victor Stinner5593d8a2010-10-02 11:11:27 +00003159 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003160
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003161 if (unicode == NULL) {
3162 PyErr_BadInternalCall();
3163 return -1;
3164 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003165 if (!PyUnicode_Check(unicode)) {
3166 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003167 return -1;
Victor Stinner5593d8a2010-10-02 11:11:27 +00003168 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003169
3170 res = unicode_get_widechar_size(unicode);
3171 if (w == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003172 return res + 1;
Serhiy Storchakac46db922018-10-23 22:58:24 +03003173 }
3174
3175 if (size > res) {
3176 size = res + 1;
3177 }
3178 else {
3179 res = size;
3180 }
3181 unicode_copy_as_widechar(unicode, w, size);
3182 return res;
Victor Stinner137c34c2010-09-29 10:25:54 +00003183}
3184
Victor Stinner137c34c2010-09-29 10:25:54 +00003185wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00003186PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00003187 Py_ssize_t *size)
3188{
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003189 wchar_t *buffer;
Victor Stinner137c34c2010-09-29 10:25:54 +00003190 Py_ssize_t buflen;
3191
3192 if (unicode == NULL) {
3193 PyErr_BadInternalCall();
3194 return NULL;
3195 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003196 if (!PyUnicode_Check(unicode)) {
3197 PyErr_BadArgument();
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003198 return NULL;
3199 }
3200
Serhiy Storchakac46db922018-10-23 22:58:24 +03003201 buflen = unicode_get_widechar_size(unicode);
3202 buffer = (wchar_t *) PyMem_NEW(wchar_t, (buflen + 1));
Victor Stinner137c34c2010-09-29 10:25:54 +00003203 if (buffer == NULL) {
3204 PyErr_NoMemory();
3205 return NULL;
3206 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003207 unicode_copy_as_widechar(unicode, buffer, buflen + 1);
3208 if (size != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00003209 *size = buflen;
Serhiy Storchakac46db922018-10-23 22:58:24 +03003210 }
3211 else if (wcslen(buffer) != (size_t)buflen) {
3212 PyMem_FREE(buffer);
3213 PyErr_SetString(PyExc_ValueError,
3214 "embedded null character");
3215 return NULL;
3216 }
Victor Stinner137c34c2010-09-29 10:25:54 +00003217 return buffer;
3218}
3219
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003220#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003221
Alexander Belopolsky40018472011-02-26 01:02:56 +00003222PyObject *
3223PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003224{
Victor Stinner8faf8212011-12-08 22:14:11 +01003225 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003226 PyErr_SetString(PyExc_ValueError,
3227 "chr() arg not in range(0x110000)");
3228 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003229 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00003230
Victor Stinner985a82a2014-01-03 12:53:47 +01003231 return unicode_char((Py_UCS4)ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003232}
3233
Alexander Belopolsky40018472011-02-26 01:02:56 +00003234PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02003235PyUnicode_FromObject(PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003236{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003237 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00003238 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003239 if (PyUnicode_CheckExact(obj)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05003240 if (PyUnicode_READY(obj) == -1)
Victor Stinnerd3a83d52011-10-01 03:09:33 +02003241 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00003242 Py_INCREF(obj);
3243 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003244 }
3245 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003246 /* For a Unicode subtype that's not a Unicode object,
3247 return a true Unicode object with the same data. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +01003248 return _PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003249 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003250 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003251 "Can't convert '%.100s' object to str implicitly",
3252 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00003253 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003254}
3255
Alexander Belopolsky40018472011-02-26 01:02:56 +00003256PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02003257PyUnicode_FromEncodedObject(PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003258 const char *encoding,
3259 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003260{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003261 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003262 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00003263
Guido van Rossumd57fd912000-03-10 22:53:23 +00003264 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003265 PyErr_BadInternalCall();
3266 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003267 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003268
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003269 /* Decoding bytes objects is the most common case and should be fast */
3270 if (PyBytes_Check(obj)) {
Victor Stinner22eb6892019-06-26 00:51:05 +02003271 if (PyBytes_GET_SIZE(obj) == 0) {
3272 if (unicode_check_encoding_errors(encoding, errors) < 0) {
3273 return NULL;
3274 }
Serhiy Storchaka05997252013-01-26 12:14:02 +02003275 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner22eb6892019-06-26 00:51:05 +02003276 }
3277 return PyUnicode_Decode(
Serhiy Storchaka05997252013-01-26 12:14:02 +02003278 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
3279 encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003280 }
3281
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003282 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003283 PyErr_SetString(PyExc_TypeError,
3284 "decoding str is not supported");
3285 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00003286 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003287
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003288 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
3289 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
3290 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003291 "decoding to str: need a bytes-like object, %.80s found",
3292 Py_TYPE(obj)->tp_name);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003293 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00003294 }
Tim Petersced69f82003-09-16 20:30:58 +00003295
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003296 if (buffer.len == 0) {
Serhiy Storchaka05997252013-01-26 12:14:02 +02003297 PyBuffer_Release(&buffer);
Victor Stinner22eb6892019-06-26 00:51:05 +02003298 if (unicode_check_encoding_errors(encoding, errors) < 0) {
3299 return NULL;
3300 }
Serhiy Storchaka05997252013-01-26 12:14:02 +02003301 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +00003302 }
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00003303
Serhiy Storchaka05997252013-01-26 12:14:02 +02003304 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003305 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003306 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003307}
3308
/* Normalize an encoding name: similar to encodings.normalize_encoding(), but
   also convert to lowercase.

   Alphanumeric characters and '.' are copied in lowercase.  Every run of
   other characters becomes a single '_', except that a leading run is
   dropped and a trailing run produces nothing.

   The result is written, null-terminated, into lower, which holds lower_len
   bytes.  Return 1 on success, or 0 on error (encoding is longer than
   lower_len-1). */
int
_Py_normalize_encoding(const char *encoding,
                       char *lower,
                       size_t lower_len)
{
    const char *src;
    char *out = lower;
    char *out_end = &lower[lower_len - 1];  /* reserved for the final '\0' */
    int pending_sep = 0;    /* punctuation seen since the last copied char */

    assert(encoding != NULL);

    for (src = encoding; *src != '\0'; src++) {
        char c = *src;

        if (!Py_ISALNUM(c) && c != '.') {
            pending_sep = 1;
            continue;
        }

        /* Emit the separator lazily, and never at the very start. */
        if (pending_sep && out != lower) {
            if (out == out_end) {
                return 0;
            }
            *out++ = '_';
        }
        pending_sep = 0;

        if (out == out_end) {
            return 0;
        }
        *out++ = Py_TOLOWER(c);
    }
    *out = '\0';
    return 1;
}
3357
Alexander Belopolsky40018472011-02-26 01:02:56 +00003358PyObject *
3359PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003360 Py_ssize_t size,
3361 const char *encoding,
3362 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00003363{
3364 PyObject *buffer = NULL, *unicode;
3365 Py_buffer info;
Victor Stinner942889a2016-09-05 15:40:10 -07003366 char buflower[11]; /* strlen("iso-8859-1\0") == 11, longest shortcut */
3367
Victor Stinner22eb6892019-06-26 00:51:05 +02003368 if (unicode_check_encoding_errors(encoding, errors) < 0) {
3369 return NULL;
3370 }
3371
Victor Stinnered076ed2019-06-26 01:49:32 +02003372 if (size == 0) {
3373 _Py_RETURN_UNICODE_EMPTY();
3374 }
3375
Victor Stinner942889a2016-09-05 15:40:10 -07003376 if (encoding == NULL) {
3377 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3378 }
Victor Stinner600d3be2010-06-10 12:00:55 +00003379
Fred Drakee4315f52000-05-09 19:53:39 +00003380 /* Shortcuts for common default encodings */
Victor Stinner942889a2016-09-05 15:40:10 -07003381 if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3382 char *lower = buflower;
3383
3384 /* Fast paths */
3385 if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3386 lower += 3;
3387 if (*lower == '_') {
3388 /* Match "utf8" and "utf_8" */
3389 lower++;
3390 }
3391
3392 if (lower[0] == '8' && lower[1] == 0) {
3393 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3394 }
3395 else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3396 return PyUnicode_DecodeUTF16(s, size, errors, 0);
3397 }
3398 else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3399 return PyUnicode_DecodeUTF32(s, size, errors, 0);
3400 }
3401 }
3402 else {
3403 if (strcmp(lower, "ascii") == 0
3404 || strcmp(lower, "us_ascii") == 0) {
3405 return PyUnicode_DecodeASCII(s, size, errors);
3406 }
Steve Dowercc16be82016-09-08 10:35:16 -07003407 #ifdef MS_WINDOWS
Victor Stinner942889a2016-09-05 15:40:10 -07003408 else if (strcmp(lower, "mbcs") == 0) {
3409 return PyUnicode_DecodeMBCS(s, size, errors);
3410 }
3411 #endif
3412 else if (strcmp(lower, "latin1") == 0
3413 || strcmp(lower, "latin_1") == 0
3414 || strcmp(lower, "iso_8859_1") == 0
3415 || strcmp(lower, "iso8859_1") == 0) {
3416 return PyUnicode_DecodeLatin1(s, size, errors);
3417 }
3418 }
Victor Stinner37296e82010-06-10 13:36:23 +00003419 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003420
3421 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003422 buffer = NULL;
Benjamin Peterson95905ce2020-02-11 19:36:14 -08003423 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003424 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00003425 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003426 if (buffer == NULL)
3427 goto onError;
Nick Coghlanc72e4e62013-11-22 22:39:36 +10003428 unicode = _PyCodec_DecodeText(buffer, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003429 if (unicode == NULL)
3430 goto onError;
3431 if (!PyUnicode_Check(unicode)) {
3432 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003433 "'%.400s' decoder returned '%.400s' instead of 'str'; "
Nick Coghlan8b097b42013-11-13 23:49:21 +10003434 "use codecs.decode() to decode to arbitrary types",
Victor Stinner998b8062018-09-12 00:23:25 +02003435 encoding,
3436 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003437 Py_DECREF(unicode);
3438 goto onError;
3439 }
3440 Py_DECREF(buffer);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003441 return unicode_result(unicode);
Tim Petersced69f82003-09-16 20:30:58 +00003442
Benjamin Peterson29060642009-01-31 22:14:21 +00003443 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003444 Py_XDECREF(buffer);
3445 return NULL;
3446}
3447
Alexander Belopolsky40018472011-02-26 01:02:56 +00003448PyObject *
3449PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003450 const char *encoding,
3451 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003452{
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003453 if (!PyUnicode_Check(unicode)) {
3454 PyErr_BadArgument();
Serhiy Storchaka77eede32016-10-25 10:07:51 +03003455 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003456 }
3457
Serhiy Storchaka00939072016-10-27 21:05:49 +03003458 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3459 "PyUnicode_AsDecodedObject() is deprecated; "
3460 "use PyCodec_Decode() to decode from str", 1) < 0)
3461 return NULL;
3462
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003463 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003464 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003465
3466 /* Decode via the codec registry */
Serhiy Storchaka77eede32016-10-25 10:07:51 +03003467 return PyCodec_Decode(unicode, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003468}
3469
Alexander Belopolsky40018472011-02-26 01:02:56 +00003470PyObject *
3471PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003472 const char *encoding,
3473 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003474{
3475 PyObject *v;
3476
3477 if (!PyUnicode_Check(unicode)) {
3478 PyErr_BadArgument();
3479 goto onError;
3480 }
3481
Serhiy Storchaka00939072016-10-27 21:05:49 +03003482 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3483 "PyUnicode_AsDecodedUnicode() is deprecated; "
3484 "use PyCodec_Decode() to decode from str to str", 1) < 0)
3485 return NULL;
3486
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003487 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003488 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003489
3490 /* Decode via the codec registry */
3491 v = PyCodec_Decode(unicode, encoding, errors);
3492 if (v == NULL)
3493 goto onError;
3494 if (!PyUnicode_Check(v)) {
3495 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003496 "'%.400s' decoder returned '%.400s' instead of 'str'; "
Nick Coghlan8b097b42013-11-13 23:49:21 +10003497 "use codecs.decode() to decode to arbitrary types",
Victor Stinner998b8062018-09-12 00:23:25 +02003498 encoding,
3499 Py_TYPE(unicode)->tp_name);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003500 Py_DECREF(v);
3501 goto onError;
3502 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003503 return unicode_result(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003504
Benjamin Peterson29060642009-01-31 22:14:21 +00003505 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003506 return NULL;
3507}
3508
Alexander Belopolsky40018472011-02-26 01:02:56 +00003509PyObject *
3510PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003511 Py_ssize_t size,
3512 const char *encoding,
3513 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003514{
3515 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00003516
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02003517 unicode = PyUnicode_FromWideChar(s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003518 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003519 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003520 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
3521 Py_DECREF(unicode);
3522 return v;
3523}
3524
Alexander Belopolsky40018472011-02-26 01:02:56 +00003525PyObject *
3526PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003527 const char *encoding,
3528 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003529{
3530 PyObject *v;
3531
3532 if (!PyUnicode_Check(unicode)) {
3533 PyErr_BadArgument();
3534 goto onError;
3535 }
3536
Serhiy Storchaka00939072016-10-27 21:05:49 +03003537 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3538 "PyUnicode_AsEncodedObject() is deprecated; "
3539 "use PyUnicode_AsEncodedString() to encode from str to bytes "
3540 "or PyCodec_Encode() for generic encoding", 1) < 0)
3541 return NULL;
3542
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003543 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003544 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003545
3546 /* Encode via the codec registry */
3547 v = PyCodec_Encode(unicode, encoding, errors);
3548 if (v == NULL)
3549 goto onError;
3550 return v;
3551
Benjamin Peterson29060642009-01-31 22:14:21 +00003552 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003553 return NULL;
3554}
3555
Victor Stinner1b579672011-12-17 05:47:23 +01003556
Victor Stinner2cba6b82018-01-10 22:46:15 +01003557static PyObject *
Victor Stinner709d23d2019-05-02 14:56:30 -04003558unicode_encode_locale(PyObject *unicode, _Py_error_handler error_handler,
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003559 int current_locale)
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003560{
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003561 Py_ssize_t wlen;
3562 wchar_t *wstr = PyUnicode_AsWideCharString(unicode, &wlen);
3563 if (wstr == NULL) {
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003564 return NULL;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003565 }
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003566
Victor Stinnerbde9d6b2018-11-28 10:26:20 +01003567 if ((size_t)wlen != wcslen(wstr)) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003568 PyErr_SetString(PyExc_ValueError, "embedded null character");
Victor Stinnerbde9d6b2018-11-28 10:26:20 +01003569 PyMem_Free(wstr);
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003570 return NULL;
3571 }
3572
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003573 char *str;
3574 size_t error_pos;
3575 const char *reason;
3576 int res = _Py_EncodeLocaleEx(wstr, &str, &error_pos, &reason,
Victor Stinner3d4226a2018-08-29 22:21:32 +02003577 current_locale, error_handler);
Victor Stinnerbde9d6b2018-11-28 10:26:20 +01003578 PyMem_Free(wstr);
3579
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003580 if (res != 0) {
3581 if (res == -2) {
3582 PyObject *exc;
3583 exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnns",
3584 "locale", unicode,
3585 (Py_ssize_t)error_pos,
3586 (Py_ssize_t)(error_pos+1),
3587 reason);
3588 if (exc != NULL) {
3589 PyCodec_StrictErrors(exc);
3590 Py_DECREF(exc);
3591 }
Victor Stinner2cba6b82018-01-10 22:46:15 +01003592 }
Victor Stinner3d4226a2018-08-29 22:21:32 +02003593 else if (res == -3) {
3594 PyErr_SetString(PyExc_ValueError, "unsupported error handler");
3595 }
Victor Stinner2cba6b82018-01-10 22:46:15 +01003596 else {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003597 PyErr_NoMemory();
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003598 }
Victor Stinnerbde9d6b2018-11-28 10:26:20 +01003599 return NULL;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003600 }
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003601
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003602 PyObject *bytes = PyBytes_FromString(str);
3603 PyMem_RawFree(str);
3604 return bytes;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003605}
3606
Victor Stinnerad158722010-10-27 00:25:46 +00003607PyObject *
Victor Stinner2cba6b82018-01-10 22:46:15 +01003608PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
3609{
Victor Stinner709d23d2019-05-02 14:56:30 -04003610 _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
3611 return unicode_encode_locale(unicode, error_handler, 1);
Victor Stinner2cba6b82018-01-10 22:46:15 +01003612}
3613
3614PyObject *
Victor Stinnerad158722010-10-27 00:25:46 +00003615PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00003616{
Victor Stinnercaba55b2018-08-03 15:33:52 +02003617 PyInterpreterState *interp = _PyInterpreterState_GET_UNSAFE();
Victor Stinnerbf305cc2020-02-05 17:39:57 +01003618 if (interp->fs_codec.utf8) {
Victor Stinner709d23d2019-05-02 14:56:30 -04003619 return unicode_encode_utf8(unicode,
3620 interp->fs_codec.error_handler,
3621 interp->fs_codec.errors);
3622 }
Victor Stinnerbf305cc2020-02-05 17:39:57 +01003623#ifndef _Py_FORCE_UTF8_FS_ENCODING
3624 else if (interp->fs_codec.encoding) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00003625 return PyUnicode_AsEncodedString(unicode,
Victor Stinner709d23d2019-05-02 14:56:30 -04003626 interp->fs_codec.encoding,
3627 interp->fs_codec.errors);
Victor Stinnerc39211f2010-09-29 16:35:47 +00003628 }
Victor Stinnerad158722010-10-27 00:25:46 +00003629#endif
Victor Stinnerbf305cc2020-02-05 17:39:57 +01003630 else {
3631 /* Before _PyUnicode_InitEncodings() is called, the Python codec
3632 machinery is not ready and so cannot be used:
3633 use wcstombs() in this case. */
3634 const wchar_t *filesystem_errors = interp->config.filesystem_errors;
3635 assert(filesystem_errors != NULL);
3636 _Py_error_handler errors = get_error_handler_wide(filesystem_errors);
3637 assert(errors != _Py_ERROR_UNKNOWN);
3638#ifdef _Py_FORCE_UTF8_FS_ENCODING
3639 return unicode_encode_utf8(unicode, errors, NULL);
3640#else
3641 return unicode_encode_locale(unicode, errors, 0);
3642#endif
3643 }
Victor Stinnerae6265f2010-05-15 16:27:27 +00003644}
3645
Alexander Belopolsky40018472011-02-26 01:02:56 +00003646PyObject *
3647PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003648 const char *encoding,
3649 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003650{
3651 PyObject *v;
Victor Stinner942889a2016-09-05 15:40:10 -07003652 char buflower[11]; /* strlen("iso_8859_1\0") == 11, longest shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00003653
Guido van Rossumd57fd912000-03-10 22:53:23 +00003654 if (!PyUnicode_Check(unicode)) {
3655 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003656 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003657 }
Fred Drakee4315f52000-05-09 19:53:39 +00003658
Victor Stinner22eb6892019-06-26 00:51:05 +02003659 if (unicode_check_encoding_errors(encoding, errors) < 0) {
3660 return NULL;
3661 }
3662
Victor Stinner942889a2016-09-05 15:40:10 -07003663 if (encoding == NULL) {
3664 return _PyUnicode_AsUTF8String(unicode, errors);
3665 }
3666
Fred Drakee4315f52000-05-09 19:53:39 +00003667 /* Shortcuts for common default encodings */
Victor Stinner942889a2016-09-05 15:40:10 -07003668 if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3669 char *lower = buflower;
3670
3671 /* Fast paths */
3672 if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3673 lower += 3;
3674 if (*lower == '_') {
3675 /* Match "utf8" and "utf_8" */
3676 lower++;
3677 }
3678
3679 if (lower[0] == '8' && lower[1] == 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003680 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinner942889a2016-09-05 15:40:10 -07003681 }
3682 else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3683 return _PyUnicode_EncodeUTF16(unicode, errors, 0);
3684 }
3685 else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3686 return _PyUnicode_EncodeUTF32(unicode, errors, 0);
3687 }
Victor Stinnera5c68c32011-03-02 01:03:14 +00003688 }
Victor Stinner942889a2016-09-05 15:40:10 -07003689 else {
3690 if (strcmp(lower, "ascii") == 0
3691 || strcmp(lower, "us_ascii") == 0) {
3692 return _PyUnicode_AsASCIIString(unicode, errors);
3693 }
Steve Dowercc16be82016-09-08 10:35:16 -07003694#ifdef MS_WINDOWS
Victor Stinner942889a2016-09-05 15:40:10 -07003695 else if (strcmp(lower, "mbcs") == 0) {
3696 return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
3697 }
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003698#endif
Victor Stinner942889a2016-09-05 15:40:10 -07003699 else if (strcmp(lower, "latin1") == 0 ||
3700 strcmp(lower, "latin_1") == 0 ||
3701 strcmp(lower, "iso_8859_1") == 0 ||
3702 strcmp(lower, "iso8859_1") == 0) {
3703 return _PyUnicode_AsLatin1String(unicode, errors);
3704 }
3705 }
Victor Stinner37296e82010-06-10 13:36:23 +00003706 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003707
3708 /* Encode via the codec registry */
Nick Coghlanc72e4e62013-11-22 22:39:36 +10003709 v = _PyCodec_EncodeText(unicode, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003710 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003711 return NULL;
3712
3713 /* The normal path */
3714 if (PyBytes_Check(v))
3715 return v;
3716
3717 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003718 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003719 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003720 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003721
3722 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003723 "encoder %s returned bytearray instead of bytes; "
3724 "use codecs.encode() to encode to arbitrary types",
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003725 encoding);
3726 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003727 Py_DECREF(v);
3728 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003729 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003730
Serhiy Storchakafff9a312017-03-21 08:53:25 +02003731 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v),
3732 PyByteArray_GET_SIZE(v));
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003733 Py_DECREF(v);
3734 return b;
3735 }
3736
3737 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003738 "'%.400s' encoder returned '%.400s' instead of 'bytes'; "
Nick Coghlan8b097b42013-11-13 23:49:21 +10003739 "use codecs.encode() to encode to arbitrary types",
Victor Stinner998b8062018-09-12 00:23:25 +02003740 encoding,
3741 Py_TYPE(v)->tp_name);
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003742 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003743 return NULL;
3744}
3745
Alexander Belopolsky40018472011-02-26 01:02:56 +00003746PyObject *
3747PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003748 const char *encoding,
3749 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003750{
3751 PyObject *v;
3752
3753 if (!PyUnicode_Check(unicode)) {
3754 PyErr_BadArgument();
3755 goto onError;
3756 }
3757
Serhiy Storchaka00939072016-10-27 21:05:49 +03003758 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3759 "PyUnicode_AsEncodedUnicode() is deprecated; "
3760 "use PyCodec_Encode() to encode from str to str", 1) < 0)
3761 return NULL;
3762
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003763 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003764 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003765
3766 /* Encode via the codec registry */
3767 v = PyCodec_Encode(unicode, encoding, errors);
3768 if (v == NULL)
3769 goto onError;
3770 if (!PyUnicode_Check(v)) {
3771 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003772 "'%.400s' encoder returned '%.400s' instead of 'str'; "
Nick Coghlan8b097b42013-11-13 23:49:21 +10003773 "use codecs.encode() to encode to arbitrary types",
Victor Stinner998b8062018-09-12 00:23:25 +02003774 encoding,
3775 Py_TYPE(v)->tp_name);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003776 Py_DECREF(v);
3777 goto onError;
3778 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003779 return v;
Tim Petersced69f82003-09-16 20:30:58 +00003780
Benjamin Peterson29060642009-01-31 22:14:21 +00003781 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003782 return NULL;
3783}
3784
Victor Stinner2cba6b82018-01-10 22:46:15 +01003785static PyObject*
Victor Stinner709d23d2019-05-02 14:56:30 -04003786unicode_decode_locale(const char *str, Py_ssize_t len,
3787 _Py_error_handler errors, int current_locale)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003788{
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003789 if (str[len] != '\0' || (size_t)len != strlen(str)) {
3790 PyErr_SetString(PyExc_ValueError, "embedded null byte");
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003791 return NULL;
3792 }
3793
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003794 wchar_t *wstr;
3795 size_t wlen;
3796 const char *reason;
3797 int res = _Py_DecodeLocaleEx(str, &wstr, &wlen, &reason,
Victor Stinner709d23d2019-05-02 14:56:30 -04003798 current_locale, errors);
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003799 if (res != 0) {
3800 if (res == -2) {
3801 PyObject *exc;
3802 exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nns",
3803 "locale", str, len,
3804 (Py_ssize_t)wlen,
3805 (Py_ssize_t)(wlen + 1),
3806 reason);
3807 if (exc != NULL) {
3808 PyCodec_StrictErrors(exc);
3809 Py_DECREF(exc);
3810 }
Victor Stinner2cba6b82018-01-10 22:46:15 +01003811 }
Victor Stinner3d4226a2018-08-29 22:21:32 +02003812 else if (res == -3) {
3813 PyErr_SetString(PyExc_ValueError, "unsupported error handler");
3814 }
Victor Stinner2cba6b82018-01-10 22:46:15 +01003815 else {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003816 PyErr_NoMemory();
Victor Stinner2cba6b82018-01-10 22:46:15 +01003817 }
Victor Stinner2f197072011-12-17 07:08:30 +01003818 return NULL;
Victor Stinner2f197072011-12-17 07:08:30 +01003819 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003820
3821 PyObject *unicode = PyUnicode_FromWideChar(wstr, wlen);
3822 PyMem_RawFree(wstr);
3823 return unicode;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003824}
3825
3826PyObject*
Victor Stinner2cba6b82018-01-10 22:46:15 +01003827PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
3828 const char *errors)
3829{
Victor Stinner709d23d2019-05-02 14:56:30 -04003830 _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
3831 return unicode_decode_locale(str, len, error_handler, 1);
Victor Stinner2cba6b82018-01-10 22:46:15 +01003832}
3833
3834PyObject*
Victor Stinner1b579672011-12-17 05:47:23 +01003835PyUnicode_DecodeLocale(const char *str, const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003836{
3837 Py_ssize_t size = (Py_ssize_t)strlen(str);
Victor Stinner709d23d2019-05-02 14:56:30 -04003838 _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
3839 return unicode_decode_locale(str, size, error_handler, 1);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003840}
3841
3842
3843PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00003844PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003845 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00003846 return PyUnicode_DecodeFSDefaultAndSize(s, size);
3847}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003848
Christian Heimes5894ba72007-11-04 11:43:14 +00003849PyObject*
3850PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3851{
Victor Stinnercaba55b2018-08-03 15:33:52 +02003852 PyInterpreterState *interp = _PyInterpreterState_GET_UNSAFE();
Victor Stinnerbf305cc2020-02-05 17:39:57 +01003853 if (interp->fs_codec.utf8) {
Victor Stinner709d23d2019-05-02 14:56:30 -04003854 return unicode_decode_utf8(s, size,
3855 interp->fs_codec.error_handler,
3856 interp->fs_codec.errors,
3857 NULL);
3858 }
Victor Stinnerbf305cc2020-02-05 17:39:57 +01003859#ifndef _Py_FORCE_UTF8_FS_ENCODING
3860 else if (interp->fs_codec.encoding) {
Steve Dower78057b42016-11-06 19:35:08 -08003861 return PyUnicode_Decode(s, size,
Victor Stinner709d23d2019-05-02 14:56:30 -04003862 interp->fs_codec.encoding,
3863 interp->fs_codec.errors);
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003864 }
Victor Stinnerad158722010-10-27 00:25:46 +00003865#endif
Victor Stinnerbf305cc2020-02-05 17:39:57 +01003866 else {
3867 /* Before _PyUnicode_InitEncodings() is called, the Python codec
3868 machinery is not ready and so cannot be used:
3869 use mbstowcs() in this case. */
3870 const wchar_t *filesystem_errors = interp->config.filesystem_errors;
3871 assert(filesystem_errors != NULL);
3872 _Py_error_handler errors = get_error_handler_wide(filesystem_errors);
3873 assert(errors != _Py_ERROR_UNKNOWN);
3874#ifdef _Py_FORCE_UTF8_FS_ENCODING
3875 return unicode_decode_utf8(s, size, errors, NULL, NULL);
3876#else
3877 return unicode_decode_locale(s, size, errors, 0);
3878#endif
3879 }
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003880}
3881
Martin v. Löwis011e8422009-05-05 04:43:17 +00003882
3883int
3884PyUnicode_FSConverter(PyObject* arg, void* addr)
3885{
Brett Cannonec6ce872016-09-06 15:50:29 -07003886 PyObject *path = NULL;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003887 PyObject *output = NULL;
3888 Py_ssize_t size;
3889 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003890 if (arg == NULL) {
3891 Py_DECREF(*(PyObject**)addr);
Benjamin Petersona4d33b32015-11-15 21:57:39 -08003892 *(PyObject**)addr = NULL;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003893 return 1;
3894 }
Brett Cannonec6ce872016-09-06 15:50:29 -07003895 path = PyOS_FSPath(arg);
3896 if (path == NULL) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03003897 return 0;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003898 }
Brett Cannonec6ce872016-09-06 15:50:29 -07003899 if (PyBytes_Check(path)) {
3900 output = path;
3901 }
3902 else { // PyOS_FSPath() guarantees its returned value is bytes or str.
3903 output = PyUnicode_EncodeFSDefault(path);
3904 Py_DECREF(path);
3905 if (!output) {
3906 return 0;
3907 }
3908 assert(PyBytes_Check(output));
3909 }
3910
Victor Stinner0ea2a462010-04-30 00:22:08 +00003911 size = PyBytes_GET_SIZE(output);
3912 data = PyBytes_AS_STRING(output);
Victor Stinner12174a52014-08-15 23:17:38 +02003913 if ((size_t)size != strlen(data)) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003914 PyErr_SetString(PyExc_ValueError, "embedded null byte");
Martin v. Löwis011e8422009-05-05 04:43:17 +00003915 Py_DECREF(output);
3916 return 0;
3917 }
3918 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003919 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003920}
3921
3922
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003923int
3924PyUnicode_FSDecoder(PyObject* arg, void* addr)
3925{
Brett Cannona5711202016-09-06 19:36:01 -07003926 int is_buffer = 0;
3927 PyObject *path = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003928 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003929 if (arg == NULL) {
3930 Py_DECREF(*(PyObject**)addr);
Serhiy Storchaka40db90c2017-04-20 21:19:31 +03003931 *(PyObject**)addr = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003932 return 1;
3933 }
Brett Cannona5711202016-09-06 19:36:01 -07003934
3935 is_buffer = PyObject_CheckBuffer(arg);
3936 if (!is_buffer) {
3937 path = PyOS_FSPath(arg);
3938 if (path == NULL) {
Serhiy Storchakafebc3322016-08-06 23:29:29 +03003939 return 0;
3940 }
Brett Cannona5711202016-09-06 19:36:01 -07003941 }
3942 else {
3943 path = arg;
3944 Py_INCREF(arg);
3945 }
3946
3947 if (PyUnicode_Check(path)) {
Brett Cannona5711202016-09-06 19:36:01 -07003948 output = path;
3949 }
3950 else if (PyBytes_Check(path) || is_buffer) {
3951 PyObject *path_bytes = NULL;
3952
3953 if (!PyBytes_Check(path) &&
3954 PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
Victor Stinner998b8062018-09-12 00:23:25 +02003955 "path should be string, bytes, or os.PathLike, not %.200s",
3956 Py_TYPE(arg)->tp_name)) {
3957 Py_DECREF(path);
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003958 return 0;
Brett Cannona5711202016-09-06 19:36:01 -07003959 }
3960 path_bytes = PyBytes_FromObject(path);
3961 Py_DECREF(path);
3962 if (!path_bytes) {
3963 return 0;
3964 }
3965 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(path_bytes),
3966 PyBytes_GET_SIZE(path_bytes));
3967 Py_DECREF(path_bytes);
3968 if (!output) {
3969 return 0;
3970 }
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003971 }
Serhiy Storchaka9305d832016-06-18 13:53:36 +03003972 else {
3973 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003974 "path should be string, bytes, or os.PathLike, not %.200s",
3975 Py_TYPE(arg)->tp_name);
Brett Cannona5711202016-09-06 19:36:01 -07003976 Py_DECREF(path);
Serhiy Storchaka9305d832016-06-18 13:53:36 +03003977 return 0;
3978 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05003979 if (PyUnicode_READY(output) == -1) {
Victor Stinner065836e2011-10-27 01:56:33 +02003980 Py_DECREF(output);
3981 return 0;
3982 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003983 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02003984 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003985 PyErr_SetString(PyExc_ValueError, "embedded null character");
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003986 Py_DECREF(output);
3987 return 0;
3988 }
3989 *(PyObject**)addr = output;
3990 return Py_CLEANUP_SUPPORTED;
3991}
3992
3993
Inada Naoki02a4d572020-02-27 13:48:59 +09003994static int unicode_fill_utf8(PyObject *unicode);
3995
Serhiy Storchaka2a404b62017-01-22 23:07:07 +02003996const char *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003997PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003998{
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00003999 if (!PyUnicode_Check(unicode)) {
4000 PyErr_BadArgument();
4001 return NULL;
4002 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004003 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00004004 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004005
Victor Stinnere90fe6a2011-10-01 16:48:13 +02004006 if (PyUnicode_UTF8(unicode) == NULL) {
Inada Naoki02a4d572020-02-27 13:48:59 +09004007 if (unicode_fill_utf8(unicode) == -1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004008 return NULL;
4009 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004010 }
4011
4012 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02004013 *psize = PyUnicode_UTF8_LENGTH(unicode);
4014 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00004015}
4016
Serhiy Storchaka2a404b62017-01-22 23:07:07 +02004017const char *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004018PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00004019{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004020 return PyUnicode_AsUTF8AndSize(unicode, NULL);
4021}
4022
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004023Py_UNICODE *
4024PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
4025{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004026 if (!PyUnicode_Check(unicode)) {
4027 PyErr_BadArgument();
4028 return NULL;
4029 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03004030 Py_UNICODE *w = _PyUnicode_WSTR(unicode);
4031 if (w == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004032 /* Non-ASCII compact unicode object */
Serhiy Storchakac46db922018-10-23 22:58:24 +03004033 assert(_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND);
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004034 assert(PyUnicode_IS_READY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004035
Serhiy Storchakac46db922018-10-23 22:58:24 +03004036 Py_ssize_t wlen = unicode_get_widechar_size(unicode);
4037 if ((size_t)wlen > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
4038 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004039 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004040 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03004041 w = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) * (wlen + 1));
4042 if (w == NULL) {
4043 PyErr_NoMemory();
4044 return NULL;
4045 }
4046 unicode_copy_as_widechar(unicode, w, wlen + 1);
4047 _PyUnicode_WSTR(unicode) = w;
4048 if (!PyUnicode_IS_COMPACT_ASCII(unicode)) {
4049 _PyUnicode_WSTR_LENGTH(unicode) = wlen;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004050 }
4051 }
4052 if (size != NULL)
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004053 *size = PyUnicode_WSTR_LENGTH(unicode);
Serhiy Storchakac46db922018-10-23 22:58:24 +03004054 return w;
Martin v. Löwis5b222132007-06-10 09:51:05 +00004055}
4056
Alexander Belopolsky40018472011-02-26 01:02:56 +00004057Py_UNICODE *
4058PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004059{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004060 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004061}
4062
Serhiy Storchakaf7eae0a2017-06-28 08:30:06 +03004063const Py_UNICODE *
4064_PyUnicode_AsUnicode(PyObject *unicode)
4065{
4066 Py_ssize_t size;
4067 const Py_UNICODE *wstr;
4068
4069 wstr = PyUnicode_AsUnicodeAndSize(unicode, &size);
4070 if (wstr && wcslen(wstr) != (size_t)size) {
4071 PyErr_SetString(PyExc_ValueError, "embedded null character");
4072 return NULL;
4073 }
4074 return wstr;
4075}
4076
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004077
Alexander Belopolsky40018472011-02-26 01:02:56 +00004078Py_ssize_t
4079PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004080{
4081 if (!PyUnicode_Check(unicode)) {
4082 PyErr_BadArgument();
4083 goto onError;
4084 }
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02004085 if (_PyUnicode_WSTR(unicode) == NULL) {
4086 if (PyUnicode_AsUnicode(unicode) == NULL)
4087 goto onError;
4088 }
4089 return PyUnicode_WSTR_LENGTH(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004090
Benjamin Peterson29060642009-01-31 22:14:21 +00004091 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004092 return -1;
4093}
4094
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004095Py_ssize_t
4096PyUnicode_GetLength(PyObject *unicode)
4097{
Victor Stinner07621332012-06-16 04:53:46 +02004098 if (!PyUnicode_Check(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004099 PyErr_BadArgument();
4100 return -1;
4101 }
Victor Stinner07621332012-06-16 04:53:46 +02004102 if (PyUnicode_READY(unicode) == -1)
4103 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004104 return PyUnicode_GET_LENGTH(unicode);
4105}
4106
4107Py_UCS4
4108PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
4109{
Victor Stinner69ed0f42013-04-09 21:48:24 +02004110 void *data;
4111 int kind;
4112
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +03004113 if (!PyUnicode_Check(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02004114 PyErr_BadArgument();
4115 return (Py_UCS4)-1;
4116 }
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +03004117 if (PyUnicode_READY(unicode) == -1) {
4118 return (Py_UCS4)-1;
4119 }
Victor Stinnerc4b49542011-12-11 22:44:26 +01004120 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02004121 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004122 return (Py_UCS4)-1;
4123 }
Victor Stinner69ed0f42013-04-09 21:48:24 +02004124 data = PyUnicode_DATA(unicode);
4125 kind = PyUnicode_KIND(unicode);
4126 return PyUnicode_READ(kind, data, index);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004127}
4128
4129int
4130PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
4131{
4132 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02004133 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004134 return -1;
4135 }
Victor Stinner488fa492011-12-12 00:01:39 +01004136 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerc4b49542011-12-11 22:44:26 +01004137 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02004138 PyErr_SetString(PyExc_IndexError, "string index out of range");
4139 return -1;
4140 }
Victor Stinner488fa492011-12-12 00:01:39 +01004141 if (unicode_check_modifiable(unicode))
Victor Stinnercd9950f2011-10-02 00:34:53 +02004142 return -1;
Victor Stinnerc9590ad2012-03-04 01:34:37 +01004143 if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
4144 PyErr_SetString(PyExc_ValueError, "character out of range");
4145 return -1;
4146 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004147 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
4148 index, ch);
4149 return 0;
4150}
4151
/* Return the name of the default encoding, which is always "utf-8". */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
4157
Victor Stinner554f3f02010-06-16 23:33:54 +00004158/* create or adjust a UnicodeDecodeError */
4159static void
4160make_decode_exception(PyObject **exceptionObject,
4161 const char *encoding,
4162 const char *input, Py_ssize_t length,
4163 Py_ssize_t startpos, Py_ssize_t endpos,
4164 const char *reason)
4165{
4166 if (*exceptionObject == NULL) {
4167 *exceptionObject = PyUnicodeDecodeError_Create(
4168 encoding, input, length, startpos, endpos, reason);
4169 }
4170 else {
4171 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
4172 goto onError;
4173 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
4174 goto onError;
4175 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
4176 goto onError;
4177 }
4178 return;
4179
4180onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02004181 Py_CLEAR(*exceptionObject);
Victor Stinner554f3f02010-06-16 23:33:54 +00004182}
4183
Steve Dowercc16be82016-09-08 10:35:16 -07004184#ifdef MS_WINDOWS
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004185static int
4186widechar_resize(wchar_t **buf, Py_ssize_t *size, Py_ssize_t newsize)
4187{
4188 if (newsize > *size) {
4189 wchar_t *newbuf = *buf;
4190 if (PyMem_Resize(newbuf, wchar_t, newsize) == NULL) {
4191 PyErr_NoMemory();
4192 return -1;
4193 }
4194 *buf = newbuf;
4195 }
4196 *size = newsize;
4197 return 0;
4198}
4199
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004200/* error handling callback helper:
4201 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00004202 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004203 and adjust various state variables.
4204 return 0 on success, -1 on error
4205*/
4206
Alexander Belopolsky40018472011-02-26 01:02:56 +00004207static int
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004208unicode_decode_call_errorhandler_wchar(
4209 const char *errors, PyObject **errorHandler,
4210 const char *encoding, const char *reason,
4211 const char **input, const char **inend, Py_ssize_t *startinpos,
4212 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004213 wchar_t **buf, Py_ssize_t *bufsize, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004214{
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004215 static const char *argparse = "Un;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004216
4217 PyObject *restuple = NULL;
4218 PyObject *repunicode = NULL;
Victor Stinner596a6c42011-11-09 00:02:18 +01004219 Py_ssize_t outsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004220 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004221 Py_ssize_t requiredsize;
4222 Py_ssize_t newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004223 PyObject *inputobj = NULL;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004224 wchar_t *repwstr;
4225 Py_ssize_t repwlen;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004226
4227 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004228 *errorHandler = PyCodec_LookupError(errors);
4229 if (*errorHandler == NULL)
4230 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004231 }
4232
Victor Stinner554f3f02010-06-16 23:33:54 +00004233 make_decode_exception(exceptionObject,
4234 encoding,
4235 *input, *inend - *input,
4236 *startinpos, *endinpos,
4237 reason);
4238 if (*exceptionObject == NULL)
4239 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004240
Petr Viktorinffd97532020-02-11 17:46:57 +01004241 restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004242 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004243 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004244 if (!PyTuple_Check(restuple)) {
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004245 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00004246 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004247 }
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004248 if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00004249 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004250
4251 /* Copy back the bytes variables, which might have been modified by the
4252 callback */
4253 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4254 if (!inputobj)
4255 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004256 *input = PyBytes_AS_STRING(inputobj);
4257 insize = PyBytes_GET_SIZE(inputobj);
4258 *inend = *input + insize;
4259 /* we can DECREF safely, as the exception has another reference,
4260 so the object won't go away. */
4261 Py_DECREF(inputobj);
4262
4263 if (newpos<0)
4264 newpos = insize+newpos;
4265 if (newpos<0 || newpos>insize) {
Victor Stinnera33bce02014-07-04 22:47:46 +02004266 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004267 goto onError;
4268 }
4269
4270 repwstr = PyUnicode_AsUnicodeAndSize(repunicode, &repwlen);
4271 if (repwstr == NULL)
4272 goto onError;
4273 /* need more space? (at least enough for what we
4274 have+the replacement+the rest of the string (starting
4275 at the new input position), so we won't have to check space
4276 when there are no errors in the rest of the string) */
Benjamin Peterson2b76ce62014-09-29 18:50:06 -04004277 requiredsize = *outpos;
4278 if (requiredsize > PY_SSIZE_T_MAX - repwlen)
4279 goto overflow;
4280 requiredsize += repwlen;
4281 if (requiredsize > PY_SSIZE_T_MAX - (insize - newpos))
4282 goto overflow;
4283 requiredsize += insize - newpos;
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004284 outsize = *bufsize;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004285 if (requiredsize > outsize) {
Benjamin Peterson2b76ce62014-09-29 18:50:06 -04004286 if (outsize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*outsize)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004287 requiredsize = 2*outsize;
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004288 if (widechar_resize(buf, bufsize, requiredsize) < 0) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004289 goto onError;
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004290 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004291 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004292 wcsncpy(*buf + *outpos, repwstr, repwlen);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004293 *outpos += repwlen;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004294 *endinpos = newpos;
4295 *inptr = *input + newpos;
4296
4297 /* we made it! */
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004298 Py_DECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004299 return 0;
4300
Benjamin Peterson2b76ce62014-09-29 18:50:06 -04004301 overflow:
4302 PyErr_SetString(PyExc_OverflowError,
4303 "decoded result is too long for a Python string");
4304
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004305 onError:
4306 Py_XDECREF(restuple);
4307 return -1;
4308}
Steve Dowercc16be82016-09-08 10:35:16 -07004309#endif /* MS_WINDOWS */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004310
4311static int
4312unicode_decode_call_errorhandler_writer(
4313 const char *errors, PyObject **errorHandler,
4314 const char *encoding, const char *reason,
4315 const char **input, const char **inend, Py_ssize_t *startinpos,
4316 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
4317 _PyUnicodeWriter *writer /* PyObject **output, Py_ssize_t *outpos */)
4318{
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004319 static const char *argparse = "Un;decoding error handler must return (str, int) tuple";
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004320
4321 PyObject *restuple = NULL;
4322 PyObject *repunicode = NULL;
4323 Py_ssize_t insize;
4324 Py_ssize_t newpos;
Victor Stinner170ca6f2013-04-18 00:25:28 +02004325 Py_ssize_t replen;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004326 Py_ssize_t remain;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004327 PyObject *inputobj = NULL;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004328 int need_to_grow = 0;
4329 const char *new_inptr;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004330
4331 if (*errorHandler == NULL) {
4332 *errorHandler = PyCodec_LookupError(errors);
4333 if (*errorHandler == NULL)
4334 goto onError;
4335 }
4336
4337 make_decode_exception(exceptionObject,
4338 encoding,
4339 *input, *inend - *input,
4340 *startinpos, *endinpos,
4341 reason);
4342 if (*exceptionObject == NULL)
4343 goto onError;
4344
Petr Viktorinffd97532020-02-11 17:46:57 +01004345 restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004346 if (restuple == NULL)
4347 goto onError;
4348 if (!PyTuple_Check(restuple)) {
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004349 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004350 goto onError;
4351 }
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004352 if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004353 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004354
4355 /* Copy back the bytes variables, which might have been modified by the
4356 callback */
4357 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4358 if (!inputobj)
4359 goto onError;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004360 remain = *inend - *input - *endinpos;
Christian Heimes72b710a2008-05-26 13:28:38 +00004361 *input = PyBytes_AS_STRING(inputobj);
4362 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004363 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00004364 /* we can DECREF safely, as the exception has another reference,
4365 so the object won't go away. */
4366 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004367
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004368 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004369 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004370 if (newpos<0 || newpos>insize) {
Victor Stinnera33bce02014-07-04 22:47:46 +02004371 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00004372 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004373 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004374
Victor Stinner170ca6f2013-04-18 00:25:28 +02004375 replen = PyUnicode_GET_LENGTH(repunicode);
Serhiy Storchaka7e4b9052015-01-26 01:22:54 +02004376 if (replen > 1) {
4377 writer->min_length += replen - 1;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004378 need_to_grow = 1;
4379 }
4380 new_inptr = *input + newpos;
4381 if (*inend - new_inptr > remain) {
4382 /* We don't know the decoding algorithm here so we make the worst
4383 assumption that one byte decodes to one unicode character.
4384 If unfortunately one byte could decode to more unicode characters,
4385 the decoder may write out-of-bound then. Is it possible for the
4386 algorithms using this function? */
4387 writer->min_length += *inend - new_inptr - remain;
4388 need_to_grow = 1;
4389 }
4390 if (need_to_grow) {
Victor Stinner8f674cc2013-04-17 23:02:17 +02004391 writer->overallocate = 1;
Serhiy Storchakab7e2d672018-02-13 08:27:33 +02004392 if (_PyUnicodeWriter_Prepare(writer, writer->min_length - writer->pos,
Serhiy Storchaka7e4b9052015-01-26 01:22:54 +02004393 PyUnicode_MAX_CHAR_VALUE(repunicode)) == -1)
4394 goto onError;
4395 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004396 if (_PyUnicodeWriter_WriteStr(writer, repunicode) == -1)
Victor Stinner376cfa12013-04-17 23:58:16 +02004397 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004398
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004399 *endinpos = newpos;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004400 *inptr = new_inptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004401
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004402 /* we made it! */
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004403 Py_DECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004404 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004405
Benjamin Peterson29060642009-01-31 22:14:21 +00004406 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004407 Py_XDECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004408 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004409}
4410
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004411/* --- UTF-7 Codec -------------------------------------------------------- */
4412
Antoine Pitrou244651a2009-05-04 18:56:13 +00004413/* See RFC2152 for details. We encode conservatively and decode liberally. */
4414
4415/* Three simple macros defining base-64. */
4416
/* Is c a base-64 character?  True for A-Z, a-z, 0-9, '+' and '/'.
   Note: c is evaluated more than once, so it must have no side effects. */

#define IS_BASE64(c)                        \
    (((c) >= 'A' && (c) <= 'Z')             \
     || ((c) >= 'a' && (c) <= 'z')          \
     || ((c) >= '0' && (c) <= '9')          \
     || (c) == '+'                          \
     || (c) == '/')
4424
/* Given that c is a base-64 character, what is its base-64 value?
   A-Z map to 0-25, a-z to 26-51, 0-9 to 52-61, '+' to 62; anything else
   yields 63, which is correct for '/' under the stated precondition.
   Note: c is evaluated more than once, so it must have no side effects. */

#define FROM_BASE64(c)                                  \
    (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :           \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :      \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :      \
     (c) == '+' ? 62 :                                  \
     63)
4432
/* What is the base-64 character of the bottom 6 bits of n?
   Only the low six bits of n are used; higher bits are masked off. */

#define TO_BASE64(n)                    \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZ"       \
     "abcdefghijklmnopqrstuvwxyz"       \
     "0123456789+/"[(n) & 0x3f])
4437
/* DECODE_DIRECT: does this byte, met outside a shift sequence in a UTF-7
 * string, decode as itself?  Decoding is permissive: every ASCII byte
 * decodes to itself except '+', which begins a base64 string.
 * Note: c is evaluated twice, so it must have no side effects. */

#define DECODE_DIRECT(c)                \
    ((c) < 128 && (c) != '+')
4445
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is read-only, hence const.
 */

static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * c        - character to classify; only 1..127 can ever be direct.
 * directO  - non-zero if Set O characters are emitted as themselves.
 * directWS - non-zero if whitespace is emitted as itself.
 *
 * The flag arguments are parenthesised so that compound expressions
 * (e.g. `a || b`) keep their meaning when combined with `&&` below.
 * Note: c is evaluated more than once, so it must have no side effects. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004491
Alexander Belopolsky40018472011-02-26 01:02:56 +00004492PyObject *
4493PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004494 Py_ssize_t size,
4495 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004496{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004497 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
4498}
4499
Antoine Pitrou244651a2009-05-04 18:56:13 +00004500/* The decoder. The only state we preserve is our read position,
4501 * i.e. how many characters we have consumed. So if we end in the
4502 * middle of a shift sequence we have to back off the read position
4503 * and the output to the beginning of the sequence, otherwise we lose
4504 * all the shift state (seen bits, number of bits seen, high
4505 * surrogate). */
4506
Alexander Belopolsky40018472011-02-26 01:02:56 +00004507PyObject *
4508PyUnicode_DecodeUTF7Stateful(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004509 Py_ssize_t size,
4510 const char *errors,
4511 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004512{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004513 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004514 Py_ssize_t startinpos;
4515 Py_ssize_t endinpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004516 const char *e;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004517 _PyUnicodeWriter writer;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004518 const char *errmsg = "";
4519 int inShift = 0;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004520 Py_ssize_t shiftOutStart;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004521 unsigned int base64bits = 0;
4522 unsigned long base64buffer = 0;
Victor Stinner24729f32011-11-10 20:31:37 +01004523 Py_UCS4 surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004524 PyObject *errorHandler = NULL;
4525 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004526
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004527 if (size == 0) {
4528 if (consumed)
4529 *consumed = 0;
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02004530 _Py_RETURN_UNICODE_EMPTY();
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004531 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004532
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004533 /* Start off assuming it's all ASCII. Widen later as necessary. */
Victor Stinner8f674cc2013-04-17 23:02:17 +02004534 _PyUnicodeWriter_Init(&writer);
4535 writer.min_length = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004536
4537 shiftOutStart = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004538 e = s + size;
4539
4540 while (s < e) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004541 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00004542 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00004543 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004544
Antoine Pitrou244651a2009-05-04 18:56:13 +00004545 if (inShift) { /* in a base-64 section */
4546 if (IS_BASE64(ch)) { /* consume a base-64 character */
4547 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
4548 base64bits += 6;
4549 s++;
4550 if (base64bits >= 16) {
4551 /* we have enough bits for a UTF-16 value */
Victor Stinner24729f32011-11-10 20:31:37 +01004552 Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
Antoine Pitrou244651a2009-05-04 18:56:13 +00004553 base64bits -= 16;
4554 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
Serhiy Storchaka35804e42013-10-19 20:38:19 +03004555 assert(outCh <= 0xffff);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004556 if (surrogate) {
4557 /* expecting a second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004558 if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
4559 Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004560 if (_PyUnicodeWriter_WriteCharInline(&writer, ch2) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004561 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004562 surrogate = 0;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01004563 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004564 }
4565 else {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004566 if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
Antoine Pitrou78edf752011-11-15 01:44:16 +01004567 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004568 surrogate = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004569 }
4570 }
Victor Stinner551ac952011-11-29 22:58:13 +01004571 if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004572 /* first surrogate */
4573 surrogate = outCh;
4574 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004575 else {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004576 if (_PyUnicodeWriter_WriteCharInline(&writer, outCh) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004577 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004578 }
4579 }
4580 }
4581 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004582 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004583 if (base64bits > 0) { /* left-over bits */
4584 if (base64bits >= 6) {
4585 /* We've seen at least one base-64 character */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004586 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004587 errmsg = "partial character in shift sequence";
4588 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004589 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004590 else {
4591 /* Some bits remain; they should be zero */
4592 if (base64buffer != 0) {
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004593 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004594 errmsg = "non-zero padding bits in shift sequence";
4595 goto utf7Error;
4596 }
4597 }
4598 }
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004599 if (surrogate && DECODE_DIRECT(ch)) {
4600 if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
4601 goto onError;
4602 }
4603 surrogate = 0;
4604 if (ch == '-') {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004605 /* '-' is absorbed; other terminating
4606 characters are preserved */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004607 s++;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004608 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004609 }
4610 }
4611 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004612 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004613 s++; /* consume '+' */
4614 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004615 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004616 if (_PyUnicodeWriter_WriteCharInline(&writer, '+') < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004617 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004618 }
Zackery Spytze349bf22018-08-18 22:43:38 -06004619 else if (s < e && !IS_BASE64(*s)) {
4620 s++;
4621 errmsg = "ill-formed sequence";
4622 goto utf7Error;
4623 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004624 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004625 inShift = 1;
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004626 surrogate = 0;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004627 shiftOutStart = writer.pos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004628 base64bits = 0;
Serhiy Storchaka35804e42013-10-19 20:38:19 +03004629 base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004630 }
4631 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004632 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004633 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004634 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004635 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004636 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004637 else {
4638 startinpos = s-starts;
4639 s++;
4640 errmsg = "unexpected special character";
4641 goto utf7Error;
4642 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004643 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004644utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004645 endinpos = s-starts;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004646 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00004647 errors, &errorHandler,
4648 "utf7", errmsg,
4649 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004650 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00004651 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004652 }
4653
Antoine Pitrou244651a2009-05-04 18:56:13 +00004654 /* end of string */
4655
4656 if (inShift && !consumed) { /* in shift sequence, no more to follow */
4657 /* if we're in an inconsistent state, that's an error */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004658 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004659 if (surrogate ||
4660 (base64bits >= 6) ||
4661 (base64bits > 0 && base64buffer != 0)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004662 endinpos = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004663 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrou244651a2009-05-04 18:56:13 +00004664 errors, &errorHandler,
4665 "utf7", "unterminated shift sequence",
4666 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004667 &writer))
Antoine Pitrou244651a2009-05-04 18:56:13 +00004668 goto onError;
4669 if (s < e)
4670 goto restart;
4671 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004672 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004673
4674 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004675 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004676 if (inShift) {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004677 *consumed = startinpos;
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004678 if (writer.pos != shiftOutStart && writer.maxchar > 127) {
Serhiy Storchaka016a3f32014-02-08 14:01:29 +02004679 PyObject *result = PyUnicode_FromKindAndData(
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004680 writer.kind, writer.data, shiftOutStart);
4681 Py_XDECREF(errorHandler);
4682 Py_XDECREF(exc);
4683 _PyUnicodeWriter_Dealloc(&writer);
4684 return result;
Serhiy Storchaka016a3f32014-02-08 14:01:29 +02004685 }
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004686 writer.pos = shiftOutStart; /* back off output */
Antoine Pitrou244651a2009-05-04 18:56:13 +00004687 }
4688 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004689 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004690 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004691 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004692
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004693 Py_XDECREF(errorHandler);
4694 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004695 return _PyUnicodeWriter_Finish(&writer);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004696
Benjamin Peterson29060642009-01-31 22:14:21 +00004697 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004698 Py_XDECREF(errorHandler);
4699 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004700 _PyUnicodeWriter_Dealloc(&writer);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004701 return NULL;
4702}
4703
4704
Alexander Belopolsky40018472011-02-26 01:02:56 +00004705PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004706_PyUnicode_EncodeUTF7(PyObject *str,
4707 int base64SetO,
4708 int base64WhiteSpace,
4709 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004710{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004711 int kind;
4712 void *data;
4713 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004714 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004715 int inShift = 0;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004716 Py_ssize_t i;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004717 unsigned int base64bits = 0;
4718 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004719 char * out;
4720 char * start;
4721
Benjamin Petersonbac79492012-01-14 13:34:47 -05004722 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004723 return NULL;
4724 kind = PyUnicode_KIND(str);
4725 data = PyUnicode_DATA(str);
4726 len = PyUnicode_GET_LENGTH(str);
4727
4728 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004729 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004730
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004731 /* It might be possible to tighten this worst case */
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004732 if (len > PY_SSIZE_T_MAX / 8)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004733 return PyErr_NoMemory();
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004734 v = PyBytes_FromStringAndSize(NULL, len * 8);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004735 if (v == NULL)
4736 return NULL;
4737
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004738 start = out = PyBytes_AS_STRING(v);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004739 for (i = 0; i < len; ++i) {
Victor Stinner0e368262011-11-10 20:12:49 +01004740 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004741
Antoine Pitrou244651a2009-05-04 18:56:13 +00004742 if (inShift) {
4743 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4744 /* shifting out */
4745 if (base64bits) { /* output remaining bits */
4746 *out++ = TO_BASE64(base64buffer << (6-base64bits));
4747 base64buffer = 0;
4748 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004749 }
4750 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004751 /* Characters not in the BASE64 set implicitly unshift the sequence
4752 so no '-' is required, except if the character is itself a '-' */
4753 if (IS_BASE64(ch) || ch == '-') {
4754 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004755 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004756 *out++ = (char) ch;
4757 }
4758 else {
4759 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00004760 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004761 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004762 else { /* not in a shift sequence */
4763 if (ch == '+') {
4764 *out++ = '+';
4765 *out++ = '-';
4766 }
4767 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4768 *out++ = (char) ch;
4769 }
4770 else {
4771 *out++ = '+';
4772 inShift = 1;
4773 goto encode_char;
4774 }
4775 }
4776 continue;
4777encode_char:
Antoine Pitrou244651a2009-05-04 18:56:13 +00004778 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01004779 assert(ch <= MAX_UNICODE);
Victor Stinner0d3721d2011-11-22 03:27:53 +01004780
Antoine Pitrou244651a2009-05-04 18:56:13 +00004781 /* code first surrogate */
4782 base64bits += 16;
Victor Stinner76df43d2012-10-30 01:42:39 +01004783 base64buffer = (base64buffer << 16) | Py_UNICODE_HIGH_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004784 while (base64bits >= 6) {
4785 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4786 base64bits -= 6;
4787 }
4788 /* prepare second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004789 ch = Py_UNICODE_LOW_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004790 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004791 base64bits += 16;
4792 base64buffer = (base64buffer << 16) | ch;
4793 while (base64bits >= 6) {
4794 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4795 base64bits -= 6;
4796 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00004797 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004798 if (base64bits)
4799 *out++= TO_BASE64(base64buffer << (6-base64bits) );
4800 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004801 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004802 if (_PyBytes_Resize(&v, out - start) < 0)
4803 return NULL;
4804 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004805}
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004806PyObject *
4807PyUnicode_EncodeUTF7(const Py_UNICODE *s,
4808 Py_ssize_t size,
4809 int base64SetO,
4810 int base64WhiteSpace,
4811 const char *errors)
4812{
4813 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02004814 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004815 if (tmp == NULL)
4816 return NULL;
Victor Stinner0e368262011-11-10 20:12:49 +01004817 result = _PyUnicode_EncodeUTF7(tmp, base64SetO,
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004818 base64WhiteSpace, errors);
4819 Py_DECREF(tmp);
4820 return result;
4821}
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004822
Antoine Pitrou244651a2009-05-04 18:56:13 +00004823#undef IS_BASE64
4824#undef FROM_BASE64
4825#undef TO_BASE64
4826#undef DECODE_DIRECT
4827#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004828
Guido van Rossumd57fd912000-03-10 22:53:23 +00004829/* --- UTF-8 Codec -------------------------------------------------------- */
4830
Alexander Belopolsky40018472011-02-26 01:02:56 +00004831PyObject *
4832PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004833 Py_ssize_t size,
4834 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004835{
Walter Dörwald69652032004-09-07 20:24:22 +00004836 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4837}
4838
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004839#include "stringlib/asciilib.h"
4840#include "stringlib/codecs.h"
4841#include "stringlib/undef.h"
4842
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004843#include "stringlib/ucs1lib.h"
4844#include "stringlib/codecs.h"
4845#include "stringlib/undef.h"
4846
4847#include "stringlib/ucs2lib.h"
4848#include "stringlib/codecs.h"
4849#include "stringlib/undef.h"
4850
4851#include "stringlib/ucs4lib.h"
4852#include "stringlib/codecs.h"
4853#include "stringlib/undef.h"
4854
Antoine Pitrouab868312009-01-10 15:40:25 +00004855/* Mask to quickly check whether a C 'long' contains a
4856 non-ASCII, UTF8-encoded char. */
4857#if (SIZEOF_LONG == 8)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004858# define ASCII_CHAR_MASK 0x8080808080808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004859#elif (SIZEOF_LONG == 4)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004860# define ASCII_CHAR_MASK 0x80808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004861#else
4862# error C 'long' size should be either 4 or 8!
4863#endif
4864
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004865static Py_ssize_t
4866ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004867{
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004868 const char *p = start;
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004869 const char *aligned_end = (const char *) _Py_ALIGN_DOWN(end, SIZEOF_LONG);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004870
Antoine Pitrou8b0e9842013-05-11 15:58:34 +02004871 /*
4872 * Issue #17237: m68k is a bit different from most architectures in
4873 * that objects do not use "natural alignment" - for example, int and
4874 * long are only aligned at 2-byte boundaries. Therefore the assert()
4875 * won't work; also, tests have shown that skipping the "optimised
4876 * version" will even speed up m68k.
4877 */
4878#if !defined(__m68k__)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004879#if SIZEOF_LONG <= SIZEOF_VOID_P
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004880 assert(_Py_IS_ALIGNED(dest, SIZEOF_LONG));
4881 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004882 /* Fast path, see in STRINGLIB(utf8_decode) for
4883 an explanation. */
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02004884 /* Help allocation */
4885 const char *_p = p;
4886 Py_UCS1 * q = dest;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004887 while (_p < aligned_end) {
4888 unsigned long value = *(const unsigned long *) _p;
4889 if (value & ASCII_CHAR_MASK)
Benjamin Peterson29060642009-01-31 22:14:21 +00004890 break;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004891 *((unsigned long *)q) = value;
4892 _p += SIZEOF_LONG;
4893 q += SIZEOF_LONG;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004894 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004895 p = _p;
4896 while (p < end) {
4897 if ((unsigned char)*p & 0x80)
4898 break;
4899 *q++ = *p++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004900 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004901 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004902 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004903#endif
Antoine Pitrou8b0e9842013-05-11 15:58:34 +02004904#endif
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004905 while (p < end) {
4906 /* Fast path, see in STRINGLIB(utf8_decode) in stringlib/codecs.h
4907 for an explanation. */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004908 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02004909 /* Help allocation */
4910 const char *_p = p;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004911 while (_p < aligned_end) {
Andy Lestere6be9b52020-02-11 20:28:35 -06004912 unsigned long value = *(const unsigned long *) _p;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004913 if (value & ASCII_CHAR_MASK)
4914 break;
4915 _p += SIZEOF_LONG;
4916 }
4917 p = _p;
4918 if (_p == end)
4919 break;
4920 }
4921 if ((unsigned char)*p & 0x80)
4922 break;
4923 ++p;
4924 }
4925 memcpy(dest, start, p - start);
4926 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004927}
Antoine Pitrouab868312009-01-10 15:40:25 +00004928
Victor Stinner709d23d2019-05-02 14:56:30 -04004929static PyObject *
4930unicode_decode_utf8(const char *s, Py_ssize_t size,
4931 _Py_error_handler error_handler, const char *errors,
4932 Py_ssize_t *consumed)
Victor Stinner785938e2011-12-11 20:09:03 +01004933{
Victor Stinner785938e2011-12-11 20:09:03 +01004934 if (size == 0) {
4935 if (consumed)
4936 *consumed = 0;
Serhiy Storchaka678db842013-01-26 12:16:36 +02004937 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner785938e2011-12-11 20:09:03 +01004938 }
4939
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004940 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
4941 if (size == 1 && (unsigned char)s[0] < 128) {
Victor Stinner785938e2011-12-11 20:09:03 +01004942 if (consumed)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004943 *consumed = 1;
4944 return get_latin1_char((unsigned char)s[0]);
Victor Stinner785938e2011-12-11 20:09:03 +01004945 }
4946
Inada Naoki770847a2019-06-24 12:30:24 +09004947 const char *starts = s;
4948 const char *end = s + size;
Victor Stinner785938e2011-12-11 20:09:03 +01004949
Inada Naoki770847a2019-06-24 12:30:24 +09004950 // fast path: try ASCII string.
4951 PyObject *u = PyUnicode_New(size, 127);
4952 if (u == NULL) {
4953 return NULL;
4954 }
4955 s += ascii_decode(s, end, PyUnicode_DATA(u));
4956 if (s == end) {
4957 return u;
4958 }
4959
4960 // Use _PyUnicodeWriter after fast path is failed.
4961 _PyUnicodeWriter writer;
4962 _PyUnicodeWriter_InitWithBuffer(&writer, u);
4963 writer.pos = s - starts;
4964
4965 Py_ssize_t startinpos, endinpos;
4966 const char *errmsg = "";
4967 PyObject *error_handler_obj = NULL;
4968 PyObject *exc = NULL;
4969
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004970 while (s < end) {
4971 Py_UCS4 ch;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004972 int kind = writer.kind;
Victor Stinner1d65d912015-10-05 13:43:50 +02004973
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004974 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004975 if (PyUnicode_IS_ASCII(writer.buffer))
4976 ch = asciilib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004977 else
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004978 ch = ucs1lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004979 } else if (kind == PyUnicode_2BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004980 ch = ucs2lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004981 } else {
4982 assert(kind == PyUnicode_4BYTE_KIND);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004983 ch = ucs4lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004984 }
4985
4986 switch (ch) {
4987 case 0:
4988 if (s == end || consumed)
4989 goto End;
4990 errmsg = "unexpected end of data";
4991 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02004992 endinpos = end - starts;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004993 break;
4994 case 1:
4995 errmsg = "invalid start byte";
4996 startinpos = s - starts;
4997 endinpos = startinpos + 1;
4998 break;
4999 case 2:
Serhiy Storchaka894263b2019-06-25 11:54:18 +03005000 if (consumed && (unsigned char)s[0] == 0xED && end - s == 2
5001 && (unsigned char)s[1] >= 0xA0 && (unsigned char)s[1] <= 0xBF)
5002 {
5003 /* Truncated surrogate code in range D800-DFFF */
Serhiy Storchaka7a465cb2019-03-30 08:23:38 +02005004 goto End;
5005 }
Serhiy Storchaka894263b2019-06-25 11:54:18 +03005006 /* fall through */
5007 case 3:
5008 case 4:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005009 errmsg = "invalid continuation byte";
5010 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02005011 endinpos = startinpos + ch - 1;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005012 break;
5013 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005014 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005015 goto onError;
5016 continue;
5017 }
5018
Victor Stinner1d65d912015-10-05 13:43:50 +02005019 if (error_handler == _Py_ERROR_UNKNOWN)
Victor Stinner3d4226a2018-08-29 22:21:32 +02005020 error_handler = _Py_GetErrorHandler(errors);
Victor Stinner1d65d912015-10-05 13:43:50 +02005021
5022 switch (error_handler) {
5023 case _Py_ERROR_IGNORE:
5024 s += (endinpos - startinpos);
5025 break;
5026
5027 case _Py_ERROR_REPLACE:
5028 if (_PyUnicodeWriter_WriteCharInline(&writer, 0xfffd) < 0)
5029 goto onError;
5030 s += (endinpos - startinpos);
5031 break;
5032
5033 case _Py_ERROR_SURROGATEESCAPE:
Victor Stinner74e8fac2015-10-05 13:49:26 +02005034 {
5035 Py_ssize_t i;
5036
Victor Stinner1d65d912015-10-05 13:43:50 +02005037 if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
5038 goto onError;
Victor Stinner74e8fac2015-10-05 13:49:26 +02005039 for (i=startinpos; i<endinpos; i++) {
Victor Stinner1d65d912015-10-05 13:43:50 +02005040 ch = (Py_UCS4)(unsigned char)(starts[i]);
5041 PyUnicode_WRITE(writer.kind, writer.data, writer.pos,
5042 ch + 0xdc00);
5043 writer.pos++;
5044 }
5045 s += (endinpos - startinpos);
5046 break;
Victor Stinner74e8fac2015-10-05 13:49:26 +02005047 }
Victor Stinner1d65d912015-10-05 13:43:50 +02005048
5049 default:
5050 if (unicode_decode_call_errorhandler_writer(
5051 errors, &error_handler_obj,
5052 "utf-8", errmsg,
5053 &starts, &end, &startinpos, &endinpos, &exc, &s,
5054 &writer))
5055 goto onError;
5056 }
Victor Stinner785938e2011-12-11 20:09:03 +01005057 }
5058
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005059End:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005060 if (consumed)
5061 *consumed = s - starts;
5062
Victor Stinner1d65d912015-10-05 13:43:50 +02005063 Py_XDECREF(error_handler_obj);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005064 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005065 return _PyUnicodeWriter_Finish(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005066
5067onError:
Victor Stinner1d65d912015-10-05 13:43:50 +02005068 Py_XDECREF(error_handler_obj);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005069 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005070 _PyUnicodeWriter_Dealloc(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005071 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01005072}
5073
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005074
Victor Stinner709d23d2019-05-02 14:56:30 -04005075PyObject *
5076PyUnicode_DecodeUTF8Stateful(const char *s,
5077 Py_ssize_t size,
5078 const char *errors,
5079 Py_ssize_t *consumed)
5080{
5081 return unicode_decode_utf8(s, size, _Py_ERROR_UNKNOWN, errors, consumed);
5082}
5083
5084
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005085/* UTF-8 decoder: use surrogateescape error handler if 'surrogateescape' is
5086 non-zero, use strict error handler otherwise.
Victor Stinner0d92c4f2012-11-12 23:32:21 +01005087
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005088 On success, write a pointer to a newly allocated wide character string into
5089 *wstr (use PyMem_RawFree() to free the memory) and write the output length
5090 (in number of wchar_t units) into *wlen (if wlen is set).
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005091
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005092 On memory allocation failure, return -1.
5093
5094 On decoding error (if surrogateescape is zero), return -2. If wlen is
5095 non-NULL, write the start of the illegal byte sequence into *wlen. If reason
5096 is not NULL, write the decoding error message into *reason. */
5097int
5098_Py_DecodeUTF8Ex(const char *s, Py_ssize_t size, wchar_t **wstr, size_t *wlen,
Victor Stinner3d4226a2018-08-29 22:21:32 +02005099 const char **reason, _Py_error_handler errors)
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005100{
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005101 const char *orig_s = s;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005102 const char *e;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005103 wchar_t *unicode;
5104 Py_ssize_t outpos;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005105
Victor Stinner3d4226a2018-08-29 22:21:32 +02005106 int surrogateescape = 0;
5107 int surrogatepass = 0;
5108 switch (errors)
5109 {
5110 case _Py_ERROR_STRICT:
5111 break;
5112 case _Py_ERROR_SURROGATEESCAPE:
5113 surrogateescape = 1;
5114 break;
5115 case _Py_ERROR_SURROGATEPASS:
5116 surrogatepass = 1;
5117 break;
5118 default:
5119 return -3;
5120 }
5121
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005122 /* Note: size will always be longer than the resulting Unicode
5123 character count */
Victor Stinner91106cd2017-12-13 12:29:09 +01005124 if (PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) < (size + 1)) {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005125 return -1;
Victor Stinner91106cd2017-12-13 12:29:09 +01005126 }
5127
Victor Stinner6f8eeee2013-07-07 22:57:45 +02005128 unicode = PyMem_RawMalloc((size + 1) * sizeof(wchar_t));
Victor Stinner91106cd2017-12-13 12:29:09 +01005129 if (!unicode) {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005130 return -1;
Victor Stinner91106cd2017-12-13 12:29:09 +01005131 }
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005132
5133 /* Unpack UTF-8 encoded data */
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005134 e = s + size;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005135 outpos = 0;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005136 while (s < e) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005137 Py_UCS4 ch;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005138#if SIZEOF_WCHAR_T == 4
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005139 ch = ucs4lib_utf8_decode(&s, e, (Py_UCS4 *)unicode, &outpos);
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005140#else
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005141 ch = ucs2lib_utf8_decode(&s, e, (Py_UCS2 *)unicode, &outpos);
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005142#endif
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005143 if (ch > 0xFF) {
5144#if SIZEOF_WCHAR_T == 4
Barry Warsawb2e57942017-09-14 18:13:16 -07005145 Py_UNREACHABLE();
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005146#else
Serhiy Storchakab6266432016-11-12 14:28:06 +02005147 assert(ch > 0xFFFF && ch <= MAX_UNICODE);
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005148 /* write a surrogate pair */
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005149 unicode[outpos++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
5150 unicode[outpos++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
5151#endif
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005152 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005153 else {
Victor Stinner3d4226a2018-08-29 22:21:32 +02005154 if (!ch && s == e) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005155 break;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005156 }
Victor Stinner3d4226a2018-08-29 22:21:32 +02005157
5158 if (surrogateescape) {
5159 unicode[outpos++] = 0xDC00 + (unsigned char)*s++;
5160 }
5161 else {
5162 /* Is it a valid three-byte code? */
5163 if (surrogatepass
5164 && (e - s) >= 3
5165 && (s[0] & 0xf0) == 0xe0
5166 && (s[1] & 0xc0) == 0x80
5167 && (s[2] & 0xc0) == 0x80)
5168 {
5169 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
5170 s += 3;
5171 unicode[outpos++] = ch;
5172 }
5173 else {
5174 PyMem_RawFree(unicode );
5175 if (reason != NULL) {
5176 switch (ch) {
5177 case 0:
5178 *reason = "unexpected end of data";
5179 break;
5180 case 1:
5181 *reason = "invalid start byte";
5182 break;
5183 /* 2, 3, 4 */
5184 default:
5185 *reason = "invalid continuation byte";
5186 break;
5187 }
5188 }
5189 if (wlen != NULL) {
5190 *wlen = s - orig_s;
5191 }
5192 return -2;
5193 }
5194 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005195 }
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005196 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005197 unicode[outpos] = L'\0';
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005198 if (wlen) {
5199 *wlen = outpos;
Victor Stinner91106cd2017-12-13 12:29:09 +01005200 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005201 *wstr = unicode;
5202 return 0;
5203}
5204
Victor Stinner5f9cf232019-03-19 01:46:25 +01005205
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005206wchar_t*
Victor Stinner5f9cf232019-03-19 01:46:25 +01005207_Py_DecodeUTF8_surrogateescape(const char *arg, Py_ssize_t arglen,
5208 size_t *wlen)
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005209{
5210 wchar_t *wstr;
Victor Stinner5f9cf232019-03-19 01:46:25 +01005211 int res = _Py_DecodeUTF8Ex(arg, arglen,
5212 &wstr, wlen,
5213 NULL, _Py_ERROR_SURROGATEESCAPE);
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005214 if (res != 0) {
Victor Stinner5f9cf232019-03-19 01:46:25 +01005215 /* _Py_DecodeUTF8Ex() must support _Py_ERROR_SURROGATEESCAPE */
5216 assert(res != -3);
5217 if (wlen) {
5218 *wlen = (size_t)res;
5219 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005220 return NULL;
5221 }
5222 return wstr;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005223}
5224
Antoine Pitrouab868312009-01-10 15:40:25 +00005225
Victor Stinnere47e6982017-12-21 15:45:16 +01005226/* UTF-8 encoder using the surrogateescape error handler .
5227
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005228 On success, return 0 and write the newly allocated character string (use
5229 PyMem_Free() to free the memory) into *str.
Victor Stinnere47e6982017-12-21 15:45:16 +01005230
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005231 On encoding failure, return -2 and write the position of the invalid
5232 surrogate character into *error_pos (if error_pos is set) and the decoding
5233 error message into *reason (if reason is set).
Victor Stinnere47e6982017-12-21 15:45:16 +01005234
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005235 On memory allocation failure, return -1. */
5236int
5237_Py_EncodeUTF8Ex(const wchar_t *text, char **str, size_t *error_pos,
Victor Stinner3d4226a2018-08-29 22:21:32 +02005238 const char **reason, int raw_malloc, _Py_error_handler errors)
Victor Stinnere47e6982017-12-21 15:45:16 +01005239{
5240 const Py_ssize_t max_char_size = 4;
5241 Py_ssize_t len = wcslen(text);
5242
5243 assert(len >= 0);
5244
Victor Stinner3d4226a2018-08-29 22:21:32 +02005245 int surrogateescape = 0;
5246 int surrogatepass = 0;
5247 switch (errors)
5248 {
5249 case _Py_ERROR_STRICT:
5250 break;
5251 case _Py_ERROR_SURROGATEESCAPE:
5252 surrogateescape = 1;
5253 break;
5254 case _Py_ERROR_SURROGATEPASS:
5255 surrogatepass = 1;
5256 break;
5257 default:
5258 return -3;
5259 }
5260
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005261 if (len > PY_SSIZE_T_MAX / max_char_size - 1) {
5262 return -1;
5263 }
Victor Stinnere47e6982017-12-21 15:45:16 +01005264 char *bytes;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005265 if (raw_malloc) {
5266 bytes = PyMem_RawMalloc((len + 1) * max_char_size);
Victor Stinnere47e6982017-12-21 15:45:16 +01005267 }
5268 else {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005269 bytes = PyMem_Malloc((len + 1) * max_char_size);
Victor Stinnere47e6982017-12-21 15:45:16 +01005270 }
5271 if (bytes == NULL) {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005272 return -1;
Victor Stinnere47e6982017-12-21 15:45:16 +01005273 }
5274
5275 char *p = bytes;
5276 Py_ssize_t i;
Victor Stinner3d4226a2018-08-29 22:21:32 +02005277 for (i = 0; i < len; ) {
5278 Py_ssize_t ch_pos = i;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005279 Py_UCS4 ch = text[i];
Victor Stinner3d4226a2018-08-29 22:21:32 +02005280 i++;
5281#if Py_UNICODE_SIZE == 2
5282 if (Py_UNICODE_IS_HIGH_SURROGATE(ch)
5283 && i < len
5284 && Py_UNICODE_IS_LOW_SURROGATE(text[i]))
5285 {
5286 ch = Py_UNICODE_JOIN_SURROGATES(ch, text[i]);
5287 i++;
5288 }
5289#endif
Victor Stinnere47e6982017-12-21 15:45:16 +01005290
5291 if (ch < 0x80) {
5292 /* Encode ASCII */
5293 *p++ = (char) ch;
5294
5295 }
5296 else if (ch < 0x0800) {
5297 /* Encode Latin-1 */
5298 *p++ = (char)(0xc0 | (ch >> 6));
5299 *p++ = (char)(0x80 | (ch & 0x3f));
5300 }
Victor Stinner3d4226a2018-08-29 22:21:32 +02005301 else if (Py_UNICODE_IS_SURROGATE(ch) && !surrogatepass) {
Victor Stinnere47e6982017-12-21 15:45:16 +01005302 /* surrogateescape error handler */
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005303 if (!surrogateescape || !(0xDC80 <= ch && ch <= 0xDCFF)) {
Victor Stinnere47e6982017-12-21 15:45:16 +01005304 if (error_pos != NULL) {
Victor Stinner3d4226a2018-08-29 22:21:32 +02005305 *error_pos = (size_t)ch_pos;
Victor Stinnere47e6982017-12-21 15:45:16 +01005306 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005307 if (reason != NULL) {
5308 *reason = "encoding error";
5309 }
5310 if (raw_malloc) {
5311 PyMem_RawFree(bytes);
5312 }
5313 else {
5314 PyMem_Free(bytes);
5315 }
5316 return -2;
Victor Stinnere47e6982017-12-21 15:45:16 +01005317 }
5318 *p++ = (char)(ch & 0xff);
5319 }
5320 else if (ch < 0x10000) {
5321 *p++ = (char)(0xe0 | (ch >> 12));
5322 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
5323 *p++ = (char)(0x80 | (ch & 0x3f));
5324 }
5325 else { /* ch >= 0x10000 */
5326 assert(ch <= MAX_UNICODE);
5327 /* Encode UCS4 Unicode ordinals */
5328 *p++ = (char)(0xf0 | (ch >> 18));
5329 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
5330 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
5331 *p++ = (char)(0x80 | (ch & 0x3f));
5332 }
5333 }
5334 *p++ = '\0';
5335
5336 size_t final_size = (p - bytes);
Victor Stinner9dd76202017-12-21 16:20:32 +01005337 char *bytes2;
5338 if (raw_malloc) {
5339 bytes2 = PyMem_RawRealloc(bytes, final_size);
5340 }
5341 else {
5342 bytes2 = PyMem_Realloc(bytes, final_size);
5343 }
Victor Stinnere47e6982017-12-21 15:45:16 +01005344 if (bytes2 == NULL) {
5345 if (error_pos != NULL) {
5346 *error_pos = (size_t)-1;
5347 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005348 if (raw_malloc) {
5349 PyMem_RawFree(bytes);
5350 }
5351 else {
5352 PyMem_Free(bytes);
5353 }
5354 return -1;
Victor Stinnere47e6982017-12-21 15:45:16 +01005355 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005356 *str = bytes2;
5357 return 0;
Victor Stinnere47e6982017-12-21 15:45:16 +01005358}
5359
5360
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005361/* Primary internal function which creates utf8 encoded bytes objects.
5362
5363 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00005364 and allocate exactly as much space needed at the end. Else allocate the
5365 maximum possible needed (4 result bytes per Unicode character), and return
5366 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00005367*/
Victor Stinner709d23d2019-05-02 14:56:30 -04005368static PyObject *
5369unicode_encode_utf8(PyObject *unicode, _Py_error_handler error_handler,
5370 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005371{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005372 if (!PyUnicode_Check(unicode)) {
5373 PyErr_BadArgument();
5374 return NULL;
5375 }
5376
5377 if (PyUnicode_READY(unicode) == -1)
5378 return NULL;
5379
Victor Stinnere90fe6a2011-10-01 16:48:13 +02005380 if (PyUnicode_UTF8(unicode))
5381 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
5382 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005383
Inada Naoki02a4d572020-02-27 13:48:59 +09005384 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
5385 void *data = PyUnicode_DATA(unicode);
5386 Py_ssize_t size = PyUnicode_GET_LENGTH(unicode);
5387
5388 _PyBytesWriter writer;
5389 char *end;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005390
Benjamin Petersonead6b532011-12-20 17:23:42 -06005391 switch (kind) {
Victor Stinner6099a032011-12-18 14:22:26 +01005392 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07005393 Py_UNREACHABLE();
Victor Stinner6099a032011-12-18 14:22:26 +01005394 case PyUnicode_1BYTE_KIND:
5395 /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
5396 assert(!PyUnicode_IS_ASCII(unicode));
Inada Naoki02a4d572020-02-27 13:48:59 +09005397 end = ucs1lib_utf8_encoder(&writer, unicode, data, size, error_handler, errors);
5398 break;
Victor Stinner6099a032011-12-18 14:22:26 +01005399 case PyUnicode_2BYTE_KIND:
Inada Naoki02a4d572020-02-27 13:48:59 +09005400 end = ucs2lib_utf8_encoder(&writer, unicode, data, size, error_handler, errors);
5401 break;
Victor Stinner6099a032011-12-18 14:22:26 +01005402 case PyUnicode_4BYTE_KIND:
Inada Naoki02a4d572020-02-27 13:48:59 +09005403 end = ucs4lib_utf8_encoder(&writer, unicode, data, size, error_handler, errors);
5404 break;
Tim Peters602f7402002-04-27 18:03:26 +00005405 }
Inada Naoki02a4d572020-02-27 13:48:59 +09005406
5407 if (end == NULL) {
5408 _PyBytesWriter_Dealloc(&writer);
5409 return NULL;
5410 }
5411 return _PyBytesWriter_Finish(&writer, end);
5412}
5413
5414static int
5415unicode_fill_utf8(PyObject *unicode)
5416{
5417 /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
5418 assert(!PyUnicode_IS_ASCII(unicode));
5419
5420 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
5421 void *data = PyUnicode_DATA(unicode);
5422 Py_ssize_t size = PyUnicode_GET_LENGTH(unicode);
5423
5424 _PyBytesWriter writer;
5425 char *end;
5426
5427 switch (kind) {
5428 default:
5429 Py_UNREACHABLE();
5430 case PyUnicode_1BYTE_KIND:
5431 end = ucs1lib_utf8_encoder(&writer, unicode, data, size,
5432 _Py_ERROR_STRICT, NULL);
5433 break;
5434 case PyUnicode_2BYTE_KIND:
5435 end = ucs2lib_utf8_encoder(&writer, unicode, data, size,
5436 _Py_ERROR_STRICT, NULL);
5437 break;
5438 case PyUnicode_4BYTE_KIND:
5439 end = ucs4lib_utf8_encoder(&writer, unicode, data, size,
5440 _Py_ERROR_STRICT, NULL);
5441 break;
5442 }
5443 if (end == NULL) {
5444 _PyBytesWriter_Dealloc(&writer);
5445 return -1;
5446 }
5447
5448 char *start = writer.use_small_buffer ? writer.small_buffer :
5449 PyBytes_AS_STRING(writer.buffer);
5450 Py_ssize_t len = end - start;
5451
5452 char *cache = PyObject_MALLOC(len + 1);
5453 if (cache == NULL) {
5454 _PyBytesWriter_Dealloc(&writer);
5455 PyErr_NoMemory();
5456 return -1;
5457 }
5458 _PyUnicode_UTF8(unicode) = cache;
5459 _PyUnicode_UTF8_LENGTH(unicode) = len;
5460 memcpy(cache, start, len);
5461 cache[len] = '\0';
5462 _PyBytesWriter_Dealloc(&writer);
5463 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005464}
5465
Alexander Belopolsky40018472011-02-26 01:02:56 +00005466PyObject *
Victor Stinner709d23d2019-05-02 14:56:30 -04005467_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
5468{
5469 return unicode_encode_utf8(unicode, _Py_ERROR_UNKNOWN, errors);
5470}
5471
5472
5473PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005474PyUnicode_EncodeUTF8(const Py_UNICODE *s,
5475 Py_ssize_t size,
5476 const char *errors)
5477{
5478 PyObject *v, *unicode;
5479
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02005480 unicode = PyUnicode_FromWideChar(s, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005481 if (unicode == NULL)
5482 return NULL;
5483 v = _PyUnicode_AsUTF8String(unicode, errors);
5484 Py_DECREF(unicode);
5485 return v;
5486}
5487
5488PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005489PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005490{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005491 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005492}
5493
Walter Dörwald41980ca2007-08-16 21:55:45 +00005494/* --- UTF-32 Codec ------------------------------------------------------- */
5495
5496PyObject *
5497PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005498 Py_ssize_t size,
5499 const char *errors,
5500 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005501{
5502 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
5503}
5504
5505PyObject *
5506PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005507 Py_ssize_t size,
5508 const char *errors,
5509 int *byteorder,
5510 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005511{
5512 const char *starts = s;
5513 Py_ssize_t startinpos;
5514 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005515 _PyUnicodeWriter writer;
Mark Dickinson7db923c2010-06-12 09:10:14 +00005516 const unsigned char *q, *e;
Victor Stinnere64322e2012-10-30 23:12:47 +01005517 int le, bo = 0; /* assume native ordering by default */
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005518 const char *encoding;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005519 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00005520 PyObject *errorHandler = NULL;
5521 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00005522
Andy Lestere6be9b52020-02-11 20:28:35 -06005523 q = (const unsigned char *)s;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005524 e = q + size;
5525
5526 if (byteorder)
5527 bo = *byteorder;
5528
5529 /* Check for BOM marks (U+FEFF) in the input and adjust current
5530 byte order setting accordingly. In native mode, the leading BOM
5531 mark is skipped, in all other modes, it is copied to the output
5532 stream as-is (giving a ZWNBSP character). */
Victor Stinnere64322e2012-10-30 23:12:47 +01005533 if (bo == 0 && size >= 4) {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005534 Py_UCS4 bom = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
Victor Stinnere64322e2012-10-30 23:12:47 +01005535 if (bom == 0x0000FEFF) {
5536 bo = -1;
5537 q += 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00005538 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005539 else if (bom == 0xFFFE0000) {
5540 bo = 1;
5541 q += 4;
5542 }
5543 if (byteorder)
5544 *byteorder = bo;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005545 }
5546
Victor Stinnere64322e2012-10-30 23:12:47 +01005547 if (q == e) {
5548 if (consumed)
5549 *consumed = size;
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02005550 _Py_RETURN_UNICODE_EMPTY();
Walter Dörwald41980ca2007-08-16 21:55:45 +00005551 }
5552
Victor Stinnere64322e2012-10-30 23:12:47 +01005553#ifdef WORDS_BIGENDIAN
5554 le = bo < 0;
5555#else
5556 le = bo <= 0;
5557#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005558 encoding = le ? "utf-32-le" : "utf-32-be";
Victor Stinnere64322e2012-10-30 23:12:47 +01005559
Victor Stinner8f674cc2013-04-17 23:02:17 +02005560 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02005561 writer.min_length = (e - q + 3) / 4;
5562 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005563 goto onError;
Victor Stinnere64322e2012-10-30 23:12:47 +01005564
Victor Stinnere64322e2012-10-30 23:12:47 +01005565 while (1) {
5566 Py_UCS4 ch = 0;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005567 Py_UCS4 maxch = PyUnicode_MAX_CHAR_VALUE(writer.buffer);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005568
Victor Stinnere64322e2012-10-30 23:12:47 +01005569 if (e - q >= 4) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005570 enum PyUnicode_Kind kind = writer.kind;
5571 void *data = writer.data;
Victor Stinnere64322e2012-10-30 23:12:47 +01005572 const unsigned char *last = e - 4;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005573 Py_ssize_t pos = writer.pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01005574 if (le) {
5575 do {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005576 ch = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
Victor Stinnere64322e2012-10-30 23:12:47 +01005577 if (ch > maxch)
5578 break;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005579 if (kind != PyUnicode_1BYTE_KIND &&
5580 Py_UNICODE_IS_SURROGATE(ch))
5581 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005582 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01005583 q += 4;
5584 } while (q <= last);
5585 }
5586 else {
5587 do {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005588 ch = ((unsigned int)q[0] << 24) | (q[1] << 16) | (q[2] << 8) | q[3];
Victor Stinnere64322e2012-10-30 23:12:47 +01005589 if (ch > maxch)
5590 break;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005591 if (kind != PyUnicode_1BYTE_KIND &&
5592 Py_UNICODE_IS_SURROGATE(ch))
5593 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005594 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01005595 q += 4;
5596 } while (q <= last);
5597 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005598 writer.pos = pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01005599 }
5600
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005601 if (Py_UNICODE_IS_SURROGATE(ch)) {
Serhiy Storchakad3faf432015-01-18 11:28:37 +02005602 errmsg = "code point in surrogate code point range(0xd800, 0xe000)";
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005603 startinpos = ((const char *)q) - starts;
5604 endinpos = startinpos + 4;
5605 }
5606 else if (ch <= maxch) {
Victor Stinnere64322e2012-10-30 23:12:47 +01005607 if (q == e || consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005608 break;
Victor Stinnere64322e2012-10-30 23:12:47 +01005609 /* remaining bytes at the end? (size should be divisible by 4) */
Benjamin Peterson29060642009-01-31 22:14:21 +00005610 errmsg = "truncated data";
Victor Stinnere64322e2012-10-30 23:12:47 +01005611 startinpos = ((const char *)q) - starts;
5612 endinpos = ((const char *)e) - starts;
Benjamin Peterson29060642009-01-31 22:14:21 +00005613 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005614 else {
5615 if (ch < 0x110000) {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005616 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinnere64322e2012-10-30 23:12:47 +01005617 goto onError;
5618 q += 4;
5619 continue;
5620 }
Serhiy Storchakad3faf432015-01-18 11:28:37 +02005621 errmsg = "code point not in range(0x110000)";
Victor Stinnere64322e2012-10-30 23:12:47 +01005622 startinpos = ((const char *)q) - starts;
5623 endinpos = startinpos + 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00005624 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005625
5626 /* The remaining input chars are ignored if the callback
5627 chooses to skip the input */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005628 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00005629 errors, &errorHandler,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005630 encoding, errmsg,
Benjamin Peterson29060642009-01-31 22:14:21 +00005631 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005632 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005633 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005634 }
5635
Walter Dörwald41980ca2007-08-16 21:55:45 +00005636 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005637 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005638
Walter Dörwald41980ca2007-08-16 21:55:45 +00005639 Py_XDECREF(errorHandler);
5640 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005641 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005642
Benjamin Peterson29060642009-01-31 22:14:21 +00005643 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005644 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005645 Py_XDECREF(errorHandler);
5646 Py_XDECREF(exc);
5647 return NULL;
5648}
5649
5650PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005651_PyUnicode_EncodeUTF32(PyObject *str,
5652 const char *errors,
5653 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005654{
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005655 enum PyUnicode_Kind kind;
5656 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005657 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005658 PyObject *v;
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005659 uint32_t *out;
Christian Heimes743e0cd2012-10-17 23:52:17 +02005660#if PY_LITTLE_ENDIAN
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005661 int native_ordering = byteorder <= 0;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005662#else
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005663 int native_ordering = byteorder >= 0;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005664#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005665 const char *encoding;
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005666 Py_ssize_t nsize, pos;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005667 PyObject *errorHandler = NULL;
5668 PyObject *exc = NULL;
5669 PyObject *rep = NULL;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005670
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005671 if (!PyUnicode_Check(str)) {
5672 PyErr_BadArgument();
5673 return NULL;
5674 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005675 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005676 return NULL;
5677 kind = PyUnicode_KIND(str);
5678 data = PyUnicode_DATA(str);
5679 len = PyUnicode_GET_LENGTH(str);
5680
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005681 if (len > PY_SSIZE_T_MAX / 4 - (byteorder == 0))
Serhiy Storchaka30793282014-01-04 22:44:01 +02005682 return PyErr_NoMemory();
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005683 nsize = len + (byteorder == 0);
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01005684 v = PyBytes_FromStringAndSize(NULL, nsize * 4);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005685 if (v == NULL)
5686 return NULL;
5687
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005688 /* output buffer is 4-bytes aligned */
5689 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 4));
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005690 out = (uint32_t *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005691 if (byteorder == 0)
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005692 *out++ = 0xFEFF;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005693 if (len == 0)
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005694 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005695
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005696 if (byteorder == -1)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005697 encoding = "utf-32-le";
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005698 else if (byteorder == 1)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005699 encoding = "utf-32-be";
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005700 else
5701 encoding = "utf-32";
5702
5703 if (kind == PyUnicode_1BYTE_KIND) {
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005704 ucs1lib_utf32_encode((const Py_UCS1 *)data, len, &out, native_ordering);
5705 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005706 }
5707
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005708 pos = 0;
5709 while (pos < len) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005710 Py_ssize_t repsize, moreunits;
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005711
5712 if (kind == PyUnicode_2BYTE_KIND) {
5713 pos += ucs2lib_utf32_encode((const Py_UCS2 *)data + pos, len - pos,
5714 &out, native_ordering);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005715 }
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005716 else {
5717 assert(kind == PyUnicode_4BYTE_KIND);
5718 pos += ucs4lib_utf32_encode((const Py_UCS4 *)data + pos, len - pos,
5719 &out, native_ordering);
5720 }
5721 if (pos == len)
5722 break;
Guido van Rossum98297ee2007-11-06 21:34:58 +00005723
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005724 rep = unicode_encode_call_errorhandler(
5725 errors, &errorHandler,
5726 encoding, "surrogates not allowed",
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005727 str, &exc, pos, pos + 1, &pos);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005728 if (!rep)
5729 goto error;
5730
5731 if (PyBytes_Check(rep)) {
5732 repsize = PyBytes_GET_SIZE(rep);
5733 if (repsize & 3) {
5734 raise_encode_exception(&exc, encoding,
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005735 str, pos - 1, pos,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005736 "surrogates not allowed");
5737 goto error;
5738 }
5739 moreunits = repsize / 4;
5740 }
5741 else {
5742 assert(PyUnicode_Check(rep));
5743 if (PyUnicode_READY(rep) < 0)
5744 goto error;
5745 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5746 if (!PyUnicode_IS_ASCII(rep)) {
5747 raise_encode_exception(&exc, encoding,
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005748 str, pos - 1, pos,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005749 "surrogates not allowed");
5750 goto error;
5751 }
5752 }
5753
5754 /* four bytes are reserved for each surrogate */
5755 if (moreunits > 1) {
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005756 Py_ssize_t outpos = out - (uint32_t*) PyBytes_AS_STRING(v);
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03005757 if (moreunits >= (PY_SSIZE_T_MAX - PyBytes_GET_SIZE(v)) / 4) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005758 /* integer overflow */
5759 PyErr_NoMemory();
5760 goto error;
5761 }
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03005762 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + 4 * (moreunits - 1)) < 0)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005763 goto error;
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005764 out = (uint32_t*) PyBytes_AS_STRING(v) + outpos;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005765 }
5766
5767 if (PyBytes_Check(rep)) {
Christian Heimesf051e432016-09-13 20:22:02 +02005768 memcpy(out, PyBytes_AS_STRING(rep), repsize);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005769 out += moreunits;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005770 } else /* rep is unicode */ {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005771 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005772 ucs1lib_utf32_encode(PyUnicode_1BYTE_DATA(rep), repsize,
5773 &out, native_ordering);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005774 }
5775
5776 Py_CLEAR(rep);
5777 }
5778
5779 /* Cut back to size actually needed. This is necessary for, for example,
5780 encoding of a string containing isolated surrogates and the 'ignore'
5781 handler is used. */
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005782 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005783 if (nsize != PyBytes_GET_SIZE(v))
5784 _PyBytes_Resize(&v, nsize);
5785 Py_XDECREF(errorHandler);
5786 Py_XDECREF(exc);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005787 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005788 return v;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005789 error:
5790 Py_XDECREF(rep);
5791 Py_XDECREF(errorHandler);
5792 Py_XDECREF(exc);
5793 Py_XDECREF(v);
5794 return NULL;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005795}
5796
Alexander Belopolsky40018472011-02-26 01:02:56 +00005797PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005798PyUnicode_EncodeUTF32(const Py_UNICODE *s,
5799 Py_ssize_t size,
5800 const char *errors,
5801 int byteorder)
5802{
5803 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02005804 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005805 if (tmp == NULL)
5806 return NULL;
5807 result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder);
5808 Py_DECREF(tmp);
5809 return result;
5810}
5811
5812PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005813PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005814{
Victor Stinnerb960b342011-11-20 19:12:52 +01005815 return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005816}
5817
Guido van Rossumd57fd912000-03-10 22:53:23 +00005818/* --- UTF-16 Codec ------------------------------------------------------- */
5819
Tim Peters772747b2001-08-09 22:21:55 +00005820PyObject *
5821PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005822 Py_ssize_t size,
5823 const char *errors,
5824 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005825{
Walter Dörwald69652032004-09-07 20:24:22 +00005826 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5827}
5828
5829PyObject *
5830PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005831 Py_ssize_t size,
5832 const char *errors,
5833 int *byteorder,
5834 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00005835{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005836 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005837 Py_ssize_t startinpos;
5838 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005839 _PyUnicodeWriter writer;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005840 const unsigned char *q, *e;
Tim Peters772747b2001-08-09 22:21:55 +00005841 int bo = 0; /* assume native ordering by default */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005842 int native_ordering;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00005843 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005844 PyObject *errorHandler = NULL;
5845 PyObject *exc = NULL;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005846 const char *encoding;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005847
Andy Lestere6be9b52020-02-11 20:28:35 -06005848 q = (const unsigned char *)s;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005849 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005850
5851 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00005852 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005853
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005854 /* Check for BOM marks (U+FEFF) in the input and adjust current
5855 byte order setting accordingly. In native mode, the leading BOM
5856 mark is skipped, in all other modes, it is copied to the output
5857 stream as-is (giving a ZWNBSP character). */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005858 if (bo == 0 && size >= 2) {
5859 const Py_UCS4 bom = (q[1] << 8) | q[0];
5860 if (bom == 0xFEFF) {
5861 q += 2;
5862 bo = -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00005863 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005864 else if (bom == 0xFFFE) {
5865 q += 2;
5866 bo = 1;
5867 }
5868 if (byteorder)
5869 *byteorder = bo;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005870 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005871
Antoine Pitrou63065d72012-05-15 23:48:04 +02005872 if (q == e) {
5873 if (consumed)
5874 *consumed = size;
Serhiy Storchaka678db842013-01-26 12:16:36 +02005875 _Py_RETURN_UNICODE_EMPTY();
Tim Peters772747b2001-08-09 22:21:55 +00005876 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005877
Christian Heimes743e0cd2012-10-17 23:52:17 +02005878#if PY_LITTLE_ENDIAN
Antoine Pitrou63065d72012-05-15 23:48:04 +02005879 native_ordering = bo <= 0;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005880 encoding = bo <= 0 ? "utf-16-le" : "utf-16-be";
Antoine Pitrouab868312009-01-10 15:40:25 +00005881#else
Antoine Pitrou63065d72012-05-15 23:48:04 +02005882 native_ordering = bo >= 0;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005883 encoding = bo >= 0 ? "utf-16-be" : "utf-16-le";
Antoine Pitrouab868312009-01-10 15:40:25 +00005884#endif
Tim Peters772747b2001-08-09 22:21:55 +00005885
Antoine Pitrou63065d72012-05-15 23:48:04 +02005886 /* Note: size will always be longer than the resulting Unicode
Xiang Zhang2c7fd462018-01-31 20:48:05 +08005887 character count normally. Error handler will take care of
5888 resizing when needed. */
Victor Stinner8f674cc2013-04-17 23:02:17 +02005889 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02005890 writer.min_length = (e - q + 1) / 2;
5891 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005892 goto onError;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005893
Antoine Pitrou63065d72012-05-15 23:48:04 +02005894 while (1) {
5895 Py_UCS4 ch = 0;
5896 if (e - q >= 2) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005897 int kind = writer.kind;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005898 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005899 if (PyUnicode_IS_ASCII(writer.buffer))
Antoine Pitrou63065d72012-05-15 23:48:04 +02005900 ch = asciilib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005901 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005902 native_ordering);
5903 else
5904 ch = ucs1lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005905 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005906 native_ordering);
5907 } else if (kind == PyUnicode_2BYTE_KIND) {
5908 ch = ucs2lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005909 (Py_UCS2*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005910 native_ordering);
5911 } else {
5912 assert(kind == PyUnicode_4BYTE_KIND);
5913 ch = ucs4lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005914 (Py_UCS4*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005915 native_ordering);
Antoine Pitrouab868312009-01-10 15:40:25 +00005916 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005917 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005918
Antoine Pitrou63065d72012-05-15 23:48:04 +02005919 switch (ch)
5920 {
5921 case 0:
5922 /* remaining byte at the end? (size should be even) */
5923 if (q == e || consumed)
5924 goto End;
5925 errmsg = "truncated data";
5926 startinpos = ((const char *)q) - starts;
5927 endinpos = ((const char *)e) - starts;
5928 break;
5929 /* The remaining input chars are ignored if the callback
5930 chooses to skip the input */
5931 case 1:
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02005932 q -= 2;
5933 if (consumed)
Serhiy Storchakaae3b32a2013-01-08 23:40:52 +02005934 goto End;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005935 errmsg = "unexpected end of data";
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02005936 startinpos = ((const char *)q) - starts;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005937 endinpos = ((const char *)e) - starts;
5938 break;
5939 case 2:
5940 errmsg = "illegal encoding";
5941 startinpos = ((const char *)q) - 2 - starts;
5942 endinpos = startinpos + 2;
5943 break;
5944 case 3:
5945 errmsg = "illegal UTF-16 surrogate";
5946 startinpos = ((const char *)q) - 4 - starts;
5947 endinpos = startinpos + 2;
5948 break;
5949 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005950 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005951 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005952 continue;
5953 }
5954
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005955 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrouab868312009-01-10 15:40:25 +00005956 errors,
5957 &errorHandler,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005958 encoding, errmsg,
Antoine Pitrouab868312009-01-10 15:40:25 +00005959 &starts,
5960 (const char **)&e,
5961 &startinpos,
5962 &endinpos,
5963 &exc,
5964 (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005965 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005966 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005967 }
5968
Antoine Pitrou63065d72012-05-15 23:48:04 +02005969End:
Walter Dörwald69652032004-09-07 20:24:22 +00005970 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005971 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00005972
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005973 Py_XDECREF(errorHandler);
5974 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005975 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005976
Benjamin Peterson29060642009-01-31 22:14:21 +00005977 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005978 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005979 Py_XDECREF(errorHandler);
5980 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005981 return NULL;
5982}
5983
Tim Peters772747b2001-08-09 22:21:55 +00005984PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005985_PyUnicode_EncodeUTF16(PyObject *str,
5986 const char *errors,
5987 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005988{
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005989 enum PyUnicode_Kind kind;
5990 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005991 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005992 PyObject *v;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005993 unsigned short *out;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005994 Py_ssize_t pairs;
Christian Heimes743e0cd2012-10-17 23:52:17 +02005995#if PY_BIG_ENDIAN
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005996 int native_ordering = byteorder >= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005997#else
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005998 int native_ordering = byteorder <= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005999#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006000 const char *encoding;
6001 Py_ssize_t nsize, pos;
6002 PyObject *errorHandler = NULL;
6003 PyObject *exc = NULL;
6004 PyObject *rep = NULL;
Tim Peters772747b2001-08-09 22:21:55 +00006005
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006006 if (!PyUnicode_Check(str)) {
6007 PyErr_BadArgument();
6008 return NULL;
6009 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05006010 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006011 return NULL;
6012 kind = PyUnicode_KIND(str);
6013 data = PyUnicode_DATA(str);
6014 len = PyUnicode_GET_LENGTH(str);
Victor Stinner0e368262011-11-10 20:12:49 +01006015
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006016 pairs = 0;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006017 if (kind == PyUnicode_4BYTE_KIND) {
6018 const Py_UCS4 *in = (const Py_UCS4 *)data;
6019 const Py_UCS4 *end = in + len;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006020 while (in < end) {
6021 if (*in++ >= 0x10000) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006022 pairs++;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006023 }
6024 }
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006025 }
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006026 if (len > PY_SSIZE_T_MAX / 2 - pairs - (byteorder == 0)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006027 return PyErr_NoMemory();
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006028 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006029 nsize = len + pairs + (byteorder == 0);
6030 v = PyBytes_FromStringAndSize(NULL, nsize * 2);
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006031 if (v == NULL) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006032 return NULL;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006033 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006034
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006035 /* output buffer is 2-bytes aligned */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02006036 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 2));
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006037 out = (unsigned short *)PyBytes_AS_STRING(v);
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006038 if (byteorder == 0) {
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006039 *out++ = 0xFEFF;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006040 }
6041 if (len == 0) {
Guido van Rossum98297ee2007-11-06 21:34:58 +00006042 goto done;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006043 }
Tim Peters772747b2001-08-09 22:21:55 +00006044
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006045 if (kind == PyUnicode_1BYTE_KIND) {
6046 ucs1lib_utf16_encode((const Py_UCS1 *)data, len, &out, native_ordering);
6047 goto done;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00006048 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006049
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006050 if (byteorder < 0) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006051 encoding = "utf-16-le";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006052 }
6053 else if (byteorder > 0) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006054 encoding = "utf-16-be";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006055 }
6056 else {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006057 encoding = "utf-16";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006058 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006059
6060 pos = 0;
6061 while (pos < len) {
6062 Py_ssize_t repsize, moreunits;
6063
6064 if (kind == PyUnicode_2BYTE_KIND) {
6065 pos += ucs2lib_utf16_encode((const Py_UCS2 *)data + pos, len - pos,
6066 &out, native_ordering);
6067 }
6068 else {
6069 assert(kind == PyUnicode_4BYTE_KIND);
6070 pos += ucs4lib_utf16_encode((const Py_UCS4 *)data + pos, len - pos,
6071 &out, native_ordering);
6072 }
6073 if (pos == len)
6074 break;
6075
6076 rep = unicode_encode_call_errorhandler(
6077 errors, &errorHandler,
6078 encoding, "surrogates not allowed",
6079 str, &exc, pos, pos + 1, &pos);
6080 if (!rep)
6081 goto error;
6082
6083 if (PyBytes_Check(rep)) {
6084 repsize = PyBytes_GET_SIZE(rep);
6085 if (repsize & 1) {
6086 raise_encode_exception(&exc, encoding,
6087 str, pos - 1, pos,
6088 "surrogates not allowed");
6089 goto error;
6090 }
6091 moreunits = repsize / 2;
6092 }
6093 else {
6094 assert(PyUnicode_Check(rep));
6095 if (PyUnicode_READY(rep) < 0)
6096 goto error;
6097 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
6098 if (!PyUnicode_IS_ASCII(rep)) {
6099 raise_encode_exception(&exc, encoding,
6100 str, pos - 1, pos,
6101 "surrogates not allowed");
6102 goto error;
6103 }
6104 }
6105
6106 /* two bytes are reserved for each surrogate */
6107 if (moreunits > 1) {
6108 Py_ssize_t outpos = out - (unsigned short*) PyBytes_AS_STRING(v);
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03006109 if (moreunits >= (PY_SSIZE_T_MAX - PyBytes_GET_SIZE(v)) / 2) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006110 /* integer overflow */
6111 PyErr_NoMemory();
6112 goto error;
6113 }
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03006114 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + 2 * (moreunits - 1)) < 0)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006115 goto error;
6116 out = (unsigned short*) PyBytes_AS_STRING(v) + outpos;
6117 }
6118
6119 if (PyBytes_Check(rep)) {
Christian Heimesf051e432016-09-13 20:22:02 +02006120 memcpy(out, PyBytes_AS_STRING(rep), repsize);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006121 out += moreunits;
6122 } else /* rep is unicode */ {
6123 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
6124 ucs1lib_utf16_encode(PyUnicode_1BYTE_DATA(rep), repsize,
6125 &out, native_ordering);
6126 }
6127
6128 Py_CLEAR(rep);
6129 }
6130
6131 /* Cut back to size actually needed. This is necessary for, for example,
6132 encoding of a string containing isolated surrogates and the 'ignore' handler
6133 is used. */
6134 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
6135 if (nsize != PyBytes_GET_SIZE(v))
6136 _PyBytes_Resize(&v, nsize);
6137 Py_XDECREF(errorHandler);
6138 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00006139 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006140 return v;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006141 error:
6142 Py_XDECREF(rep);
6143 Py_XDECREF(errorHandler);
6144 Py_XDECREF(exc);
6145 Py_XDECREF(v);
6146 return NULL;
6147#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00006148}
6149
Alexander Belopolsky40018472011-02-26 01:02:56 +00006150PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006151PyUnicode_EncodeUTF16(const Py_UNICODE *s,
6152 Py_ssize_t size,
6153 const char *errors,
6154 int byteorder)
6155{
6156 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02006157 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006158 if (tmp == NULL)
6159 return NULL;
6160 result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder);
6161 Py_DECREF(tmp);
6162 return result;
6163}
6164
6165PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00006166PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006167{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006168 return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006169}
6170
6171/* --- Unicode Escape Codec ----------------------------------------------- */
6172
/* File-local pointer, NULL until something assigns it.
   NOTE(review): presumably the character-name lookup C API used when
   decoding \N{...} escapes, resolved lazily on first use -- confirm against
   the code that sets it. */
static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00006174
Alexander Belopolsky40018472011-02-26 01:02:56 +00006175PyObject *
Eric V. Smith42454af2016-10-31 09:22:08 -04006176_PyUnicode_DecodeUnicodeEscape(const char *s,
6177 Py_ssize_t size,
6178 const char *errors,
6179 const char **first_invalid_escape)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006180{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006181 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006182 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006183 const char *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006184 PyObject *errorHandler = NULL;
6185 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006186
Eric V. Smith42454af2016-10-31 09:22:08 -04006187 // so we can remember if we've seen an invalid escape char or not
6188 *first_invalid_escape = NULL;
6189
Victor Stinner62ec3312016-09-06 17:04:34 -07006190 if (size == 0) {
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02006191 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner62ec3312016-09-06 17:04:34 -07006192 }
6193 /* Escaped strings will always be longer than the resulting
6194 Unicode string, so we start with size here and then reduce the
6195 length after conversion to the true value.
6196 (but if the error callback returns a long replacement string
6197 we'll have to allocate more space) */
Victor Stinner8f674cc2013-04-17 23:02:17 +02006198 _PyUnicodeWriter_Init(&writer);
Victor Stinner62ec3312016-09-06 17:04:34 -07006199 writer.min_length = size;
6200 if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
6201 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006202 }
6203
Guido van Rossumd57fd912000-03-10 22:53:23 +00006204 end = s + size;
6205 while (s < end) {
Victor Stinner62ec3312016-09-06 17:04:34 -07006206 unsigned char c = (unsigned char) *s++;
6207 Py_UCS4 ch;
6208 int count;
6209 Py_ssize_t startinpos;
6210 Py_ssize_t endinpos;
6211 const char *message;
6212
6213#define WRITE_ASCII_CHAR(ch) \
6214 do { \
6215 assert(ch <= 127); \
6216 assert(writer.pos < writer.size); \
6217 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6218 } while(0)
6219
6220#define WRITE_CHAR(ch) \
6221 do { \
6222 if (ch <= writer.maxchar) { \
6223 assert(writer.pos < writer.size); \
6224 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6225 } \
6226 else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
6227 goto onError; \
6228 } \
6229 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006230
6231 /* Non-escape characters are interpreted as Unicode ordinals */
Victor Stinner62ec3312016-09-06 17:04:34 -07006232 if (c != '\\') {
6233 WRITE_CHAR(c);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006234 continue;
6235 }
6236
Victor Stinner62ec3312016-09-06 17:04:34 -07006237 startinpos = s - starts - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006238 /* \ - Escapes */
Victor Stinner62ec3312016-09-06 17:04:34 -07006239 if (s >= end) {
6240 message = "\\ at end of string";
6241 goto error;
6242 }
6243 c = (unsigned char) *s++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006244
Victor Stinner62ec3312016-09-06 17:04:34 -07006245 assert(writer.pos < writer.size);
Guido van Rossum8ce8a782007-11-01 19:42:39 +00006246 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006247
Benjamin Peterson29060642009-01-31 22:14:21 +00006248 /* \x escapes */
Victor Stinner62ec3312016-09-06 17:04:34 -07006249 case '\n': continue;
6250 case '\\': WRITE_ASCII_CHAR('\\'); continue;
6251 case '\'': WRITE_ASCII_CHAR('\''); continue;
6252 case '\"': WRITE_ASCII_CHAR('\"'); continue;
6253 case 'b': WRITE_ASCII_CHAR('\b'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006254 /* FF */
Victor Stinner62ec3312016-09-06 17:04:34 -07006255 case 'f': WRITE_ASCII_CHAR('\014'); continue;
6256 case 't': WRITE_ASCII_CHAR('\t'); continue;
6257 case 'n': WRITE_ASCII_CHAR('\n'); continue;
6258 case 'r': WRITE_ASCII_CHAR('\r'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006259 /* VT */
Victor Stinner62ec3312016-09-06 17:04:34 -07006260 case 'v': WRITE_ASCII_CHAR('\013'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006261 /* BEL, not classic C */
Victor Stinner62ec3312016-09-06 17:04:34 -07006262 case 'a': WRITE_ASCII_CHAR('\007'); continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006263
Benjamin Peterson29060642009-01-31 22:14:21 +00006264 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006265 case '0': case '1': case '2': case '3':
6266 case '4': case '5': case '6': case '7':
Victor Stinner62ec3312016-09-06 17:04:34 -07006267 ch = c - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00006268 if (s < end && '0' <= *s && *s <= '7') {
Victor Stinner62ec3312016-09-06 17:04:34 -07006269 ch = (ch<<3) + *s++ - '0';
6270 if (s < end && '0' <= *s && *s <= '7') {
6271 ch = (ch<<3) + *s++ - '0';
6272 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006273 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006274 WRITE_CHAR(ch);
6275 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006276
Benjamin Peterson29060642009-01-31 22:14:21 +00006277 /* hex escapes */
6278 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006279 case 'x':
Victor Stinner62ec3312016-09-06 17:04:34 -07006280 count = 2;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006281 message = "truncated \\xXX escape";
6282 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006283
Benjamin Peterson29060642009-01-31 22:14:21 +00006284 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006285 case 'u':
Victor Stinner62ec3312016-09-06 17:04:34 -07006286 count = 4;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006287 message = "truncated \\uXXXX escape";
6288 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006289
Benjamin Peterson29060642009-01-31 22:14:21 +00006290 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00006291 case 'U':
Victor Stinner62ec3312016-09-06 17:04:34 -07006292 count = 8;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006293 message = "truncated \\UXXXXXXXX escape";
6294 hexescape:
Victor Stinner62ec3312016-09-06 17:04:34 -07006295 for (ch = 0; count && s < end; ++s, --count) {
Serhiy Storchakad6793772013-01-29 10:20:44 +02006296 c = (unsigned char)*s;
Victor Stinner62ec3312016-09-06 17:04:34 -07006297 ch <<= 4;
6298 if (c >= '0' && c <= '9') {
6299 ch += c - '0';
6300 }
6301 else if (c >= 'a' && c <= 'f') {
6302 ch += c - ('a' - 10);
6303 }
6304 else if (c >= 'A' && c <= 'F') {
6305 ch += c - ('A' - 10);
6306 }
6307 else {
6308 break;
6309 }
Fredrik Lundhdf846752000-09-03 11:29:49 +00006310 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006311 if (count) {
Serhiy Storchakad6793772013-01-29 10:20:44 +02006312 goto error;
Victor Stinner62ec3312016-09-06 17:04:34 -07006313 }
6314
6315 /* when we get here, ch is a 32-bit unicode character */
6316 if (ch > MAX_UNICODE) {
6317 message = "illegal Unicode character";
6318 goto error;
6319 }
6320
6321 WRITE_CHAR(ch);
6322 continue;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006323
Benjamin Peterson29060642009-01-31 22:14:21 +00006324 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00006325 case 'N':
Fredrik Lundhccc74732001-02-18 22:13:49 +00006326 if (ucnhash_CAPI == NULL) {
6327 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006328 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
6329 PyUnicodeData_CAPSULE_NAME, 1);
Victor Stinner62ec3312016-09-06 17:04:34 -07006330 if (ucnhash_CAPI == NULL) {
6331 PyErr_SetString(
6332 PyExc_UnicodeError,
6333 "\\N escapes not supported (can't load unicodedata module)"
6334 );
6335 goto onError;
6336 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00006337 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006338
6339 message = "malformed \\N character escape";
Gregory P. Smith746b2d32018-11-13 13:16:54 -08006340 if (s < end && *s == '{') {
Victor Stinner62ec3312016-09-06 17:04:34 -07006341 const char *start = ++s;
6342 size_t namelen;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006343 /* look for the closing brace */
Victor Stinner62ec3312016-09-06 17:04:34 -07006344 while (s < end && *s != '}')
Fredrik Lundhccc74732001-02-18 22:13:49 +00006345 s++;
Victor Stinner62ec3312016-09-06 17:04:34 -07006346 namelen = s - start;
6347 if (namelen && s < end) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00006348 /* found a name. look it up in the unicode database */
Fredrik Lundhccc74732001-02-18 22:13:49 +00006349 s++;
Victor Stinner62ec3312016-09-06 17:04:34 -07006350 ch = 0xffffffff; /* in case 'getcode' messes up */
6351 if (namelen <= INT_MAX &&
6352 ucnhash_CAPI->getcode(NULL, start, (int)namelen,
6353 &ch, 0)) {
6354 assert(ch <= MAX_UNICODE);
6355 WRITE_CHAR(ch);
6356 continue;
6357 }
6358 message = "unknown Unicode character name";
Fredrik Lundhccc74732001-02-18 22:13:49 +00006359 }
6360 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02006361 goto error;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006362
6363 default:
Eric V. Smith42454af2016-10-31 09:22:08 -04006364 if (*first_invalid_escape == NULL) {
6365 *first_invalid_escape = s-1; /* Back up one char, since we've
6366 already incremented s. */
6367 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006368 WRITE_ASCII_CHAR('\\');
6369 WRITE_CHAR(c);
6370 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006371 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02006372
6373 error:
6374 endinpos = s-starts;
Victor Stinner62ec3312016-09-06 17:04:34 -07006375 writer.min_length = end - s + writer.pos;
Serhiy Storchaka8fe5a9f2013-01-29 10:37:39 +02006376 if (unicode_decode_call_errorhandler_writer(
Serhiy Storchakad6793772013-01-29 10:20:44 +02006377 errors, &errorHandler,
6378 "unicodeescape", message,
6379 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinner62ec3312016-09-06 17:04:34 -07006380 &writer)) {
Serhiy Storchakad6793772013-01-29 10:20:44 +02006381 goto onError;
Victor Stinner62ec3312016-09-06 17:04:34 -07006382 }
Serhiy Storchakab7e2d672018-02-13 08:27:33 +02006383 assert(end - s <= writer.size - writer.pos);
Victor Stinner62ec3312016-09-06 17:04:34 -07006384
6385#undef WRITE_ASCII_CHAR
6386#undef WRITE_CHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00006387 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006388
Walter Dörwaldd4ade082003-08-15 15:00:26 +00006389 Py_XDECREF(errorHandler);
6390 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006391 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald8c077222002-03-25 11:16:18 +00006392
Benjamin Peterson29060642009-01-31 22:14:21 +00006393 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006394 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006395 Py_XDECREF(errorHandler);
6396 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006397 return NULL;
6398}
6399
Eric V. Smith42454af2016-10-31 09:22:08 -04006400PyObject *
6401PyUnicode_DecodeUnicodeEscape(const char *s,
6402 Py_ssize_t size,
6403 const char *errors)
6404{
6405 const char *first_invalid_escape;
6406 PyObject *result = _PyUnicode_DecodeUnicodeEscape(s, size, errors,
6407 &first_invalid_escape);
6408 if (result == NULL)
6409 return NULL;
6410 if (first_invalid_escape != NULL) {
6411 if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
6412 "invalid escape sequence '\\%c'",
Serhiy Storchaka56cb4652017-10-20 17:08:15 +03006413 (unsigned char)*first_invalid_escape) < 0) {
Eric V. Smith42454af2016-10-31 09:22:08 -04006414 Py_DECREF(result);
6415 return NULL;
6416 }
6417 }
6418 return result;
6419}
6420
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006421/* Return a Unicode-Escape string version of the Unicode object. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006422
Alexander Belopolsky40018472011-02-26 01:02:56 +00006423PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006424PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006425{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006426 Py_ssize_t i, len;
Victor Stinner62ec3312016-09-06 17:04:34 -07006427 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006428 char *p;
Victor Stinner62ec3312016-09-06 17:04:34 -07006429 enum PyUnicode_Kind kind;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006430 void *data;
Victor Stinner62ec3312016-09-06 17:04:34 -07006431 Py_ssize_t expandsize;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006432
Ezio Melottie7f90372012-10-05 03:33:31 +03006433 /* Initial allocation is based on the longest-possible character
Thomas Wouters89f507f2006-12-13 04:49:30 +00006434 escape.
6435
Ezio Melottie7f90372012-10-05 03:33:31 +03006436 For UCS1 strings it's '\xxx', 4 bytes per source character.
6437 For UCS2 strings it's '\uxxxx', 6 bytes per source character.
6438 For UCS4 strings it's '\U00xxxxxx', 10 bytes per source character.
Thomas Wouters89f507f2006-12-13 04:49:30 +00006439 */
6440
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006441 if (!PyUnicode_Check(unicode)) {
6442 PyErr_BadArgument();
6443 return NULL;
6444 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006445 if (PyUnicode_READY(unicode) == -1) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006446 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006447 }
Victor Stinner358af132015-10-12 22:36:57 +02006448
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006449 len = PyUnicode_GET_LENGTH(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006450 if (len == 0) {
6451 return PyBytes_FromStringAndSize(NULL, 0);
6452 }
6453
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006454 kind = PyUnicode_KIND(unicode);
6455 data = PyUnicode_DATA(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006456 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6457 bytes, and 1 byte characters 4. */
6458 expandsize = kind * 2 + 2;
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006459 if (len > PY_SSIZE_T_MAX / expandsize) {
Victor Stinner62ec3312016-09-06 17:04:34 -07006460 return PyErr_NoMemory();
6461 }
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006462 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
Victor Stinner62ec3312016-09-06 17:04:34 -07006463 if (repr == NULL) {
6464 return NULL;
6465 }
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006466
Victor Stinner62ec3312016-09-06 17:04:34 -07006467 p = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006468 for (i = 0; i < len; i++) {
Victor Stinner3326cb62011-11-10 20:15:25 +01006469 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006470
Victor Stinner62ec3312016-09-06 17:04:34 -07006471 /* U+0000-U+00ff range */
6472 if (ch < 0x100) {
6473 if (ch >= ' ' && ch < 127) {
6474 if (ch != '\\') {
6475 /* Copy printable US ASCII as-is */
6476 *p++ = (char) ch;
6477 }
6478 /* Escape backslashes */
6479 else {
6480 *p++ = '\\';
6481 *p++ = '\\';
6482 }
6483 }
Victor Stinner358af132015-10-12 22:36:57 +02006484
Victor Stinner62ec3312016-09-06 17:04:34 -07006485 /* Map special whitespace to '\t', \n', '\r' */
6486 else if (ch == '\t') {
6487 *p++ = '\\';
6488 *p++ = 't';
6489 }
6490 else if (ch == '\n') {
6491 *p++ = '\\';
6492 *p++ = 'n';
6493 }
6494 else if (ch == '\r') {
6495 *p++ = '\\';
6496 *p++ = 'r';
6497 }
6498
6499 /* Map non-printable US ASCII and 8-bit characters to '\xHH' */
6500 else {
6501 *p++ = '\\';
6502 *p++ = 'x';
6503 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6504 *p++ = Py_hexdigits[ch & 0x000F];
6505 }
Tim Petersced69f82003-09-16 20:30:58 +00006506 }
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006507 /* U+0100-U+ffff range: Map 16-bit characters to '\uHHHH' */
Victor Stinner62ec3312016-09-06 17:04:34 -07006508 else if (ch < 0x10000) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006509 *p++ = '\\';
6510 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006511 *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
6512 *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
6513 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6514 *p++ = Py_hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006515 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006516 /* U+010000-U+10ffff range: Map 21-bit characters to '\U00HHHHHH' */
6517 else {
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006518
Victor Stinner62ec3312016-09-06 17:04:34 -07006519 /* Make sure that the first two digits are zero */
6520 assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006521 *p++ = '\\';
Victor Stinner62ec3312016-09-06 17:04:34 -07006522 *p++ = 'U';
6523 *p++ = '0';
6524 *p++ = '0';
6525 *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
6526 *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
6527 *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
6528 *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
6529 *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
6530 *p++ = Py_hexdigits[ch & 0x0000000F];
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006531 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006532 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006533
Victor Stinner62ec3312016-09-06 17:04:34 -07006534 assert(p - PyBytes_AS_STRING(repr) > 0);
6535 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) {
6536 return NULL;
6537 }
6538 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006539}
6540
Alexander Belopolsky40018472011-02-26 01:02:56 +00006541PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006542PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
6543 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006544{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006545 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02006546 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Victor Stinner62ec3312016-09-06 17:04:34 -07006547 if (tmp == NULL) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006548 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006549 }
6550
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006551 result = PyUnicode_AsUnicodeEscapeString(tmp);
6552 Py_DECREF(tmp);
6553 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006554}
6555
6556/* --- Raw Unicode Escape Codec ------------------------------------------- */
6557
Alexander Belopolsky40018472011-02-26 01:02:56 +00006558PyObject *
6559PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006560 Py_ssize_t size,
6561 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006562{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006563 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006564 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006565 const char *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006566 PyObject *errorHandler = NULL;
6567 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006568
Victor Stinner62ec3312016-09-06 17:04:34 -07006569 if (size == 0) {
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02006570 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner62ec3312016-09-06 17:04:34 -07006571 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006572
Guido van Rossumd57fd912000-03-10 22:53:23 +00006573 /* Escaped strings will always be longer than the resulting
6574 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006575 length after conversion to the true value. (But decoding error
6576 handler might have to resize the string) */
Victor Stinner8f674cc2013-04-17 23:02:17 +02006577 _PyUnicodeWriter_Init(&writer);
Inada Naoki770847a2019-06-24 12:30:24 +09006578 writer.min_length = size;
Victor Stinner62ec3312016-09-06 17:04:34 -07006579 if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
6580 goto onError;
6581 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006582
Guido van Rossumd57fd912000-03-10 22:53:23 +00006583 end = s + size;
6584 while (s < end) {
Victor Stinner62ec3312016-09-06 17:04:34 -07006585 unsigned char c = (unsigned char) *s++;
6586 Py_UCS4 ch;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006587 int count;
Victor Stinner62ec3312016-09-06 17:04:34 -07006588 Py_ssize_t startinpos;
6589 Py_ssize_t endinpos;
6590 const char *message;
6591
6592#define WRITE_CHAR(ch) \
6593 do { \
6594 if (ch <= writer.maxchar) { \
6595 assert(writer.pos < writer.size); \
6596 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6597 } \
6598 else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
6599 goto onError; \
6600 } \
6601 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006602
Benjamin Peterson29060642009-01-31 22:14:21 +00006603 /* Non-escape characters are interpreted as Unicode ordinals */
Victor Stinner62ec3312016-09-06 17:04:34 -07006604 if (c != '\\' || s >= end) {
6605 WRITE_CHAR(c);
Benjamin Peterson29060642009-01-31 22:14:21 +00006606 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006607 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006608
Victor Stinner62ec3312016-09-06 17:04:34 -07006609 c = (unsigned char) *s++;
6610 if (c == 'u') {
6611 count = 4;
6612 message = "truncated \\uXXXX escape";
Benjamin Peterson29060642009-01-31 22:14:21 +00006613 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006614 else if (c == 'U') {
6615 count = 8;
6616 message = "truncated \\UXXXXXXXX escape";
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006617 }
6618 else {
Victor Stinner62ec3312016-09-06 17:04:34 -07006619 assert(writer.pos < writer.size);
6620 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, '\\');
6621 WRITE_CHAR(c);
6622 continue;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006623 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006624 startinpos = s - starts - 2;
6625
6626 /* \uHHHH with 4 hex digits, \U00HHHHHH with 8 */
6627 for (ch = 0; count && s < end; ++s, --count) {
6628 c = (unsigned char)*s;
6629 ch <<= 4;
6630 if (c >= '0' && c <= '9') {
6631 ch += c - '0';
6632 }
6633 else if (c >= 'a' && c <= 'f') {
6634 ch += c - ('a' - 10);
6635 }
6636 else if (c >= 'A' && c <= 'F') {
6637 ch += c - ('A' - 10);
6638 }
6639 else {
6640 break;
6641 }
6642 }
6643 if (!count) {
6644 if (ch <= MAX_UNICODE) {
6645 WRITE_CHAR(ch);
6646 continue;
6647 }
6648 message = "\\Uxxxxxxxx out of range";
6649 }
6650
6651 endinpos = s-starts;
6652 writer.min_length = end - s + writer.pos;
6653 if (unicode_decode_call_errorhandler_writer(
6654 errors, &errorHandler,
6655 "rawunicodeescape", message,
6656 &starts, &end, &startinpos, &endinpos, &exc, &s,
6657 &writer)) {
6658 goto onError;
6659 }
Serhiy Storchakab7e2d672018-02-13 08:27:33 +02006660 assert(end - s <= writer.size - writer.pos);
Victor Stinner62ec3312016-09-06 17:04:34 -07006661
6662#undef WRITE_CHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00006663 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006664 Py_XDECREF(errorHandler);
6665 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006666 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00006667
Benjamin Peterson29060642009-01-31 22:14:21 +00006668 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006669 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006670 Py_XDECREF(errorHandler);
6671 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006672 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006673
Guido van Rossumd57fd912000-03-10 22:53:23 +00006674}
6675
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006676
Alexander Belopolsky40018472011-02-26 01:02:56 +00006677PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006678PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006679{
Victor Stinner62ec3312016-09-06 17:04:34 -07006680 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006681 char *p;
Victor Stinner62ec3312016-09-06 17:04:34 -07006682 Py_ssize_t expandsize, pos;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006683 int kind;
6684 void *data;
6685 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006686
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006687 if (!PyUnicode_Check(unicode)) {
6688 PyErr_BadArgument();
6689 return NULL;
6690 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006691 if (PyUnicode_READY(unicode) == -1) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006692 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006693 }
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006694 kind = PyUnicode_KIND(unicode);
6695 data = PyUnicode_DATA(unicode);
6696 len = PyUnicode_GET_LENGTH(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006697 if (kind == PyUnicode_1BYTE_KIND) {
6698 return PyBytes_FromStringAndSize(data, len);
6699 }
Victor Stinner0e368262011-11-10 20:12:49 +01006700
Victor Stinner62ec3312016-09-06 17:04:34 -07006701 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6702 bytes, and 1 byte characters 4. */
6703 expandsize = kind * 2 + 2;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006704
Victor Stinner62ec3312016-09-06 17:04:34 -07006705 if (len > PY_SSIZE_T_MAX / expandsize) {
6706 return PyErr_NoMemory();
6707 }
6708 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
6709 if (repr == NULL) {
6710 return NULL;
6711 }
6712 if (len == 0) {
6713 return repr;
6714 }
6715
6716 p = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006717 for (pos = 0; pos < len; pos++) {
6718 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Victor Stinner358af132015-10-12 22:36:57 +02006719
Victor Stinner62ec3312016-09-06 17:04:34 -07006720 /* U+0000-U+00ff range: Copy 8-bit characters as-is */
6721 if (ch < 0x100) {
6722 *p++ = (char) ch;
Tim Petersced69f82003-09-16 20:30:58 +00006723 }
Xiang Zhang2b77a922018-02-13 18:33:32 +08006724 /* U+0100-U+ffff range: Map 16-bit characters to '\uHHHH' */
Victor Stinner62ec3312016-09-06 17:04:34 -07006725 else if (ch < 0x10000) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006726 *p++ = '\\';
6727 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006728 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6729 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6730 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6731 *p++ = Py_hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006732 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006733 /* U+010000-U+10ffff range: Map 32-bit characters to '\U00HHHHHH' */
6734 else {
6735 assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
6736 *p++ = '\\';
6737 *p++ = 'U';
6738 *p++ = '0';
6739 *p++ = '0';
6740 *p++ = Py_hexdigits[(ch >> 20) & 0xf];
6741 *p++ = Py_hexdigits[(ch >> 16) & 0xf];
6742 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6743 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6744 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6745 *p++ = Py_hexdigits[ch & 15];
6746 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006747 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006748
Victor Stinner62ec3312016-09-06 17:04:34 -07006749 assert(p > PyBytes_AS_STRING(repr));
6750 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) {
6751 return NULL;
6752 }
6753 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006754}
6755
Alexander Belopolsky40018472011-02-26 01:02:56 +00006756PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006757PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
6758 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006759{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006760 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02006761 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006762 if (tmp == NULL)
Walter Dörwald711005d2007-05-12 12:03:26 +00006763 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006764 result = PyUnicode_AsRawUnicodeEscapeString(tmp);
6765 Py_DECREF(tmp);
6766 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006767}
6768
6769/* --- Latin-1 Codec ------------------------------------------------------ */
6770
Alexander Belopolsky40018472011-02-26 01:02:56 +00006771PyObject *
6772PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006773 Py_ssize_t size,
6774 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006775{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006776 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Andy Lestere6be9b52020-02-11 20:28:35 -06006777 return _PyUnicode_FromUCS1((const unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006778}
6779
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006780/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006781static void
6782make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006783 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006784 PyObject *unicode,
6785 Py_ssize_t startpos, Py_ssize_t endpos,
6786 const char *reason)
6787{
6788 if (*exceptionObject == NULL) {
6789 *exceptionObject = PyObject_CallFunction(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006790 PyExc_UnicodeEncodeError, "sOnns",
Martin v. Löwis9e816682011-11-02 12:45:42 +01006791 encoding, unicode, startpos, endpos, reason);
6792 }
6793 else {
6794 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6795 goto onError;
6796 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6797 goto onError;
6798 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6799 goto onError;
6800 return;
6801 onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02006802 Py_CLEAR(*exceptionObject);
Martin v. Löwis9e816682011-11-02 12:45:42 +01006803 }
6804}
6805
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006806/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006807static void
6808raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006809 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006810 PyObject *unicode,
6811 Py_ssize_t startpos, Py_ssize_t endpos,
6812 const char *reason)
6813{
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006814 make_encode_exception(exceptionObject,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006815 encoding, unicode, startpos, endpos, reason);
6816 if (*exceptionObject != NULL)
6817 PyCodec_StrictErrors(*exceptionObject);
6818}
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006819
6820/* error handling callback helper:
6821 build arguments, call the callback and check the arguments,
6822 put the result into newpos and return the replacement string, which
6823 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006824static PyObject *
6825unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006826 PyObject **errorHandler,
6827 const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006828 PyObject *unicode, PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006829 Py_ssize_t startpos, Py_ssize_t endpos,
6830 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006831{
Serhiy Storchaka2d06e842015-12-25 19:53:18 +02006832 static const char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006833 Py_ssize_t len;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006834 PyObject *restuple;
6835 PyObject *resunicode;
6836
6837 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006838 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006839 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006840 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006841 }
6842
Benjamin Petersonbac79492012-01-14 13:34:47 -05006843 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006844 return NULL;
6845 len = PyUnicode_GET_LENGTH(unicode);
6846
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006847 make_encode_exception(exceptionObject,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006848 encoding, unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006849 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006850 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006851
Petr Viktorinffd97532020-02-11 17:46:57 +01006852 restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006853 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006854 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006855 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006856 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00006857 Py_DECREF(restuple);
6858 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006859 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006860 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00006861 &resunicode, newpos)) {
6862 Py_DECREF(restuple);
6863 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006864 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006865 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6866 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6867 Py_DECREF(restuple);
6868 return NULL;
6869 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006870 if (*newpos<0)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006871 *newpos = len + *newpos;
6872 if (*newpos<0 || *newpos>len) {
Victor Stinnera33bce02014-07-04 22:47:46 +02006873 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00006874 Py_DECREF(restuple);
6875 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006876 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006877 Py_INCREF(resunicode);
6878 Py_DECREF(restuple);
6879 return resunicode;
6880}
6881
Alexander Belopolsky40018472011-02-26 01:02:56 +00006882static PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006883unicode_encode_ucs1(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006884 const char *errors,
Victor Stinner0030cd52015-09-24 14:45:00 +02006885 const Py_UCS4 limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006886{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006887 /* input state */
6888 Py_ssize_t pos=0, size;
6889 int kind;
6890 void *data;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006891 /* pointer into the output */
6892 char *str;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006893 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6894 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Victor Stinner50149202015-09-22 00:26:54 +02006895 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006896 PyObject *exc = NULL;
Victor Stinner50149202015-09-22 00:26:54 +02006897 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Victor Stinner6bd525b2015-10-09 13:10:05 +02006898 PyObject *rep = NULL;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006899 /* output object */
6900 _PyBytesWriter writer;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006901
Benjamin Petersonbac79492012-01-14 13:34:47 -05006902 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006903 return NULL;
6904 size = PyUnicode_GET_LENGTH(unicode);
6905 kind = PyUnicode_KIND(unicode);
6906 data = PyUnicode_DATA(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006907 /* allocate enough for a simple encoding without
6908 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00006909 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00006910 return PyBytes_FromStringAndSize(NULL, 0);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006911
6912 _PyBytesWriter_Init(&writer);
6913 str = _PyBytesWriter_Alloc(&writer, size);
6914 if (str == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00006915 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006916
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006917 while (pos < size) {
Victor Stinner0030cd52015-09-24 14:45:00 +02006918 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006919
Benjamin Peterson29060642009-01-31 22:14:21 +00006920 /* can we encode this? */
Victor Stinner0030cd52015-09-24 14:45:00 +02006921 if (ch < limit) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006922 /* no overflow check, because we know that the space is enough */
Victor Stinner0030cd52015-09-24 14:45:00 +02006923 *str++ = (char)ch;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006924 ++pos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006925 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006926 else {
Victor Stinner6bd525b2015-10-09 13:10:05 +02006927 Py_ssize_t newpos, i;
Benjamin Peterson29060642009-01-31 22:14:21 +00006928 /* startpos for collecting unencodable chars */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006929 Py_ssize_t collstart = pos;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006930 Py_ssize_t collend = collstart + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006931 /* find all unecodable characters */
Victor Stinner50149202015-09-22 00:26:54 +02006932
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006933 while ((collend < size) && (PyUnicode_READ(kind, data, collend) >= limit))
Benjamin Peterson29060642009-01-31 22:14:21 +00006934 ++collend;
Victor Stinner50149202015-09-22 00:26:54 +02006935
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006936 /* Only overallocate the buffer if it's not the last write */
6937 writer.overallocate = (collend < size);
6938
Benjamin Peterson29060642009-01-31 22:14:21 +00006939 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
Victor Stinner50149202015-09-22 00:26:54 +02006940 if (error_handler == _Py_ERROR_UNKNOWN)
Victor Stinner3d4226a2018-08-29 22:21:32 +02006941 error_handler = _Py_GetErrorHandler(errors);
Victor Stinner50149202015-09-22 00:26:54 +02006942
6943 switch (error_handler) {
6944 case _Py_ERROR_STRICT:
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006945 raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006946 goto onError;
Victor Stinner50149202015-09-22 00:26:54 +02006947
6948 case _Py_ERROR_REPLACE:
Victor Stinner01ada392015-10-01 21:54:51 +02006949 memset(str, '?', collend - collstart);
6950 str += (collend - collstart);
Stefan Krahf432a322017-08-21 13:09:59 +02006951 /* fall through */
Victor Stinner50149202015-09-22 00:26:54 +02006952 case _Py_ERROR_IGNORE:
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006953 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006954 break;
Victor Stinner50149202015-09-22 00:26:54 +02006955
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006956 case _Py_ERROR_BACKSLASHREPLACE:
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07006957 /* subtract preallocated bytes */
Victor Stinnerad771582015-10-09 12:38:53 +02006958 writer.min_size -= (collend - collstart);
6959 str = backslashreplace(&writer, str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006960 unicode, collstart, collend);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006961 if (str == NULL)
6962 goto onError;
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006963 pos = collend;
6964 break;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006965
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006966 case _Py_ERROR_XMLCHARREFREPLACE:
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07006967 /* subtract preallocated bytes */
Victor Stinnerad771582015-10-09 12:38:53 +02006968 writer.min_size -= (collend - collstart);
6969 str = xmlcharrefreplace(&writer, str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006970 unicode, collstart, collend);
6971 if (str == NULL)
6972 goto onError;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006973 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006974 break;
Victor Stinner50149202015-09-22 00:26:54 +02006975
Victor Stinnerc3713e92015-09-29 12:32:13 +02006976 case _Py_ERROR_SURROGATEESCAPE:
6977 for (i = collstart; i < collend; ++i) {
6978 ch = PyUnicode_READ(kind, data, i);
6979 if (ch < 0xdc80 || 0xdcff < ch) {
6980 /* Not a UTF-8b surrogate */
6981 break;
6982 }
6983 *str++ = (char)(ch - 0xdc00);
6984 ++pos;
6985 }
6986 if (i >= collend)
6987 break;
6988 collstart = pos;
6989 assert(collstart != collend);
Stefan Krahf432a322017-08-21 13:09:59 +02006990 /* fall through */
Victor Stinnerc3713e92015-09-29 12:32:13 +02006991
Benjamin Peterson29060642009-01-31 22:14:21 +00006992 default:
Victor Stinner6bd525b2015-10-09 13:10:05 +02006993 rep = unicode_encode_call_errorhandler(errors, &error_handler_obj,
6994 encoding, reason, unicode, &exc,
6995 collstart, collend, &newpos);
6996 if (rep == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006997 goto onError;
Victor Stinner0030cd52015-09-24 14:45:00 +02006998
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07006999 /* subtract preallocated bytes */
Xiang Zhangd04d8472016-11-23 19:34:01 +08007000 writer.min_size -= newpos - collstart;
Victor Stinnerad771582015-10-09 12:38:53 +02007001
Victor Stinner6bd525b2015-10-09 13:10:05 +02007002 if (PyBytes_Check(rep)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00007003 /* Directly copy bytes result to output. */
Victor Stinnerce179bf2015-10-09 12:57:22 +02007004 str = _PyBytesWriter_WriteBytes(&writer, str,
Victor Stinner6bd525b2015-10-09 13:10:05 +02007005 PyBytes_AS_STRING(rep),
7006 PyBytes_GET_SIZE(rep));
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007007 }
Victor Stinner6bd525b2015-10-09 13:10:05 +02007008 else {
7009 assert(PyUnicode_Check(rep));
Victor Stinner0030cd52015-09-24 14:45:00 +02007010
Victor Stinner6bd525b2015-10-09 13:10:05 +02007011 if (PyUnicode_READY(rep) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007012 goto onError;
Victor Stinner6bd525b2015-10-09 13:10:05 +02007013
Serhiy Storchaka99250d52016-11-23 15:13:00 +02007014 if (limit == 256 ?
7015 PyUnicode_KIND(rep) != PyUnicode_1BYTE_KIND :
7016 !PyUnicode_IS_ASCII(rep))
7017 {
7018 /* Not all characters are smaller than limit */
7019 raise_encode_exception(&exc, encoding, unicode,
7020 collstart, collend, reason);
7021 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00007022 }
Serhiy Storchaka99250d52016-11-23 15:13:00 +02007023 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
7024 str = _PyBytesWriter_WriteBytes(&writer, str,
7025 PyUnicode_DATA(rep),
7026 PyUnicode_GET_LENGTH(rep));
Benjamin Peterson29060642009-01-31 22:14:21 +00007027 }
Alexey Izbyshev74a307d2018-08-19 21:52:04 +03007028 if (str == NULL)
7029 goto onError;
7030
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007031 pos = newpos;
Victor Stinner6bd525b2015-10-09 13:10:05 +02007032 Py_CLEAR(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007033 }
Victor Stinnerfdfbf782015-10-09 00:33:49 +02007034
7035 /* If overallocation was disabled, ensure that it was the last
7036 write. Otherwise, we missed an optimization */
7037 assert(writer.overallocate || pos == size);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007038 }
7039 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00007040
Victor Stinner50149202015-09-22 00:26:54 +02007041 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007042 Py_XDECREF(exc);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02007043 return _PyBytesWriter_Finish(&writer, str);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00007044
7045 onError:
Victor Stinner6bd525b2015-10-09 13:10:05 +02007046 Py_XDECREF(rep);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02007047 _PyBytesWriter_Dealloc(&writer);
Victor Stinner50149202015-09-22 00:26:54 +02007048 Py_XDECREF(error_handler_obj);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00007049 Py_XDECREF(exc);
7050 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007051}
7052
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007053/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007054PyObject *
7055PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03007056 Py_ssize_t size,
7057 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007058{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007059 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02007060 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007061 if (unicode == NULL)
7062 return NULL;
7063 result = unicode_encode_ucs1(unicode, errors, 256);
7064 Py_DECREF(unicode);
7065 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007066}
7067
Alexander Belopolsky40018472011-02-26 01:02:56 +00007068PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007069_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007070{
7071 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007072 PyErr_BadArgument();
7073 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007074 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007075 if (PyUnicode_READY(unicode) == -1)
7076 return NULL;
7077 /* Fast path: if it is a one-byte string, construct
7078 bytes object directly. */
7079 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
7080 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
7081 PyUnicode_GET_LENGTH(unicode));
7082 /* Non-Latin-1 characters present. Defer to above function to
7083 raise the exception. */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007084 return unicode_encode_ucs1(unicode, errors, 256);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007085}
7086
7087PyObject*
7088PyUnicode_AsLatin1String(PyObject *unicode)
7089{
7090 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007091}
7092
7093/* --- 7-bit ASCII Codec -------------------------------------------------- */
7094
Alexander Belopolsky40018472011-02-26 01:02:56 +00007095PyObject *
7096PyUnicode_DecodeASCII(const char *s,
7097 Py_ssize_t size,
7098 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007099{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007100 const char *starts = s;
Inada Naoki770847a2019-06-24 12:30:24 +09007101 const char *e = s + size;
Victor Stinnerf96418d2015-09-21 23:06:27 +02007102 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007103 PyObject *exc = NULL;
Victor Stinnerf96418d2015-09-21 23:06:27 +02007104 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Tim Petersced69f82003-09-16 20:30:58 +00007105
Guido van Rossumd57fd912000-03-10 22:53:23 +00007106 if (size == 0)
Serhiy Storchaka678db842013-01-26 12:16:36 +02007107 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01007108
Guido van Rossumd57fd912000-03-10 22:53:23 +00007109 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Victor Stinner702c7342011-10-05 13:50:52 +02007110 if (size == 1 && (unsigned char)s[0] < 128)
7111 return get_latin1_char((unsigned char)s[0]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007112
Inada Naoki770847a2019-06-24 12:30:24 +09007113 // Shortcut for simple case
7114 PyObject *u = PyUnicode_New(size, 127);
7115 if (u == NULL) {
Victor Stinner8f674cc2013-04-17 23:02:17 +02007116 return NULL;
Inada Naoki770847a2019-06-24 12:30:24 +09007117 }
7118 Py_ssize_t outpos = ascii_decode(s, e, PyUnicode_DATA(u));
7119 if (outpos == size) {
7120 return u;
7121 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02007122
Inada Naoki770847a2019-06-24 12:30:24 +09007123 _PyUnicodeWriter writer;
7124 _PyUnicodeWriter_InitWithBuffer(&writer, u);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007125 writer.pos = outpos;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02007126
Inada Naoki770847a2019-06-24 12:30:24 +09007127 s += outpos;
7128 int kind = writer.kind;
7129 void *data = writer.data;
7130 Py_ssize_t startinpos, endinpos;
7131
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007132 while (s < e) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02007133 unsigned char c = (unsigned char)*s;
Benjamin Peterson29060642009-01-31 22:14:21 +00007134 if (c < 128) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007135 PyUnicode_WRITE(kind, data, writer.pos, c);
7136 writer.pos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00007137 ++s;
Victor Stinnerf96418d2015-09-21 23:06:27 +02007138 continue;
Benjamin Peterson29060642009-01-31 22:14:21 +00007139 }
Victor Stinnerf96418d2015-09-21 23:06:27 +02007140
7141 /* byte outsize range 0x00..0x7f: call the error handler */
7142
7143 if (error_handler == _Py_ERROR_UNKNOWN)
Victor Stinner3d4226a2018-08-29 22:21:32 +02007144 error_handler = _Py_GetErrorHandler(errors);
Victor Stinnerf96418d2015-09-21 23:06:27 +02007145
7146 switch (error_handler)
7147 {
7148 case _Py_ERROR_REPLACE:
7149 case _Py_ERROR_SURROGATEESCAPE:
7150 /* Fast-path: the error handler only writes one character,
Victor Stinnerca9381e2015-09-22 00:58:32 +02007151 but we may switch to UCS2 at the first write */
7152 if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
7153 goto onError;
7154 kind = writer.kind;
7155 data = writer.data;
Victor Stinnerf96418d2015-09-21 23:06:27 +02007156
7157 if (error_handler == _Py_ERROR_REPLACE)
7158 PyUnicode_WRITE(kind, data, writer.pos, 0xfffd);
7159 else
7160 PyUnicode_WRITE(kind, data, writer.pos, c + 0xdc00);
7161 writer.pos++;
7162 ++s;
7163 break;
7164
7165 case _Py_ERROR_IGNORE:
7166 ++s;
7167 break;
7168
7169 default:
Benjamin Peterson29060642009-01-31 22:14:21 +00007170 startinpos = s-starts;
7171 endinpos = startinpos + 1;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007172 if (unicode_decode_call_errorhandler_writer(
Victor Stinnerf96418d2015-09-21 23:06:27 +02007173 errors, &error_handler_obj,
Benjamin Peterson29060642009-01-31 22:14:21 +00007174 "ascii", "ordinal not in range(128)",
7175 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007176 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00007177 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007178 kind = writer.kind;
7179 data = writer.data;
Benjamin Peterson29060642009-01-31 22:14:21 +00007180 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007181 }
Victor Stinnerf96418d2015-09-21 23:06:27 +02007182 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007183 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007184 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00007185
Benjamin Peterson29060642009-01-31 22:14:21 +00007186 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007187 _PyUnicodeWriter_Dealloc(&writer);
Victor Stinnerf96418d2015-09-21 23:06:27 +02007188 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007189 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007190 return NULL;
7191}
7192
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007193/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007194PyObject *
7195PyUnicode_EncodeASCII(const Py_UNICODE *p,
7196 Py_ssize_t size,
7197 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007198{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007199 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02007200 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007201 if (unicode == NULL)
7202 return NULL;
7203 result = unicode_encode_ucs1(unicode, errors, 128);
7204 Py_DECREF(unicode);
7205 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007206}
7207
Alexander Belopolsky40018472011-02-26 01:02:56 +00007208PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007209_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007210{
7211 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007212 PyErr_BadArgument();
7213 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007214 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007215 if (PyUnicode_READY(unicode) == -1)
7216 return NULL;
7217 /* Fast path: if it is an ASCII-only string, construct bytes object
7218 directly. Else defer to above function to raise the exception. */
Victor Stinneraf037572013-04-14 18:44:10 +02007219 if (PyUnicode_IS_ASCII(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007220 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
7221 PyUnicode_GET_LENGTH(unicode));
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007222 return unicode_encode_ucs1(unicode, errors, 128);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007223}
7224
7225PyObject *
7226PyUnicode_AsASCIIString(PyObject *unicode)
7227{
7228 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007229}
7230
Steve Dowercc16be82016-09-08 10:35:16 -07007231#ifdef MS_WINDOWS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007232
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007233/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007234
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00007235#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007236#define NEED_RETRY
7237#endif
7238
/* INT_MAX is the theoretical largest chunk (or INT_MAX / 2 when
   transcoding from UTF-16), but INT_MAX / 4 performs better in
   both cases and also avoids partial characters overrunning the
   length limit in MultiByteToWideChar on Windows */
7243#define DECODING_CHUNK_SIZE (INT_MAX/4)
7244
Victor Stinner3a50e702011-10-18 21:21:00 +02007245#ifndef WC_ERR_INVALID_CHARS
7246# define WC_ERR_INVALID_CHARS 0x0080
7247#endif
7248
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007249static const char*
Victor Stinner3a50e702011-10-18 21:21:00 +02007250code_page_name(UINT code_page, PyObject **obj)
7251{
7252 *obj = NULL;
7253 if (code_page == CP_ACP)
7254 return "mbcs";
7255 if (code_page == CP_UTF7)
7256 return "CP_UTF7";
7257 if (code_page == CP_UTF8)
7258 return "CP_UTF8";
7259
7260 *obj = PyBytes_FromFormat("cp%u", code_page);
7261 if (*obj == NULL)
7262 return NULL;
7263 return PyBytes_AS_STRING(*obj);
7264}
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007265
Victor Stinner3a50e702011-10-18 21:21:00 +02007266static DWORD
7267decode_code_page_flags(UINT code_page)
7268{
7269 if (code_page == CP_UTF7) {
7270 /* The CP_UTF7 decoder only supports flags=0 */
7271 return 0;
7272 }
7273 else
7274 return MB_ERR_INVALID_CHARS;
7275}
7276
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007277/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007278 * Decode a byte string from a Windows code page into unicode object in strict
7279 * mode.
7280 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007281 * Returns consumed size if succeed, returns -2 on decode error, or raise an
7282 * OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007283 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007284static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007285decode_code_page_strict(UINT code_page,
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007286 wchar_t **buf,
7287 Py_ssize_t *bufsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02007288 const char *in,
7289 int insize)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007290{
Serhiy Storchakac1e2c282019-03-20 21:45:18 +02007291 DWORD flags = MB_ERR_INVALID_CHARS;
Victor Stinner24729f32011-11-10 20:31:37 +01007292 wchar_t *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007293 DWORD outsize;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007294
7295 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007296 assert(insize > 0);
Serhiy Storchakac1e2c282019-03-20 21:45:18 +02007297 while ((outsize = MultiByteToWideChar(code_page, flags,
7298 in, insize, NULL, 0)) <= 0)
7299 {
7300 if (!flags || GetLastError() != ERROR_INVALID_FLAGS) {
7301 goto error;
7302 }
7303 /* For some code pages (e.g. UTF-7) flags must be set to 0. */
7304 flags = 0;
7305 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007306
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007307 /* Extend a wchar_t* buffer */
7308 Py_ssize_t n = *bufsize; /* Get the current length */
7309 if (widechar_resize(buf, bufsize, n + outsize) < 0) {
7310 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007311 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007312 out = *buf + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007313
7314 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007315 outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
7316 if (outsize <= 0)
7317 goto error;
7318 return insize;
Victor Stinner554f3f02010-06-16 23:33:54 +00007319
Victor Stinner3a50e702011-10-18 21:21:00 +02007320error:
7321 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7322 return -2;
7323 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007324 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007325}
7326
Victor Stinner3a50e702011-10-18 21:21:00 +02007327/*
7328 * Decode a byte string from a code page into unicode object with an error
7329 * handler.
7330 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007331 * Returns consumed size if succeed, or raise an OSError or
Victor Stinner3a50e702011-10-18 21:21:00 +02007332 * UnicodeDecodeError exception and returns -1 on error.
7333 */
7334static int
7335decode_code_page_errors(UINT code_page,
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007336 wchar_t **buf,
7337 Py_ssize_t *bufsize,
Victor Stinner76a31a62011-11-04 00:05:13 +01007338 const char *in, const int size,
Victor Stinner7d00cc12014-03-17 23:08:06 +01007339 const char *errors, int final)
Victor Stinner3a50e702011-10-18 21:21:00 +02007340{
7341 const char *startin = in;
7342 const char *endin = in + size;
Serhiy Storchakac1e2c282019-03-20 21:45:18 +02007343 DWORD flags = MB_ERR_INVALID_CHARS;
Victor Stinner3a50e702011-10-18 21:21:00 +02007344 /* Ideally, we should get reason from FormatMessage. This is the Windows
7345 2000 English version of the message. */
7346 const char *reason = "No mapping for the Unicode character exists "
7347 "in the target code page.";
7348 /* each step cannot decode more than 1 character, but a character can be
7349 represented as a surrogate pair */
Serhiy Storchaka4013c172018-12-03 10:36:45 +02007350 wchar_t buffer[2], *out;
Victor Stinner9f067f42013-06-05 00:21:31 +02007351 int insize;
7352 Py_ssize_t outsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007353 PyObject *errorHandler = NULL;
7354 PyObject *exc = NULL;
7355 PyObject *encoding_obj = NULL;
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007356 const char *encoding;
Victor Stinner3a50e702011-10-18 21:21:00 +02007357 DWORD err;
7358 int ret = -1;
7359
7360 assert(size > 0);
7361
7362 encoding = code_page_name(code_page, &encoding_obj);
7363 if (encoding == NULL)
7364 return -1;
7365
Victor Stinner7d00cc12014-03-17 23:08:06 +01007366 if ((errors == NULL || strcmp(errors, "strict") == 0) && final) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007367 /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
7368 UnicodeDecodeError. */
7369 make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
7370 if (exc != NULL) {
7371 PyCodec_StrictErrors(exc);
7372 Py_CLEAR(exc);
7373 }
7374 goto error;
7375 }
7376
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007377 /* Extend a wchar_t* buffer */
7378 Py_ssize_t n = *bufsize; /* Get the current length */
7379 if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7380 PyErr_NoMemory();
7381 goto error;
Victor Stinner3a50e702011-10-18 21:21:00 +02007382 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007383 if (widechar_resize(buf, bufsize, n + size * Py_ARRAY_LENGTH(buffer)) < 0) {
7384 goto error;
Victor Stinner3a50e702011-10-18 21:21:00 +02007385 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007386 out = *buf + n;
Victor Stinner3a50e702011-10-18 21:21:00 +02007387
7388 /* Decode the byte string character per character */
Victor Stinner3a50e702011-10-18 21:21:00 +02007389 while (in < endin)
7390 {
7391 /* Decode a character */
7392 insize = 1;
7393 do
7394 {
7395 outsize = MultiByteToWideChar(code_page, flags,
7396 in, insize,
7397 buffer, Py_ARRAY_LENGTH(buffer));
7398 if (outsize > 0)
7399 break;
7400 err = GetLastError();
Serhiy Storchakac1e2c282019-03-20 21:45:18 +02007401 if (err == ERROR_INVALID_FLAGS && flags) {
7402 /* For some code pages (e.g. UTF-7) flags must be set to 0. */
7403 flags = 0;
7404 continue;
7405 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007406 if (err != ERROR_NO_UNICODE_TRANSLATION
7407 && err != ERROR_INSUFFICIENT_BUFFER)
7408 {
7409 PyErr_SetFromWindowsErr(0);
7410 goto error;
7411 }
7412 insize++;
7413 }
7414 /* 4=maximum length of a UTF-8 sequence */
7415 while (insize <= 4 && (in + insize) <= endin);
7416
7417 if (outsize <= 0) {
7418 Py_ssize_t startinpos, endinpos, outpos;
7419
Victor Stinner7d00cc12014-03-17 23:08:06 +01007420 /* last character in partial decode? */
7421 if (in + insize >= endin && !final)
7422 break;
7423
Victor Stinner3a50e702011-10-18 21:21:00 +02007424 startinpos = in - startin;
7425 endinpos = startinpos + 1;
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007426 outpos = out - *buf;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007427 if (unicode_decode_call_errorhandler_wchar(
Victor Stinner3a50e702011-10-18 21:21:00 +02007428 errors, &errorHandler,
7429 encoding, reason,
7430 &startin, &endin, &startinpos, &endinpos, &exc, &in,
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007431 buf, bufsize, &outpos))
Victor Stinner3a50e702011-10-18 21:21:00 +02007432 {
7433 goto error;
7434 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007435 out = *buf + outpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007436 }
7437 else {
7438 in += insize;
7439 memcpy(out, buffer, outsize * sizeof(wchar_t));
7440 out += outsize;
7441 }
7442 }
7443
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007444 /* Shrink the buffer */
7445 assert(out - *buf <= *bufsize);
7446 *bufsize = out - *buf;
Victor Stinnere1f17c62014-07-25 14:03:03 +02007447 /* (in - startin) <= size and size is an int */
7448 ret = Py_SAFE_DOWNCAST(in - startin, Py_ssize_t, int);
Victor Stinner3a50e702011-10-18 21:21:00 +02007449
7450error:
7451 Py_XDECREF(encoding_obj);
7452 Py_XDECREF(errorHandler);
7453 Py_XDECREF(exc);
7454 return ret;
7455}
7456
Victor Stinner3a50e702011-10-18 21:21:00 +02007457static PyObject *
7458decode_code_page_stateful(int code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007459 const char *s, Py_ssize_t size,
7460 const char *errors, Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007461{
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007462 wchar_t *buf = NULL;
7463 Py_ssize_t bufsize = 0;
Victor Stinner76a31a62011-11-04 00:05:13 +01007464 int chunk_size, final, converted, done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007465
Victor Stinner3a50e702011-10-18 21:21:00 +02007466 if (code_page < 0) {
7467 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7468 return NULL;
7469 }
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03007470 if (size < 0) {
7471 PyErr_BadInternalCall();
7472 return NULL;
7473 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007474
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007475 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00007476 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007477
Victor Stinner76a31a62011-11-04 00:05:13 +01007478 do
7479 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007480#ifdef NEED_RETRY
Steve Dower7ebdda02019-08-21 16:22:33 -07007481 if (size > DECODING_CHUNK_SIZE) {
7482 chunk_size = DECODING_CHUNK_SIZE;
Victor Stinner76a31a62011-11-04 00:05:13 +01007483 final = 0;
7484 done = 0;
7485 }
7486 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007487#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007488 {
7489 chunk_size = (int)size;
7490 final = (consumed == NULL);
7491 done = 1;
7492 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007493
Victor Stinner76a31a62011-11-04 00:05:13 +01007494 if (chunk_size == 0 && done) {
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007495 if (buf != NULL)
Victor Stinner76a31a62011-11-04 00:05:13 +01007496 break;
Serhiy Storchaka678db842013-01-26 12:16:36 +02007497 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner76a31a62011-11-04 00:05:13 +01007498 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007499
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007500 converted = decode_code_page_strict(code_page, &buf, &bufsize,
Victor Stinner76a31a62011-11-04 00:05:13 +01007501 s, chunk_size);
7502 if (converted == -2)
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007503 converted = decode_code_page_errors(code_page, &buf, &bufsize,
Victor Stinner76a31a62011-11-04 00:05:13 +01007504 s, chunk_size,
Victor Stinner7d00cc12014-03-17 23:08:06 +01007505 errors, final);
7506 assert(converted != 0 || done);
Victor Stinner76a31a62011-11-04 00:05:13 +01007507
7508 if (converted < 0) {
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007509 PyMem_Free(buf);
Victor Stinner76a31a62011-11-04 00:05:13 +01007510 return NULL;
7511 }
7512
7513 if (consumed)
7514 *consumed += converted;
7515
7516 s += converted;
7517 size -= converted;
7518 } while (!done);
Victor Stinner3a50e702011-10-18 21:21:00 +02007519
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007520 PyObject *v = PyUnicode_FromWideChar(buf, bufsize);
7521 PyMem_Free(buf);
7522 return v;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007523}
7524
Alexander Belopolsky40018472011-02-26 01:02:56 +00007525PyObject *
Victor Stinner3a50e702011-10-18 21:21:00 +02007526PyUnicode_DecodeCodePageStateful(int code_page,
7527 const char *s,
7528 Py_ssize_t size,
7529 const char *errors,
7530 Py_ssize_t *consumed)
7531{
7532 return decode_code_page_stateful(code_page, s, size, errors, consumed);
7533}
7534
7535PyObject *
7536PyUnicode_DecodeMBCSStateful(const char *s,
7537 Py_ssize_t size,
7538 const char *errors,
7539 Py_ssize_t *consumed)
7540{
7541 return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
7542}
7543
7544PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00007545PyUnicode_DecodeMBCS(const char *s,
7546 Py_ssize_t size,
7547 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007548{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007549 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
7550}
7551
Victor Stinner3a50e702011-10-18 21:21:00 +02007552static DWORD
7553encode_code_page_flags(UINT code_page, const char *errors)
7554{
7555 if (code_page == CP_UTF8) {
Steve Dower3e96f322015-03-02 08:01:10 -08007556 return WC_ERR_INVALID_CHARS;
Victor Stinner3a50e702011-10-18 21:21:00 +02007557 }
7558 else if (code_page == CP_UTF7) {
7559 /* CP_UTF7 only supports flags=0 */
7560 return 0;
7561 }
7562 else {
7563 if (errors != NULL && strcmp(errors, "replace") == 0)
7564 return 0;
7565 else
7566 return WC_NO_BEST_FIT_CHARS;
7567 }
7568}
7569
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007570/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007571 * Encode a Unicode string to a Windows code page into a byte string in strict
7572 * mode.
7573 *
7574 * Returns consumed characters if succeed, returns -2 on encode error, or raise
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007575 * an OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007576 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007577static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007578encode_code_page_strict(UINT code_page, PyObject **outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007579 PyObject *unicode, Py_ssize_t offset, int len,
Victor Stinner3a50e702011-10-18 21:21:00 +02007580 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007581{
Victor Stinner554f3f02010-06-16 23:33:54 +00007582 BOOL usedDefaultChar = FALSE;
Victor Stinner3a50e702011-10-18 21:21:00 +02007583 BOOL *pusedDefaultChar = &usedDefaultChar;
7584 int outsize;
Victor Stinner24729f32011-11-10 20:31:37 +01007585 wchar_t *p;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007586 Py_ssize_t size;
Victor Stinner3a50e702011-10-18 21:21:00 +02007587 const DWORD flags = encode_code_page_flags(code_page, NULL);
7588 char *out;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007589 /* Create a substring so that we can get the UTF-16 representation
7590 of just the slice under consideration. */
7591 PyObject *substring;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007592
Martin v. Löwis3d325192011-11-04 18:23:06 +01007593 assert(len > 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007594
Victor Stinner3a50e702011-10-18 21:21:00 +02007595 if (code_page != CP_UTF8 && code_page != CP_UTF7)
Victor Stinner554f3f02010-06-16 23:33:54 +00007596 pusedDefaultChar = &usedDefaultChar;
Victor Stinner3a50e702011-10-18 21:21:00 +02007597 else
Victor Stinner554f3f02010-06-16 23:33:54 +00007598 pusedDefaultChar = NULL;
Victor Stinner554f3f02010-06-16 23:33:54 +00007599
Victor Stinner2fc507f2011-11-04 20:06:39 +01007600 substring = PyUnicode_Substring(unicode, offset, offset+len);
7601 if (substring == NULL)
7602 return -1;
7603 p = PyUnicode_AsUnicodeAndSize(substring, &size);
7604 if (p == NULL) {
7605 Py_DECREF(substring);
7606 return -1;
7607 }
Victor Stinner9f067f42013-06-05 00:21:31 +02007608 assert(size <= INT_MAX);
Martin v. Löwis3d325192011-11-04 18:23:06 +01007609
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007610 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007611 outsize = WideCharToMultiByte(code_page, flags,
Victor Stinner9f067f42013-06-05 00:21:31 +02007612 p, (int)size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007613 NULL, 0,
7614 NULL, pusedDefaultChar);
7615 if (outsize <= 0)
7616 goto error;
7617 /* If we used a default char, then we failed! */
Victor Stinner2fc507f2011-11-04 20:06:39 +01007618 if (pusedDefaultChar && *pusedDefaultChar) {
7619 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007620 return -2;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007621 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007622
Victor Stinner3a50e702011-10-18 21:21:00 +02007623 if (*outbytes == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007624 /* Create string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007625 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007626 if (*outbytes == NULL) {
7627 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007628 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007629 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007630 out = PyBytes_AS_STRING(*outbytes);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007631 }
7632 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007633 /* Extend string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007634 const Py_ssize_t n = PyBytes_Size(*outbytes);
7635 if (outsize > PY_SSIZE_T_MAX - n) {
7636 PyErr_NoMemory();
Victor Stinner2fc507f2011-11-04 20:06:39 +01007637 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007638 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007639 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007640 if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
7641 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007642 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007643 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007644 out = PyBytes_AS_STRING(*outbytes) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007645 }
7646
7647 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007648 outsize = WideCharToMultiByte(code_page, flags,
Victor Stinner9f067f42013-06-05 00:21:31 +02007649 p, (int)size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007650 out, outsize,
7651 NULL, pusedDefaultChar);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007652 Py_CLEAR(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007653 if (outsize <= 0)
7654 goto error;
7655 if (pusedDefaultChar && *pusedDefaultChar)
7656 return -2;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007657 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00007658
Victor Stinner3a50e702011-10-18 21:21:00 +02007659error:
Victor Stinner2fc507f2011-11-04 20:06:39 +01007660 Py_XDECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007661 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7662 return -2;
7663 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007664 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007665}
7666
Victor Stinner3a50e702011-10-18 21:21:00 +02007667/*
Serhiy Storchakad65c9492015-11-02 14:10:23 +02007668 * Encode a Unicode string to a Windows code page into a byte string using an
Victor Stinner3a50e702011-10-18 21:21:00 +02007669 * error handler.
7670 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007671 * Returns consumed characters if succeed, or raise an OSError and returns
Victor Stinner3a50e702011-10-18 21:21:00 +02007672 * -1 on other error.
7673 */
7674static int
7675encode_code_page_errors(UINT code_page, PyObject **outbytes,
Victor Stinner7581cef2011-11-03 22:32:33 +01007676 PyObject *unicode, Py_ssize_t unicode_offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007677 Py_ssize_t insize, const char* errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007678{
Victor Stinner3a50e702011-10-18 21:21:00 +02007679 const DWORD flags = encode_code_page_flags(code_page, errors);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007680 Py_ssize_t pos = unicode_offset;
7681 Py_ssize_t endin = unicode_offset + insize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007682 /* Ideally, we should get reason from FormatMessage. This is the Windows
7683 2000 English version of the message. */
7684 const char *reason = "invalid character";
7685 /* 4=maximum length of a UTF-8 sequence */
7686 char buffer[4];
7687 BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
7688 Py_ssize_t outsize;
7689 char *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007690 PyObject *errorHandler = NULL;
7691 PyObject *exc = NULL;
7692 PyObject *encoding_obj = NULL;
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007693 const char *encoding;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007694 Py_ssize_t newpos, newoutsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007695 PyObject *rep;
7696 int ret = -1;
7697
7698 assert(insize > 0);
7699
7700 encoding = code_page_name(code_page, &encoding_obj);
7701 if (encoding == NULL)
7702 return -1;
7703
7704 if (errors == NULL || strcmp(errors, "strict") == 0) {
7705 /* The last error was ERROR_NO_UNICODE_TRANSLATION,
7706 then we raise a UnicodeEncodeError. */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007707 make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
Victor Stinner3a50e702011-10-18 21:21:00 +02007708 if (exc != NULL) {
7709 PyCodec_StrictErrors(exc);
7710 Py_DECREF(exc);
7711 }
7712 Py_XDECREF(encoding_obj);
7713 return -1;
7714 }
7715
7716 if (code_page != CP_UTF8 && code_page != CP_UTF7)
7717 pusedDefaultChar = &usedDefaultChar;
7718 else
7719 pusedDefaultChar = NULL;
7720
7721 if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
7722 PyErr_NoMemory();
7723 goto error;
7724 }
7725 outsize = insize * Py_ARRAY_LENGTH(buffer);
7726
7727 if (*outbytes == NULL) {
7728 /* Create string object */
7729 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7730 if (*outbytes == NULL)
7731 goto error;
7732 out = PyBytes_AS_STRING(*outbytes);
7733 }
7734 else {
7735 /* Extend string object */
7736 Py_ssize_t n = PyBytes_Size(*outbytes);
7737 if (n > PY_SSIZE_T_MAX - outsize) {
7738 PyErr_NoMemory();
7739 goto error;
7740 }
7741 if (_PyBytes_Resize(outbytes, n + outsize) < 0)
7742 goto error;
7743 out = PyBytes_AS_STRING(*outbytes) + n;
7744 }
7745
7746 /* Encode the string character per character */
Martin v. Löwis3d325192011-11-04 18:23:06 +01007747 while (pos < endin)
Victor Stinner3a50e702011-10-18 21:21:00 +02007748 {
Victor Stinner2fc507f2011-11-04 20:06:39 +01007749 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
7750 wchar_t chars[2];
7751 int charsize;
7752 if (ch < 0x10000) {
7753 chars[0] = (wchar_t)ch;
7754 charsize = 1;
7755 }
7756 else {
Victor Stinner76df43d2012-10-30 01:42:39 +01007757 chars[0] = Py_UNICODE_HIGH_SURROGATE(ch);
7758 chars[1] = Py_UNICODE_LOW_SURROGATE(ch);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007759 charsize = 2;
7760 }
7761
Victor Stinner3a50e702011-10-18 21:21:00 +02007762 outsize = WideCharToMultiByte(code_page, flags,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007763 chars, charsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02007764 buffer, Py_ARRAY_LENGTH(buffer),
7765 NULL, pusedDefaultChar);
7766 if (outsize > 0) {
7767 if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
7768 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007769 pos++;
Victor Stinner3a50e702011-10-18 21:21:00 +02007770 memcpy(out, buffer, outsize);
7771 out += outsize;
7772 continue;
7773 }
7774 }
7775 else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
7776 PyErr_SetFromWindowsErr(0);
7777 goto error;
7778 }
7779
Victor Stinner3a50e702011-10-18 21:21:00 +02007780 rep = unicode_encode_call_errorhandler(
7781 errors, &errorHandler, encoding, reason,
Victor Stinner7581cef2011-11-03 22:32:33 +01007782 unicode, &exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007783 pos, pos + 1, &newpos);
Victor Stinner3a50e702011-10-18 21:21:00 +02007784 if (rep == NULL)
7785 goto error;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007786 pos = newpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007787
7788 if (PyBytes_Check(rep)) {
7789 outsize = PyBytes_GET_SIZE(rep);
7790 if (outsize != 1) {
7791 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7792 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7793 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7794 Py_DECREF(rep);
7795 goto error;
7796 }
7797 out = PyBytes_AS_STRING(*outbytes) + offset;
7798 }
7799 memcpy(out, PyBytes_AS_STRING(rep), outsize);
7800 out += outsize;
7801 }
7802 else {
7803 Py_ssize_t i;
7804 enum PyUnicode_Kind kind;
7805 void *data;
7806
Benjamin Petersonbac79492012-01-14 13:34:47 -05007807 if (PyUnicode_READY(rep) == -1) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007808 Py_DECREF(rep);
7809 goto error;
7810 }
7811
7812 outsize = PyUnicode_GET_LENGTH(rep);
7813 if (outsize != 1) {
7814 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7815 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7816 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7817 Py_DECREF(rep);
7818 goto error;
7819 }
7820 out = PyBytes_AS_STRING(*outbytes) + offset;
7821 }
7822 kind = PyUnicode_KIND(rep);
7823 data = PyUnicode_DATA(rep);
7824 for (i=0; i < outsize; i++) {
7825 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7826 if (ch > 127) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007827 raise_encode_exception(&exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007828 encoding, unicode,
7829 pos, pos + 1,
Victor Stinner3a50e702011-10-18 21:21:00 +02007830 "unable to encode error handler result to ASCII");
7831 Py_DECREF(rep);
7832 goto error;
7833 }
7834 *out = (unsigned char)ch;
7835 out++;
7836 }
7837 }
7838 Py_DECREF(rep);
7839 }
7840 /* write a NUL byte */
7841 *out = 0;
7842 outsize = out - PyBytes_AS_STRING(*outbytes);
7843 assert(outsize <= PyBytes_GET_SIZE(*outbytes));
7844 if (_PyBytes_Resize(outbytes, outsize) < 0)
7845 goto error;
7846 ret = 0;
7847
7848error:
7849 Py_XDECREF(encoding_obj);
7850 Py_XDECREF(errorHandler);
7851 Py_XDECREF(exc);
7852 return ret;
7853}
7854
Victor Stinner3a50e702011-10-18 21:21:00 +02007855static PyObject *
7856encode_code_page(int code_page,
Victor Stinner7581cef2011-11-03 22:32:33 +01007857 PyObject *unicode,
Victor Stinner3a50e702011-10-18 21:21:00 +02007858 const char *errors)
7859{
Martin v. Löwis3d325192011-11-04 18:23:06 +01007860 Py_ssize_t len;
Victor Stinner3a50e702011-10-18 21:21:00 +02007861 PyObject *outbytes = NULL;
Victor Stinner7581cef2011-11-03 22:32:33 +01007862 Py_ssize_t offset;
Victor Stinner76a31a62011-11-04 00:05:13 +01007863 int chunk_len, ret, done;
Victor Stinner7581cef2011-11-03 22:32:33 +01007864
Victor Stinner29dacf22015-01-26 16:41:32 +01007865 if (!PyUnicode_Check(unicode)) {
7866 PyErr_BadArgument();
7867 return NULL;
7868 }
7869
Benjamin Petersonbac79492012-01-14 13:34:47 -05007870 if (PyUnicode_READY(unicode) == -1)
Victor Stinner2fc507f2011-11-04 20:06:39 +01007871 return NULL;
7872 len = PyUnicode_GET_LENGTH(unicode);
Guido van Rossum03e29f12000-05-04 15:52:20 +00007873
Victor Stinner3a50e702011-10-18 21:21:00 +02007874 if (code_page < 0) {
7875 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7876 return NULL;
7877 }
7878
Martin v. Löwis3d325192011-11-04 18:23:06 +01007879 if (len == 0)
Victor Stinner76a31a62011-11-04 00:05:13 +01007880 return PyBytes_FromStringAndSize(NULL, 0);
7881
Victor Stinner7581cef2011-11-03 22:32:33 +01007882 offset = 0;
7883 do
7884 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007885#ifdef NEED_RETRY
Steve Dower7ebdda02019-08-21 16:22:33 -07007886 if (len > DECODING_CHUNK_SIZE) {
7887 chunk_len = DECODING_CHUNK_SIZE;
Victor Stinner76a31a62011-11-04 00:05:13 +01007888 done = 0;
7889 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007890 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007891#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007892 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007893 chunk_len = (int)len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007894 done = 1;
7895 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007896
Victor Stinner76a31a62011-11-04 00:05:13 +01007897 ret = encode_code_page_strict(code_page, &outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007898 unicode, offset, chunk_len,
Victor Stinner76a31a62011-11-04 00:05:13 +01007899 errors);
7900 if (ret == -2)
7901 ret = encode_code_page_errors(code_page, &outbytes,
7902 unicode, offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007903 chunk_len, errors);
Victor Stinner7581cef2011-11-03 22:32:33 +01007904 if (ret < 0) {
7905 Py_XDECREF(outbytes);
7906 return NULL;
7907 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007908
Victor Stinner7581cef2011-11-03 22:32:33 +01007909 offset += chunk_len;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007910 len -= chunk_len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007911 } while (!done);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007912
Victor Stinner3a50e702011-10-18 21:21:00 +02007913 return outbytes;
7914}
7915
7916PyObject *
7917PyUnicode_EncodeMBCS(const Py_UNICODE *p,
7918 Py_ssize_t size,
7919 const char *errors)
7920{
Victor Stinner7581cef2011-11-03 22:32:33 +01007921 PyObject *unicode, *res;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02007922 unicode = PyUnicode_FromWideChar(p, size);
Victor Stinner7581cef2011-11-03 22:32:33 +01007923 if (unicode == NULL)
7924 return NULL;
7925 res = encode_code_page(CP_ACP, unicode, errors);
7926 Py_DECREF(unicode);
7927 return res;
Victor Stinner3a50e702011-10-18 21:21:00 +02007928}
7929
7930PyObject *
7931PyUnicode_EncodeCodePage(int code_page,
7932 PyObject *unicode,
7933 const char *errors)
7934{
Victor Stinner7581cef2011-11-03 22:32:33 +01007935 return encode_code_page(code_page, unicode, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007936}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007937
Alexander Belopolsky40018472011-02-26 01:02:56 +00007938PyObject *
7939PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007940{
Victor Stinner7581cef2011-11-03 22:32:33 +01007941 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007942}
7943
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007944#undef NEED_RETRY
7945
Steve Dowercc16be82016-09-08 10:35:16 -07007946#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007947
Guido van Rossumd57fd912000-03-10 22:53:23 +00007948/* --- Character Mapping Codec -------------------------------------------- */
7949
Victor Stinnerfb161b12013-04-18 01:44:27 +02007950static int
7951charmap_decode_string(const char *s,
7952 Py_ssize_t size,
7953 PyObject *mapping,
7954 const char *errors,
7955 _PyUnicodeWriter *writer)
7956{
7957 const char *starts = s;
7958 const char *e;
7959 Py_ssize_t startinpos, endinpos;
7960 PyObject *errorHandler = NULL, *exc = NULL;
7961 Py_ssize_t maplen;
7962 enum PyUnicode_Kind mapkind;
7963 void *mapdata;
7964 Py_UCS4 x;
7965 unsigned char ch;
7966
7967 if (PyUnicode_READY(mapping) == -1)
7968 return -1;
7969
7970 maplen = PyUnicode_GET_LENGTH(mapping);
7971 mapdata = PyUnicode_DATA(mapping);
7972 mapkind = PyUnicode_KIND(mapping);
7973
7974 e = s + size;
7975
7976 if (mapkind == PyUnicode_1BYTE_KIND && maplen >= 256) {
7977 /* fast-path for cp037, cp500 and iso8859_1 encodings. iso8859_1
7978 * is disabled in encoding aliases, latin1 is preferred because
7979 * its implementation is faster. */
7980 Py_UCS1 *mapdata_ucs1 = (Py_UCS1 *)mapdata;
7981 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
7982 Py_UCS4 maxchar = writer->maxchar;
7983
7984 assert (writer->kind == PyUnicode_1BYTE_KIND);
7985 while (s < e) {
7986 ch = *s;
7987 x = mapdata_ucs1[ch];
7988 if (x > maxchar) {
7989 if (_PyUnicodeWriter_Prepare(writer, 1, 0xff) == -1)
7990 goto onError;
7991 maxchar = writer->maxchar;
7992 outdata = (Py_UCS1 *)writer->data;
7993 }
7994 outdata[writer->pos] = x;
7995 writer->pos++;
7996 ++s;
7997 }
7998 return 0;
7999 }
8000
8001 while (s < e) {
8002 if (mapkind == PyUnicode_2BYTE_KIND && maplen >= 256) {
8003 enum PyUnicode_Kind outkind = writer->kind;
8004 Py_UCS2 *mapdata_ucs2 = (Py_UCS2 *)mapdata;
8005 if (outkind == PyUnicode_1BYTE_KIND) {
8006 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
8007 Py_UCS4 maxchar = writer->maxchar;
8008 while (s < e) {
8009 ch = *s;
8010 x = mapdata_ucs2[ch];
8011 if (x > maxchar)
8012 goto Error;
8013 outdata[writer->pos] = x;
8014 writer->pos++;
8015 ++s;
8016 }
8017 break;
8018 }
8019 else if (outkind == PyUnicode_2BYTE_KIND) {
8020 Py_UCS2 *outdata = (Py_UCS2 *)writer->data;
8021 while (s < e) {
8022 ch = *s;
8023 x = mapdata_ucs2[ch];
8024 if (x == 0xFFFE)
8025 goto Error;
8026 outdata[writer->pos] = x;
8027 writer->pos++;
8028 ++s;
8029 }
8030 break;
8031 }
8032 }
8033 ch = *s;
8034
8035 if (ch < maplen)
8036 x = PyUnicode_READ(mapkind, mapdata, ch);
8037 else
8038 x = 0xfffe; /* invalid value */
8039Error:
8040 if (x == 0xfffe)
8041 {
8042 /* undefined mapping */
8043 startinpos = s-starts;
8044 endinpos = startinpos+1;
8045 if (unicode_decode_call_errorhandler_writer(
8046 errors, &errorHandler,
8047 "charmap", "character maps to <undefined>",
8048 &starts, &e, &startinpos, &endinpos, &exc, &s,
8049 writer)) {
8050 goto onError;
8051 }
8052 continue;
8053 }
8054
8055 if (_PyUnicodeWriter_WriteCharInline(writer, x) < 0)
8056 goto onError;
8057 ++s;
8058 }
8059 Py_XDECREF(errorHandler);
8060 Py_XDECREF(exc);
8061 return 0;
8062
8063onError:
8064 Py_XDECREF(errorHandler);
8065 Py_XDECREF(exc);
8066 return -1;
8067}
8068
8069static int
8070charmap_decode_mapping(const char *s,
8071 Py_ssize_t size,
8072 PyObject *mapping,
8073 const char *errors,
8074 _PyUnicodeWriter *writer)
8075{
8076 const char *starts = s;
8077 const char *e;
8078 Py_ssize_t startinpos, endinpos;
8079 PyObject *errorHandler = NULL, *exc = NULL;
8080 unsigned char ch;
Victor Stinnerf4f24242013-05-07 01:01:31 +02008081 PyObject *key, *item = NULL;
Victor Stinnerfb161b12013-04-18 01:44:27 +02008082
8083 e = s + size;
8084
8085 while (s < e) {
8086 ch = *s;
8087
8088 /* Get mapping (char ordinal -> integer, Unicode char or None) */
8089 key = PyLong_FromLong((long)ch);
8090 if (key == NULL)
8091 goto onError;
8092
8093 item = PyObject_GetItem(mapping, key);
8094 Py_DECREF(key);
8095 if (item == NULL) {
8096 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8097 /* No mapping found means: mapping is undefined. */
8098 PyErr_Clear();
8099 goto Undefined;
8100 } else
8101 goto onError;
8102 }
8103
8104 /* Apply mapping */
8105 if (item == Py_None)
8106 goto Undefined;
8107 if (PyLong_Check(item)) {
8108 long value = PyLong_AS_LONG(item);
8109 if (value == 0xFFFE)
8110 goto Undefined;
8111 if (value < 0 || value > MAX_UNICODE) {
8112 PyErr_Format(PyExc_TypeError,
8113 "character mapping must be in range(0x%lx)",
8114 (unsigned long)MAX_UNICODE + 1);
8115 goto onError;
8116 }
8117
8118 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
8119 goto onError;
8120 }
8121 else if (PyUnicode_Check(item)) {
8122 if (PyUnicode_READY(item) == -1)
8123 goto onError;
8124 if (PyUnicode_GET_LENGTH(item) == 1) {
8125 Py_UCS4 value = PyUnicode_READ_CHAR(item, 0);
8126 if (value == 0xFFFE)
8127 goto Undefined;
8128 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
8129 goto onError;
8130 }
8131 else {
8132 writer->overallocate = 1;
8133 if (_PyUnicodeWriter_WriteStr(writer, item) == -1)
8134 goto onError;
8135 }
8136 }
8137 else {
8138 /* wrong return value */
8139 PyErr_SetString(PyExc_TypeError,
8140 "character mapping must return integer, None or str");
8141 goto onError;
8142 }
8143 Py_CLEAR(item);
8144 ++s;
8145 continue;
8146
8147Undefined:
8148 /* undefined mapping */
8149 Py_CLEAR(item);
8150 startinpos = s-starts;
8151 endinpos = startinpos+1;
8152 if (unicode_decode_call_errorhandler_writer(
8153 errors, &errorHandler,
8154 "charmap", "character maps to <undefined>",
8155 &starts, &e, &startinpos, &endinpos, &exc, &s,
8156 writer)) {
8157 goto onError;
8158 }
8159 }
8160 Py_XDECREF(errorHandler);
8161 Py_XDECREF(exc);
8162 return 0;
8163
8164onError:
8165 Py_XDECREF(item);
8166 Py_XDECREF(errorHandler);
8167 Py_XDECREF(exc);
8168 return -1;
8169}
8170
Alexander Belopolsky40018472011-02-26 01:02:56 +00008171PyObject *
8172PyUnicode_DecodeCharmap(const char *s,
8173 Py_ssize_t size,
8174 PyObject *mapping,
8175 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008176{
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008177 _PyUnicodeWriter writer;
Tim Petersced69f82003-09-16 20:30:58 +00008178
Guido van Rossumd57fd912000-03-10 22:53:23 +00008179 /* Default to Latin-1 */
8180 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008181 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008182
Guido van Rossumd57fd912000-03-10 22:53:23 +00008183 if (size == 0)
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02008184 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner8f674cc2013-04-17 23:02:17 +02008185 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02008186 writer.min_length = size;
8187 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008188 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008189
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00008190 if (PyUnicode_CheckExact(mapping)) {
Victor Stinnerfb161b12013-04-18 01:44:27 +02008191 if (charmap_decode_string(s, size, mapping, errors, &writer) < 0)
8192 goto onError;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00008193 }
8194 else {
Victor Stinnerfb161b12013-04-18 01:44:27 +02008195 if (charmap_decode_mapping(s, size, mapping, errors, &writer) < 0)
8196 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008197 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008198 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00008199
Benjamin Peterson29060642009-01-31 22:14:21 +00008200 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008201 _PyUnicodeWriter_Dealloc(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008202 return NULL;
8203}
8204
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008205/* Charmap encoding: the lookup table */
8206
Alexander Belopolsky40018472011-02-26 01:02:56 +00008207struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00008208 PyObject_HEAD
8209 unsigned char level1[32];
8210 int count2, count3;
8211 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008212};
8213
8214static PyObject*
8215encoding_map_size(PyObject *obj, PyObject* args)
8216{
8217 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008218 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00008219 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008220}
8221
8222static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008223 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00008224 PyDoc_STR("Return the size (in bytes) of this object") },
8225 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008226};
8227
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008228static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008229 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008230 "EncodingMap", /*tp_name*/
8231 sizeof(struct encoding_map), /*tp_basicsize*/
8232 0, /*tp_itemsize*/
8233 /* methods */
Inada Naoki7d408692019-05-29 17:23:27 +09008234 0, /*tp_dealloc*/
Jeroen Demeyer530f5062019-05-31 04:13:39 +02008235 0, /*tp_vectorcall_offset*/
Benjamin Peterson29060642009-01-31 22:14:21 +00008236 0, /*tp_getattr*/
8237 0, /*tp_setattr*/
Jeroen Demeyer530f5062019-05-31 04:13:39 +02008238 0, /*tp_as_async*/
Benjamin Peterson29060642009-01-31 22:14:21 +00008239 0, /*tp_repr*/
8240 0, /*tp_as_number*/
8241 0, /*tp_as_sequence*/
8242 0, /*tp_as_mapping*/
8243 0, /*tp_hash*/
8244 0, /*tp_call*/
8245 0, /*tp_str*/
8246 0, /*tp_getattro*/
8247 0, /*tp_setattro*/
8248 0, /*tp_as_buffer*/
8249 Py_TPFLAGS_DEFAULT, /*tp_flags*/
8250 0, /*tp_doc*/
8251 0, /*tp_traverse*/
8252 0, /*tp_clear*/
8253 0, /*tp_richcompare*/
8254 0, /*tp_weaklistoffset*/
8255 0, /*tp_iter*/
8256 0, /*tp_iternext*/
8257 encoding_map_methods, /*tp_methods*/
8258 0, /*tp_members*/
8259 0, /*tp_getset*/
8260 0, /*tp_base*/
8261 0, /*tp_dict*/
8262 0, /*tp_descr_get*/
8263 0, /*tp_descr_set*/
8264 0, /*tp_dictoffset*/
8265 0, /*tp_init*/
8266 0, /*tp_alloc*/
8267 0, /*tp_new*/
8268 0, /*tp_free*/
8269 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008270};
8271
8272PyObject*
8273PyUnicode_BuildEncodingMap(PyObject* string)
8274{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008275 PyObject *result;
8276 struct encoding_map *mresult;
8277 int i;
8278 int need_dict = 0;
8279 unsigned char level1[32];
8280 unsigned char level2[512];
8281 unsigned char *mlevel1, *mlevel2, *mlevel3;
8282 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008283 int kind;
8284 void *data;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008285 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008286 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008287
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008288 if (!PyUnicode_Check(string) || !PyUnicode_GET_LENGTH(string)) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008289 PyErr_BadArgument();
8290 return NULL;
8291 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008292 kind = PyUnicode_KIND(string);
8293 data = PyUnicode_DATA(string);
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008294 length = PyUnicode_GET_LENGTH(string);
8295 length = Py_MIN(length, 256);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008296 memset(level1, 0xFF, sizeof level1);
8297 memset(level2, 0xFF, sizeof level2);
8298
8299 /* If there isn't a one-to-one mapping of NULL to \0,
8300 or if there are non-BMP characters, we need to use
8301 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008302 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008303 need_dict = 1;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008304 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008305 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008306 ch = PyUnicode_READ(kind, data, i);
8307 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008308 need_dict = 1;
8309 break;
8310 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008311 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008312 /* unmapped character */
8313 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008314 l1 = ch >> 11;
8315 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008316 if (level1[l1] == 0xFF)
8317 level1[l1] = count2++;
8318 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00008319 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008320 }
8321
8322 if (count2 >= 0xFF || count3 >= 0xFF)
8323 need_dict = 1;
8324
8325 if (need_dict) {
8326 PyObject *result = PyDict_New();
8327 PyObject *key, *value;
8328 if (!result)
8329 return NULL;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008330 for (i = 0; i < length; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008331 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00008332 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008333 if (!key || !value)
8334 goto failed1;
8335 if (PyDict_SetItem(result, key, value) == -1)
8336 goto failed1;
8337 Py_DECREF(key);
8338 Py_DECREF(value);
8339 }
8340 return result;
8341 failed1:
8342 Py_XDECREF(key);
8343 Py_XDECREF(value);
8344 Py_DECREF(result);
8345 return NULL;
8346 }
8347
8348 /* Create a three-level trie */
8349 result = PyObject_MALLOC(sizeof(struct encoding_map) +
8350 16*count2 + 128*count3 - 1);
8351 if (!result)
8352 return PyErr_NoMemory();
8353 PyObject_Init(result, &EncodingMapType);
8354 mresult = (struct encoding_map*)result;
8355 mresult->count2 = count2;
8356 mresult->count3 = count3;
8357 mlevel1 = mresult->level1;
8358 mlevel2 = mresult->level23;
8359 mlevel3 = mresult->level23 + 16*count2;
8360 memcpy(mlevel1, level1, 32);
8361 memset(mlevel2, 0xFF, 16*count2);
8362 memset(mlevel3, 0, 128*count3);
8363 count3 = 0;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008364 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008365 int o1, o2, o3, i2, i3;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008366 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8367 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008368 /* unmapped character */
8369 continue;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008370 o1 = ch>>11;
8371 o2 = (ch>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008372 i2 = 16*mlevel1[o1] + o2;
8373 if (mlevel2[i2] == 0xFF)
8374 mlevel2[i2] = count3++;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008375 o3 = ch & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008376 i3 = 128*mlevel2[i2] + o3;
8377 mlevel3[i3] = i;
8378 }
8379 return result;
8380}
8381
8382static int
Victor Stinner22168992011-11-20 17:09:18 +01008383encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008384{
8385 struct encoding_map *map = (struct encoding_map*)mapping;
8386 int l1 = c>>11;
8387 int l2 = (c>>7) & 0xF;
8388 int l3 = c & 0x7F;
8389 int i;
8390
Victor Stinner22168992011-11-20 17:09:18 +01008391 if (c > 0xFFFF)
Benjamin Peterson29060642009-01-31 22:14:21 +00008392 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008393 if (c == 0)
8394 return 0;
8395 /* level 1*/
8396 i = map->level1[l1];
8397 if (i == 0xFF) {
8398 return -1;
8399 }
8400 /* level 2*/
8401 i = map->level23[16*i+l2];
8402 if (i == 0xFF) {
8403 return -1;
8404 }
8405 /* level 3 */
8406 i = map->level23[16*map->count2 + 128*i + l3];
8407 if (i == 0) {
8408 return -1;
8409 }
8410 return i;
8411}
8412
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008413/* Lookup the character ch in the mapping. If the character
8414 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00008415 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008416static PyObject *
Victor Stinner22168992011-11-20 17:09:18 +01008417charmapencode_lookup(Py_UCS4 c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008418{
Christian Heimes217cfd12007-12-02 14:31:20 +00008419 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008420 PyObject *x;
8421
8422 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008423 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008424 x = PyObject_GetItem(mapping, w);
8425 Py_DECREF(w);
8426 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008427 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8428 /* No mapping found means: mapping is undefined. */
8429 PyErr_Clear();
Serhiy Storchaka228b12e2017-01-23 09:47:21 +02008430 Py_RETURN_NONE;
Benjamin Peterson29060642009-01-31 22:14:21 +00008431 } else
8432 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008433 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00008434 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008435 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00008436 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008437 long value = PyLong_AS_LONG(x);
8438 if (value < 0 || value > 255) {
8439 PyErr_SetString(PyExc_TypeError,
8440 "character mapping must be in range(256)");
8441 Py_DECREF(x);
8442 return NULL;
8443 }
8444 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008445 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008446 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008447 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008448 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008449 /* wrong return value */
8450 PyErr_Format(PyExc_TypeError,
8451 "character mapping must return integer, bytes or None, not %.400s",
Victor Stinner58ac7002020-02-07 03:04:21 +01008452 Py_TYPE(x)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +00008453 Py_DECREF(x);
8454 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008455 }
8456}
8457
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008458static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00008459charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008460{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008461 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
8462 /* exponentially overallocate to minimize reallocations */
8463 if (requiredsize < 2*outsize)
8464 requiredsize = 2*outsize;
8465 if (_PyBytes_Resize(outobj, requiredsize))
8466 return -1;
8467 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008468}
8469
/* Outcome of trying to encode a single character through a charmap. */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was written to the output */
    enc_FAILED,     /* the character is unencodable (no mapping) */
    enc_EXCEPTION   /* an error occurred */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008473/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00008474 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008475 space is available. Return a new reference to the object that
8476 was put in the output buffer, or Py_None, if the mapping was undefined
8477 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00008478 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008479static charmapencode_result
Victor Stinner22168992011-11-20 17:09:18 +01008480charmapencode_output(Py_UCS4 c, PyObject *mapping,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008481 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008482{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008483 PyObject *rep;
8484 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00008485 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008486
Christian Heimes90aa7642007-12-19 02:45:37 +00008487 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008488 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008489 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008490 if (res == -1)
8491 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00008492 if (outsize<requiredsize)
8493 if (charmapencode_resize(outobj, outpos, requiredsize))
8494 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00008495 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008496 outstart[(*outpos)++] = (char)res;
8497 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008498 }
8499
8500 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008501 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008502 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008503 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008504 Py_DECREF(rep);
8505 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008506 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008507 if (PyLong_Check(rep)) {
8508 Py_ssize_t requiredsize = *outpos+1;
8509 if (outsize<requiredsize)
8510 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8511 Py_DECREF(rep);
8512 return enc_EXCEPTION;
8513 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008514 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008515 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008516 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008517 else {
8518 const char *repchars = PyBytes_AS_STRING(rep);
8519 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
8520 Py_ssize_t requiredsize = *outpos+repsize;
8521 if (outsize<requiredsize)
8522 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8523 Py_DECREF(rep);
8524 return enc_EXCEPTION;
8525 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008526 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008527 memcpy(outstart + *outpos, repchars, repsize);
8528 *outpos += repsize;
8529 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008530 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008531 Py_DECREF(rep);
8532 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008533}
8534
8535/* handle an error in PyUnicode_EncodeCharmap
8536 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008537static int
8538charmap_encoding_error(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008539 PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008540 PyObject **exceptionObject,
Victor Stinner50149202015-09-22 00:26:54 +02008541 _Py_error_handler *error_handler, PyObject **error_handler_obj, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00008542 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008543{
8544 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008545 Py_ssize_t size, repsize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008546 Py_ssize_t newpos;
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008547 enum PyUnicode_Kind kind;
8548 void *data;
8549 Py_ssize_t index;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008550 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008551 Py_ssize_t collstartpos = *inpos;
8552 Py_ssize_t collendpos = *inpos+1;
8553 Py_ssize_t collpos;
Serhiy Storchakae2f92de2017-11-11 13:06:26 +02008554 const char *encoding = "charmap";
8555 const char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008556 charmapencode_result x;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008557 Py_UCS4 ch;
Brian Curtin2787ea42011-11-02 15:09:37 -05008558 int val;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008559
Benjamin Petersonbac79492012-01-14 13:34:47 -05008560 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008561 return -1;
8562 size = PyUnicode_GET_LENGTH(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008563 /* find all unencodable characters */
8564 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008565 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00008566 if (Py_TYPE(mapping) == &EncodingMapType) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008567 ch = PyUnicode_READ_CHAR(unicode, collendpos);
Brian Curtin2787ea42011-11-02 15:09:37 -05008568 val = encoding_map_lookup(ch, mapping);
8569 if (val != -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008570 break;
8571 ++collendpos;
8572 continue;
8573 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008574
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008575 ch = PyUnicode_READ_CHAR(unicode, collendpos);
8576 rep = charmapencode_lookup(ch, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008577 if (rep==NULL)
8578 return -1;
8579 else if (rep!=Py_None) {
8580 Py_DECREF(rep);
8581 break;
8582 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008583 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00008584 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008585 }
8586 /* cache callback name lookup
8587 * (if not done yet, i.e. it's the first error) */
Victor Stinner50149202015-09-22 00:26:54 +02008588 if (*error_handler == _Py_ERROR_UNKNOWN)
Victor Stinner3d4226a2018-08-29 22:21:32 +02008589 *error_handler = _Py_GetErrorHandler(errors);
Victor Stinner50149202015-09-22 00:26:54 +02008590
8591 switch (*error_handler) {
8592 case _Py_ERROR_STRICT:
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008593 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008594 return -1;
Victor Stinner50149202015-09-22 00:26:54 +02008595
8596 case _Py_ERROR_REPLACE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008597 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008598 x = charmapencode_output('?', mapping, res, respos);
8599 if (x==enc_EXCEPTION) {
8600 return -1;
8601 }
8602 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008603 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008604 return -1;
8605 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008606 }
8607 /* fall through */
Victor Stinner50149202015-09-22 00:26:54 +02008608 case _Py_ERROR_IGNORE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008609 *inpos = collendpos;
8610 break;
Victor Stinner50149202015-09-22 00:26:54 +02008611
8612 case _Py_ERROR_XMLCHARREFREPLACE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008613 /* generate replacement (temporarily (mis)uses p) */
8614 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008615 char buffer[2+29+1+1];
8616 char *cp;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008617 sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
Benjamin Peterson29060642009-01-31 22:14:21 +00008618 for (cp = buffer; *cp; ++cp) {
8619 x = charmapencode_output(*cp, mapping, res, respos);
8620 if (x==enc_EXCEPTION)
8621 return -1;
8622 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008623 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008624 return -1;
8625 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008626 }
8627 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008628 *inpos = collendpos;
8629 break;
Victor Stinner50149202015-09-22 00:26:54 +02008630
Benjamin Peterson14339b62009-01-31 16:36:08 +00008631 default:
Victor Stinner50149202015-09-22 00:26:54 +02008632 repunicode = unicode_encode_call_errorhandler(errors, error_handler_obj,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008633 encoding, reason, unicode, exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00008634 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008635 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008636 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00008637 if (PyBytes_Check(repunicode)) {
8638 /* Directly copy bytes result to output. */
8639 Py_ssize_t outsize = PyBytes_Size(*res);
8640 Py_ssize_t requiredsize;
8641 repsize = PyBytes_Size(repunicode);
8642 requiredsize = *respos + repsize;
8643 if (requiredsize > outsize)
8644 /* Make room for all additional bytes. */
8645 if (charmapencode_resize(res, respos, requiredsize)) {
8646 Py_DECREF(repunicode);
8647 return -1;
8648 }
8649 memcpy(PyBytes_AsString(*res) + *respos,
8650 PyBytes_AsString(repunicode), repsize);
8651 *respos += repsize;
8652 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008653 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00008654 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008655 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008656 /* generate replacement */
Benjamin Petersonbac79492012-01-14 13:34:47 -05008657 if (PyUnicode_READY(repunicode) == -1) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008658 Py_DECREF(repunicode);
8659 return -1;
8660 }
Victor Stinner9e30aa52011-11-21 02:49:52 +01008661 repsize = PyUnicode_GET_LENGTH(repunicode);
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008662 data = PyUnicode_DATA(repunicode);
8663 kind = PyUnicode_KIND(repunicode);
8664 for (index = 0; index < repsize; index++) {
8665 Py_UCS4 repch = PyUnicode_READ(kind, data, index);
8666 x = charmapencode_output(repch, mapping, res, respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008667 if (x==enc_EXCEPTION) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008668 Py_DECREF(repunicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00008669 return -1;
8670 }
8671 else if (x==enc_FAILED) {
8672 Py_DECREF(repunicode);
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008673 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008674 return -1;
8675 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008676 }
8677 *inpos = newpos;
8678 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008679 }
8680 return 0;
8681}
8682
Alexander Belopolsky40018472011-02-26 01:02:56 +00008683PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008684_PyUnicode_EncodeCharmap(PyObject *unicode,
8685 PyObject *mapping,
8686 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008687{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008688 /* output object */
8689 PyObject *res = NULL;
8690 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008691 Py_ssize_t inpos = 0;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008692 Py_ssize_t size;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008693 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008694 Py_ssize_t respos = 0;
Victor Stinner50149202015-09-22 00:26:54 +02008695 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008696 PyObject *exc = NULL;
Victor Stinner50149202015-09-22 00:26:54 +02008697 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Victor Stinner69ed0f42013-04-09 21:48:24 +02008698 void *data;
8699 int kind;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008700
Benjamin Petersonbac79492012-01-14 13:34:47 -05008701 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008702 return NULL;
8703 size = PyUnicode_GET_LENGTH(unicode);
Victor Stinner69ed0f42013-04-09 21:48:24 +02008704 data = PyUnicode_DATA(unicode);
8705 kind = PyUnicode_KIND(unicode);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008706
Guido van Rossumd57fd912000-03-10 22:53:23 +00008707 /* Default to Latin-1 */
8708 if (mapping == NULL)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008709 return unicode_encode_ucs1(unicode, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008710
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008711 /* allocate enough for a simple encoding without
8712 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00008713 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008714 if (res == NULL)
8715 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00008716 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008717 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008718
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008719 while (inpos<size) {
Victor Stinner69ed0f42013-04-09 21:48:24 +02008720 Py_UCS4 ch = PyUnicode_READ(kind, data, inpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008721 /* try to encode it */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008722 charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008723 if (x==enc_EXCEPTION) /* error */
8724 goto onError;
8725 if (x==enc_FAILED) { /* unencodable character */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008726 if (charmap_encoding_error(unicode, &inpos, mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00008727 &exc,
Victor Stinner50149202015-09-22 00:26:54 +02008728 &error_handler, &error_handler_obj, errors,
Benjamin Peterson29060642009-01-31 22:14:21 +00008729 &res, &respos)) {
8730 goto onError;
8731 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008732 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008733 else
8734 /* done with this character => adjust input position */
8735 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008736 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008737
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008738 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00008739 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00008740 if (_PyBytes_Resize(&res, respos) < 0)
8741 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00008742
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008743 Py_XDECREF(exc);
Victor Stinner50149202015-09-22 00:26:54 +02008744 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008745 return res;
8746
Benjamin Peterson29060642009-01-31 22:14:21 +00008747 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008748 Py_XDECREF(res);
8749 Py_XDECREF(exc);
Victor Stinner50149202015-09-22 00:26:54 +02008750 Py_XDECREF(error_handler_obj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008751 return NULL;
8752}
8753
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008754/* Deprecated */
8755PyObject *
8756PyUnicode_EncodeCharmap(const Py_UNICODE *p,
8757 Py_ssize_t size,
8758 PyObject *mapping,
8759 const char *errors)
8760{
8761 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02008762 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008763 if (unicode == NULL)
8764 return NULL;
8765 result = _PyUnicode_EncodeCharmap(unicode, mapping, errors);
8766 Py_DECREF(unicode);
Victor Stinnerfc026c92011-11-04 00:24:51 +01008767 return result;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008768}
8769
Alexander Belopolsky40018472011-02-26 01:02:56 +00008770PyObject *
8771PyUnicode_AsCharmapString(PyObject *unicode,
8772 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008773{
8774 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008775 PyErr_BadArgument();
8776 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008777 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008778 return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008779}
8780
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008781/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008782static void
8783make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008784 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008785 Py_ssize_t startpos, Py_ssize_t endpos,
8786 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008787{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008788 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008789 *exceptionObject = _PyUnicodeTranslateError_Create(
8790 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008791 }
8792 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008793 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
8794 goto onError;
8795 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
8796 goto onError;
8797 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
8798 goto onError;
8799 return;
8800 onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02008801 Py_CLEAR(*exceptionObject);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008802 }
8803}
8804
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008805/* error handling callback helper:
8806 build arguments, call the callback and check the arguments,
8807 put the result into newpos and return the replacement string, which
8808 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008809static PyObject *
8810unicode_translate_call_errorhandler(const char *errors,
8811 PyObject **errorHandler,
8812 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008813 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008814 Py_ssize_t startpos, Py_ssize_t endpos,
8815 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008816{
Serhiy Storchakaf8d7d412016-10-23 15:12:25 +03008817 static const char *argparse = "Un;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008818
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008819 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008820 PyObject *restuple;
8821 PyObject *resunicode;
8822
8823 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008824 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008825 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008826 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008827 }
8828
8829 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008830 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008831 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008832 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008833
Petr Viktorinffd97532020-02-11 17:46:57 +01008834 restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008835 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008836 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008837 if (!PyTuple_Check(restuple)) {
Serhiy Storchakaf8d7d412016-10-23 15:12:25 +03008838 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00008839 Py_DECREF(restuple);
8840 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008841 }
Serhiy Storchakaf8d7d412016-10-23 15:12:25 +03008842 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00008843 &resunicode, &i_newpos)) {
8844 Py_DECREF(restuple);
8845 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008846 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00008847 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008848 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008849 else
8850 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008851 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnera33bce02014-07-04 22:47:46 +02008852 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008853 Py_DECREF(restuple);
8854 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00008855 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008856 Py_INCREF(resunicode);
8857 Py_DECREF(restuple);
8858 return resunicode;
8859}
8860
8861/* Lookup the character ch in the mapping and put the result in result,
8862 which must be decrefed by the caller.
8863 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008864static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008865charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008866{
Christian Heimes217cfd12007-12-02 14:31:20 +00008867 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008868 PyObject *x;
8869
8870 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008871 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008872 x = PyObject_GetItem(mapping, w);
8873 Py_DECREF(w);
8874 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008875 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8876 /* No mapping found means: use 1:1 mapping. */
8877 PyErr_Clear();
8878 *result = NULL;
8879 return 0;
8880 } else
8881 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008882 }
8883 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008884 *result = x;
8885 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008886 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008887 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008888 long value = PyLong_AS_LONG(x);
Victor Stinner4ff33af2014-04-05 11:56:37 +02008889 if (value < 0 || value > MAX_UNICODE) {
8890 PyErr_Format(PyExc_ValueError,
8891 "character mapping must be in range(0x%x)",
8892 MAX_UNICODE+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00008893 Py_DECREF(x);
8894 return -1;
8895 }
8896 *result = x;
8897 return 0;
8898 }
8899 else if (PyUnicode_Check(x)) {
8900 *result = x;
8901 return 0;
8902 }
8903 else {
8904 /* wrong return value */
8905 PyErr_SetString(PyExc_TypeError,
8906 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00008907 Py_DECREF(x);
8908 return -1;
8909 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008910}
Victor Stinner1194ea02014-04-04 19:37:40 +02008911
8912/* lookup the character, write the result into the writer.
8913 Return 1 if the result was written into the writer, return 0 if the mapping
8914 was undefined, raise an exception return -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008915static int
Victor Stinner1194ea02014-04-04 19:37:40 +02008916charmaptranslate_output(Py_UCS4 ch, PyObject *mapping,
8917 _PyUnicodeWriter *writer)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008918{
Victor Stinner1194ea02014-04-04 19:37:40 +02008919 PyObject *item;
8920
8921 if (charmaptranslate_lookup(ch, mapping, &item))
Benjamin Peterson29060642009-01-31 22:14:21 +00008922 return -1;
Victor Stinner1194ea02014-04-04 19:37:40 +02008923
8924 if (item == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008925 /* not found => default to 1:1 mapping */
Victor Stinner1194ea02014-04-04 19:37:40 +02008926 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008927 return -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00008928 }
Victor Stinner1194ea02014-04-04 19:37:40 +02008929 return 1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008930 }
Victor Stinner1194ea02014-04-04 19:37:40 +02008931
8932 if (item == Py_None) {
8933 Py_DECREF(item);
8934 return 0;
8935 }
8936
8937 if (PyLong_Check(item)) {
Victor Stinner4ff33af2014-04-05 11:56:37 +02008938 long ch = (Py_UCS4)PyLong_AS_LONG(item);
8939 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
8940 used it */
Victor Stinner1194ea02014-04-04 19:37:40 +02008941 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
8942 Py_DECREF(item);
8943 return -1;
8944 }
8945 Py_DECREF(item);
8946 return 1;
8947 }
8948
8949 if (!PyUnicode_Check(item)) {
8950 Py_DECREF(item);
Benjamin Peterson29060642009-01-31 22:14:21 +00008951 return -1;
Victor Stinner1194ea02014-04-04 19:37:40 +02008952 }
8953
8954 if (_PyUnicodeWriter_WriteStr(writer, item) < 0) {
8955 Py_DECREF(item);
8956 return -1;
8957 }
8958
8959 Py_DECREF(item);
8960 return 1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008961}
8962
Victor Stinner89a76ab2014-04-05 11:44:04 +02008963static int
8964unicode_fast_translate_lookup(PyObject *mapping, Py_UCS1 ch,
8965 Py_UCS1 *translate)
8966{
Benjamin Peterson1365de72014-04-07 20:15:41 -04008967 PyObject *item = NULL;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008968 int ret = 0;
8969
Victor Stinner89a76ab2014-04-05 11:44:04 +02008970 if (charmaptranslate_lookup(ch, mapping, &item)) {
8971 return -1;
8972 }
8973
8974 if (item == Py_None) {
Benjamin Peterson1365de72014-04-07 20:15:41 -04008975 /* deletion */
Victor Stinner872b2912014-04-05 14:27:07 +02008976 translate[ch] = 0xfe;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008977 }
Benjamin Peterson1365de72014-04-07 20:15:41 -04008978 else if (item == NULL) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02008979 /* not found => default to 1:1 mapping */
8980 translate[ch] = ch;
8981 return 1;
8982 }
Benjamin Peterson1365de72014-04-07 20:15:41 -04008983 else if (PyLong_Check(item)) {
Victor Stinner4dd25252014-04-08 09:14:21 +02008984 long replace = PyLong_AS_LONG(item);
Victor Stinner4ff33af2014-04-05 11:56:37 +02008985 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
8986 used it */
8987 if (127 < replace) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02008988 /* invalid character or character outside ASCII:
8989 skip the fast translate */
8990 goto exit;
8991 }
8992 translate[ch] = (Py_UCS1)replace;
8993 }
8994 else if (PyUnicode_Check(item)) {
8995 Py_UCS4 replace;
8996
8997 if (PyUnicode_READY(item) == -1) {
8998 Py_DECREF(item);
8999 return -1;
9000 }
9001 if (PyUnicode_GET_LENGTH(item) != 1)
9002 goto exit;
9003
9004 replace = PyUnicode_READ_CHAR(item, 0);
9005 if (replace > 127)
9006 goto exit;
9007 translate[ch] = (Py_UCS1)replace;
9008 }
9009 else {
Benjamin Peterson1365de72014-04-07 20:15:41 -04009010 /* not None, NULL, long or unicode */
Victor Stinner89a76ab2014-04-05 11:44:04 +02009011 goto exit;
9012 }
Victor Stinner89a76ab2014-04-05 11:44:04 +02009013 ret = 1;
9014
Benjamin Peterson1365de72014-04-07 20:15:41 -04009015 exit:
9016 Py_DECREF(item);
Victor Stinner89a76ab2014-04-05 11:44:04 +02009017 return ret;
9018}
9019
9020/* Fast path for ascii => ascii translation. Return 1 if the whole string
9021 was translated into writer, return 0 if the input string was partially
9022 translated into writer, raise an exception and return -1 on error. */
9023static int
9024unicode_fast_translate(PyObject *input, PyObject *mapping,
Victor Stinner6c9aa8f2016-03-01 21:30:30 +01009025 _PyUnicodeWriter *writer, int ignore,
9026 Py_ssize_t *input_pos)
Victor Stinner89a76ab2014-04-05 11:44:04 +02009027{
Victor Stinner872b2912014-04-05 14:27:07 +02009028 Py_UCS1 ascii_table[128], ch, ch2;
Victor Stinner89a76ab2014-04-05 11:44:04 +02009029 Py_ssize_t len;
9030 Py_UCS1 *in, *end, *out;
Victor Stinner872b2912014-04-05 14:27:07 +02009031 int res = 0;
Victor Stinner89a76ab2014-04-05 11:44:04 +02009032
Victor Stinner89a76ab2014-04-05 11:44:04 +02009033 len = PyUnicode_GET_LENGTH(input);
9034
Victor Stinner872b2912014-04-05 14:27:07 +02009035 memset(ascii_table, 0xff, 128);
Victor Stinner89a76ab2014-04-05 11:44:04 +02009036
9037 in = PyUnicode_1BYTE_DATA(input);
9038 end = in + len;
9039
9040 assert(PyUnicode_IS_ASCII(writer->buffer));
9041 assert(PyUnicode_GET_LENGTH(writer->buffer) == len);
9042 out = PyUnicode_1BYTE_DATA(writer->buffer);
9043
Victor Stinner872b2912014-04-05 14:27:07 +02009044 for (; in < end; in++) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02009045 ch = *in;
Victor Stinner872b2912014-04-05 14:27:07 +02009046 ch2 = ascii_table[ch];
Victor Stinner89a76ab2014-04-05 11:44:04 +02009047 if (ch2 == 0xff) {
Victor Stinner872b2912014-04-05 14:27:07 +02009048 int translate = unicode_fast_translate_lookup(mapping, ch,
9049 ascii_table);
9050 if (translate < 0)
Victor Stinner89a76ab2014-04-05 11:44:04 +02009051 return -1;
Victor Stinner872b2912014-04-05 14:27:07 +02009052 if (translate == 0)
9053 goto exit;
9054 ch2 = ascii_table[ch];
Victor Stinner89a76ab2014-04-05 11:44:04 +02009055 }
Victor Stinner872b2912014-04-05 14:27:07 +02009056 if (ch2 == 0xfe) {
9057 if (ignore)
9058 continue;
9059 goto exit;
9060 }
9061 assert(ch2 < 128);
Victor Stinner89a76ab2014-04-05 11:44:04 +02009062 *out = ch2;
Victor Stinner872b2912014-04-05 14:27:07 +02009063 out++;
Victor Stinner89a76ab2014-04-05 11:44:04 +02009064 }
Victor Stinner872b2912014-04-05 14:27:07 +02009065 res = 1;
9066
9067exit:
9068 writer->pos = out - PyUnicode_1BYTE_DATA(writer->buffer);
Victor Stinner6c9aa8f2016-03-01 21:30:30 +01009069 *input_pos = in - PyUnicode_1BYTE_DATA(input);
Victor Stinner872b2912014-04-05 14:27:07 +02009070 return res;
Victor Stinner89a76ab2014-04-05 11:44:04 +02009071}
9072
Victor Stinner3222da22015-10-01 22:07:32 +02009073static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009074_PyUnicode_TranslateCharmap(PyObject *input,
9075 PyObject *mapping,
9076 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009077{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009078 /* input object */
Victor Stinner1194ea02014-04-04 19:37:40 +02009079 char *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009080 Py_ssize_t size, i;
9081 int kind;
9082 /* output buffer */
Victor Stinner1194ea02014-04-04 19:37:40 +02009083 _PyUnicodeWriter writer;
9084 /* error handler */
Serhiy Storchakae2f92de2017-11-11 13:06:26 +02009085 const char *reason = "character maps to <undefined>";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009086 PyObject *errorHandler = NULL;
9087 PyObject *exc = NULL;
Victor Stinner1194ea02014-04-04 19:37:40 +02009088 int ignore;
Victor Stinner89a76ab2014-04-05 11:44:04 +02009089 int res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009090
Guido van Rossumd57fd912000-03-10 22:53:23 +00009091 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009092 PyErr_BadArgument();
9093 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009094 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009095
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009096 if (PyUnicode_READY(input) == -1)
9097 return NULL;
Victor Stinner1194ea02014-04-04 19:37:40 +02009098 data = (char*)PyUnicode_DATA(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009099 kind = PyUnicode_KIND(input);
9100 size = PyUnicode_GET_LENGTH(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009101
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009102 if (size == 0)
9103 return PyUnicode_FromObject(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009104
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009105 /* allocate enough for a simple 1:1 translation without
9106 replacements, if we need more, we'll resize */
Victor Stinner1194ea02014-04-04 19:37:40 +02009107 _PyUnicodeWriter_Init(&writer);
9108 if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00009109 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009110
Victor Stinner872b2912014-04-05 14:27:07 +02009111 ignore = (errors != NULL && strcmp(errors, "ignore") == 0);
9112
Victor Stinner33798672016-03-01 21:59:58 +01009113 if (PyUnicode_READY(input) == -1)
Victor Stinner89a76ab2014-04-05 11:44:04 +02009114 return NULL;
Victor Stinner33798672016-03-01 21:59:58 +01009115 if (PyUnicode_IS_ASCII(input)) {
9116 res = unicode_fast_translate(input, mapping, &writer, ignore, &i);
9117 if (res < 0) {
9118 _PyUnicodeWriter_Dealloc(&writer);
9119 return NULL;
9120 }
9121 if (res == 1)
9122 return _PyUnicodeWriter_Finish(&writer);
Victor Stinner89a76ab2014-04-05 11:44:04 +02009123 }
Victor Stinner33798672016-03-01 21:59:58 +01009124 else {
9125 i = 0;
9126 }
Victor Stinner89a76ab2014-04-05 11:44:04 +02009127
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009128 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009129 /* try to encode it */
Victor Stinner1194ea02014-04-04 19:37:40 +02009130 int translate;
9131 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
9132 Py_ssize_t newpos;
9133 /* startpos for collecting untranslatable chars */
9134 Py_ssize_t collstart;
9135 Py_ssize_t collend;
Victor Stinner1194ea02014-04-04 19:37:40 +02009136 Py_UCS4 ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009137
Victor Stinner1194ea02014-04-04 19:37:40 +02009138 ch = PyUnicode_READ(kind, data, i);
9139 translate = charmaptranslate_output(ch, mapping, &writer);
9140 if (translate < 0)
9141 goto onError;
9142
9143 if (translate != 0) {
9144 /* it worked => adjust input pointer */
9145 ++i;
9146 continue;
9147 }
9148
9149 /* untranslatable character */
9150 collstart = i;
9151 collend = i+1;
9152
9153 /* find all untranslatable characters */
9154 while (collend < size) {
9155 PyObject *x;
9156 ch = PyUnicode_READ(kind, data, collend);
9157 if (charmaptranslate_lookup(ch, mapping, &x))
Benjamin Peterson14339b62009-01-31 16:36:08 +00009158 goto onError;
Victor Stinner1194ea02014-04-04 19:37:40 +02009159 Py_XDECREF(x);
9160 if (x != Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00009161 break;
Victor Stinner1194ea02014-04-04 19:37:40 +02009162 ++collend;
9163 }
9164
9165 if (ignore) {
9166 i = collend;
9167 }
9168 else {
9169 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
9170 reason, input, &exc,
9171 collstart, collend, &newpos);
9172 if (repunicode == NULL)
9173 goto onError;
9174 if (_PyUnicodeWriter_WriteStr(&writer, repunicode) < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009175 Py_DECREF(repunicode);
Victor Stinner1194ea02014-04-04 19:37:40 +02009176 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009177 }
Victor Stinner1194ea02014-04-04 19:37:40 +02009178 Py_DECREF(repunicode);
9179 i = newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009180 }
9181 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009182 Py_XDECREF(exc);
9183 Py_XDECREF(errorHandler);
Victor Stinner1194ea02014-04-04 19:37:40 +02009184 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009185
Benjamin Peterson29060642009-01-31 22:14:21 +00009186 onError:
Victor Stinner1194ea02014-04-04 19:37:40 +02009187 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009188 Py_XDECREF(exc);
9189 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009190 return NULL;
9191}
9192
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009193/* Deprecated. Use PyUnicode_Translate instead. */
9194PyObject *
9195PyUnicode_TranslateCharmap(const Py_UNICODE *p,
9196 Py_ssize_t size,
9197 PyObject *mapping,
9198 const char *errors)
9199{
Christian Heimes5f520f42012-09-11 14:03:25 +02009200 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02009201 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009202 if (!unicode)
9203 return NULL;
Christian Heimes5f520f42012-09-11 14:03:25 +02009204 result = _PyUnicode_TranslateCharmap(unicode, mapping, errors);
9205 Py_DECREF(unicode);
9206 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009207}
9208
Alexander Belopolsky40018472011-02-26 01:02:56 +00009209PyObject *
9210PyUnicode_Translate(PyObject *str,
9211 PyObject *mapping,
9212 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009213{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009214 if (ensure_unicode(str) < 0)
Christian Heimes5f520f42012-09-11 14:03:25 +02009215 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009216 return _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009217}
Tim Petersced69f82003-09-16 20:30:58 +00009218
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009219PyObject *
9220_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
9221{
9222 if (!PyUnicode_Check(unicode)) {
9223 PyErr_BadInternalCall();
9224 return NULL;
9225 }
9226 if (PyUnicode_READY(unicode) == -1)
9227 return NULL;
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009228 if (PyUnicode_IS_ASCII(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009229 /* If the string is already ASCII, just return the same string */
9230 Py_INCREF(unicode);
9231 return unicode;
9232 }
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009233
9234 Py_ssize_t len = PyUnicode_GET_LENGTH(unicode);
9235 PyObject *result = PyUnicode_New(len, 127);
9236 if (result == NULL) {
9237 return NULL;
9238 }
9239
9240 Py_UCS1 *out = PyUnicode_1BYTE_DATA(result);
9241 int kind = PyUnicode_KIND(unicode);
9242 const void *data = PyUnicode_DATA(unicode);
9243 Py_ssize_t i;
9244 for (i = 0; i < len; ++i) {
9245 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9246 if (ch < 127) {
9247 out[i] = ch;
9248 }
9249 else if (Py_UNICODE_ISSPACE(ch)) {
9250 out[i] = ' ';
9251 }
9252 else {
9253 int decimal = Py_UNICODE_TODECIMAL(ch);
9254 if (decimal < 0) {
9255 out[i] = '?';
INADA Naoki16dfca42018-07-14 12:06:43 +09009256 out[i+1] = '\0';
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009257 _PyUnicode_LENGTH(result) = i + 1;
9258 break;
9259 }
9260 out[i] = '0' + decimal;
9261 }
9262 }
9263
INADA Naoki16dfca42018-07-14 12:06:43 +09009264 assert(_PyUnicode_CheckConsistency(result, 1));
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009265 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009266}
9267
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009268PyObject *
9269PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
9270 Py_ssize_t length)
9271{
Victor Stinnerf0124502011-11-21 23:12:56 +01009272 PyObject *decimal;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009273 Py_ssize_t i;
Victor Stinnerf0124502011-11-21 23:12:56 +01009274 Py_UCS4 maxchar;
9275 enum PyUnicode_Kind kind;
9276 void *data;
9277
Victor Stinner99d7ad02012-02-22 13:37:39 +01009278 maxchar = 127;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009279 for (i = 0; i < length; i++) {
Victor Stinner12174a52014-08-15 23:17:38 +02009280 Py_UCS4 ch = s[i];
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009281 if (ch > 127) {
9282 int decimal = Py_UNICODE_TODECIMAL(ch);
9283 if (decimal >= 0)
Victor Stinnerf0124502011-11-21 23:12:56 +01009284 ch = '0' + decimal;
Benjamin Peterson7e303732013-06-10 09:19:46 -07009285 maxchar = Py_MAX(maxchar, ch);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009286 }
9287 }
Victor Stinnerf0124502011-11-21 23:12:56 +01009288
9289 /* Copy to a new string */
9290 decimal = PyUnicode_New(length, maxchar);
9291 if (decimal == NULL)
9292 return decimal;
9293 kind = PyUnicode_KIND(decimal);
9294 data = PyUnicode_DATA(decimal);
9295 /* Iterate over code points */
9296 for (i = 0; i < length; i++) {
Victor Stinner12174a52014-08-15 23:17:38 +02009297 Py_UCS4 ch = s[i];
Victor Stinnerf0124502011-11-21 23:12:56 +01009298 if (ch > 127) {
9299 int decimal = Py_UNICODE_TODECIMAL(ch);
9300 if (decimal >= 0)
9301 ch = '0' + decimal;
9302 }
9303 PyUnicode_WRITE(kind, data, i, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009304 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01009305 return unicode_result(decimal);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009306}
Guido van Rossum9e896b32000-04-05 20:11:21 +00009307/* --- Decimal Encoder ---------------------------------------------------- */
9308
Alexander Belopolsky40018472011-02-26 01:02:56 +00009309int
9310PyUnicode_EncodeDecimal(Py_UNICODE *s,
9311 Py_ssize_t length,
9312 char *output,
9313 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00009314{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01009315 PyObject *unicode;
Victor Stinner6345be92011-11-25 20:09:01 +01009316 Py_ssize_t i;
Victor Stinner42bf7752011-11-21 22:52:58 +01009317 enum PyUnicode_Kind kind;
9318 void *data;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009319
9320 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009321 PyErr_BadArgument();
9322 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009323 }
9324
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02009325 unicode = PyUnicode_FromWideChar(s, length);
Victor Stinner42bf7752011-11-21 22:52:58 +01009326 if (unicode == NULL)
9327 return -1;
9328
Victor Stinner42bf7752011-11-21 22:52:58 +01009329 kind = PyUnicode_KIND(unicode);
9330 data = PyUnicode_DATA(unicode);
9331
Victor Stinnerb84d7232011-11-22 01:50:07 +01009332 for (i=0; i < length; ) {
Victor Stinner6345be92011-11-25 20:09:01 +01009333 PyObject *exc;
9334 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00009335 int decimal;
Victor Stinner6345be92011-11-25 20:09:01 +01009336 Py_ssize_t startpos;
9337
9338 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +00009339
Benjamin Peterson29060642009-01-31 22:14:21 +00009340 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009341 *output++ = ' ';
Victor Stinnerb84d7232011-11-22 01:50:07 +01009342 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009343 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009344 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009345 decimal = Py_UNICODE_TODECIMAL(ch);
9346 if (decimal >= 0) {
9347 *output++ = '0' + decimal;
Victor Stinnerb84d7232011-11-22 01:50:07 +01009348 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009349 continue;
9350 }
9351 if (0 < ch && ch < 256) {
9352 *output++ = (char)ch;
Victor Stinnerb84d7232011-11-22 01:50:07 +01009353 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009354 continue;
9355 }
Victor Stinner6345be92011-11-25 20:09:01 +01009356
Victor Stinner42bf7752011-11-21 22:52:58 +01009357 startpos = i;
Victor Stinner6345be92011-11-25 20:09:01 +01009358 exc = NULL;
9359 raise_encode_exception(&exc, "decimal", unicode,
9360 startpos, startpos+1,
9361 "invalid decimal Unicode string");
9362 Py_XDECREF(exc);
9363 Py_DECREF(unicode);
9364 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009365 }
9366 /* 0-terminate the output string */
9367 *output++ = '\0';
Victor Stinner42bf7752011-11-21 22:52:58 +01009368 Py_DECREF(unicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00009369 return 0;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009370}
9371
Guido van Rossumd57fd912000-03-10 22:53:23 +00009372/* --- Helpers ------------------------------------------------------------ */
9373
/* helper macro to fixup start/end slice values

   `start` and `end` must be modifiable lvalues; they are updated in place.
   `end` is clamped to `len`; a negative `start` or `end` is taken relative
   to `len` and clamped to 0 if still negative. A `start` greater than `len`
   is left unchanged. Arguments are evaluated more than once.

   Wrapped in do { } while (0) so that the expansion is a single statement
   and stays correct inside an unbraced if/else. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len)) {                    \
            (end) = (len);                      \
        }                                       \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0) {                    \
                (end) = 0;                      \
            }                                   \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0) {                  \
                (start) = 0;                    \
            }                                   \
        }                                       \
    } while (0)
9388
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009389static Py_ssize_t
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009390any_find_slice(PyObject* s1, PyObject* s2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009391 Py_ssize_t start,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009392 Py_ssize_t end,
9393 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009394{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009395 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009396 void *buf1, *buf2;
9397 Py_ssize_t len1, len2, result;
9398
9399 kind1 = PyUnicode_KIND(s1);
9400 kind2 = PyUnicode_KIND(s2);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009401 if (kind1 < kind2)
9402 return -1;
9403
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009404 len1 = PyUnicode_GET_LENGTH(s1);
9405 len2 = PyUnicode_GET_LENGTH(s2);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009406 ADJUST_INDICES(start, end, len1);
9407 if (end - start < len2)
9408 return -1;
9409
9410 buf1 = PyUnicode_DATA(s1);
9411 buf2 = PyUnicode_DATA(s2);
9412 if (len2 == 1) {
9413 Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
9414 result = findchar((const char *)buf1 + kind1*start,
9415 kind1, end - start, ch, direction);
9416 if (result == -1)
9417 return -1;
9418 else
9419 return start + result;
9420 }
9421
9422 if (kind2 != kind1) {
9423 buf2 = _PyUnicode_AsKind(s2, kind1);
9424 if (!buf2)
9425 return -2;
9426 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009427
Victor Stinner794d5672011-10-10 03:21:36 +02009428 if (direction > 0) {
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009429 switch (kind1) {
Victor Stinner794d5672011-10-10 03:21:36 +02009430 case PyUnicode_1BYTE_KIND:
9431 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9432 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
9433 else
9434 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
9435 break;
9436 case PyUnicode_2BYTE_KIND:
9437 result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
9438 break;
9439 case PyUnicode_4BYTE_KIND:
9440 result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
9441 break;
9442 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07009443 Py_UNREACHABLE();
Victor Stinner794d5672011-10-10 03:21:36 +02009444 }
9445 }
9446 else {
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009447 switch (kind1) {
Victor Stinner794d5672011-10-10 03:21:36 +02009448 case PyUnicode_1BYTE_KIND:
9449 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9450 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
9451 else
9452 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9453 break;
9454 case PyUnicode_2BYTE_KIND:
9455 result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9456 break;
9457 case PyUnicode_4BYTE_KIND:
9458 result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9459 break;
9460 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07009461 Py_UNREACHABLE();
Victor Stinner794d5672011-10-10 03:21:36 +02009462 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009463 }
9464
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009465 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009466 PyMem_Free(buf2);
9467
9468 return result;
9469}
9470
Victor Stinner59423e32018-11-26 13:40:01 +01009471/* _PyUnicode_InsertThousandsGrouping() helper functions */
9472#include "stringlib/localeutil.h"
9473
9474/**
9475 * InsertThousandsGrouping:
9476 * @writer: Unicode writer.
9477 * @n_buffer: Number of characters in @buffer.
9478 * @digits: Digits we're reading from. If count is non-NULL, this is unused.
9479 * @d_pos: Start of digits string.
9480 * @n_digits: The number of digits in the string, in which we want
9481 * to put the grouping chars.
9482 * @min_width: The minimum width of the digits in the output string.
9483 * Output will be zero-padded on the left to fill.
9484 * @grouping: see definition in localeconv().
9485 * @thousands_sep: see definition in localeconv().
9486 *
9487 * There are 2 modes: counting and filling. If @writer is NULL,
9488 * we are in counting mode, else filling mode.
9489 * If counting, the required buffer size is returned.
9490 * If filling, we know the buffer will be large enough, so we don't
9491 * need to pass in the buffer size.
9492 * Inserts thousand grouping characters (as defined by grouping and
9493 * thousands_sep) into @writer.
9494 *
9495 * Return value: -1 on error, number of characters otherwise.
9496 **/
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009497Py_ssize_t
Victor Stinner41a863c2012-02-24 00:37:51 +01009498_PyUnicode_InsertThousandsGrouping(
Victor Stinner59423e32018-11-26 13:40:01 +01009499 _PyUnicodeWriter *writer,
Victor Stinner41a863c2012-02-24 00:37:51 +01009500 Py_ssize_t n_buffer,
Victor Stinner59423e32018-11-26 13:40:01 +01009501 PyObject *digits,
9502 Py_ssize_t d_pos,
9503 Py_ssize_t n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009504 Py_ssize_t min_width,
Victor Stinner59423e32018-11-26 13:40:01 +01009505 const char *grouping,
9506 PyObject *thousands_sep,
Victor Stinner41a863c2012-02-24 00:37:51 +01009507 Py_UCS4 *maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009508{
Xtreak3f7983a2019-01-07 20:39:14 +05309509 min_width = Py_MAX(0, min_width);
Victor Stinner59423e32018-11-26 13:40:01 +01009510 if (writer) {
9511 assert(digits != NULL);
9512 assert(maxchar == NULL);
Victor Stinner41a863c2012-02-24 00:37:51 +01009513 }
9514 else {
Victor Stinner59423e32018-11-26 13:40:01 +01009515 assert(digits == NULL);
9516 assert(maxchar != NULL);
Victor Stinner41a863c2012-02-24 00:37:51 +01009517 }
Victor Stinner59423e32018-11-26 13:40:01 +01009518 assert(0 <= d_pos);
9519 assert(0 <= n_digits);
Victor Stinner59423e32018-11-26 13:40:01 +01009520 assert(grouping != NULL);
9521
9522 if (digits != NULL) {
9523 if (PyUnicode_READY(digits) == -1) {
9524 return -1;
Victor Stinner90f50d42012-02-24 01:44:47 +01009525 }
Victor Stinner59423e32018-11-26 13:40:01 +01009526 }
9527 if (PyUnicode_READY(thousands_sep) == -1) {
9528 return -1;
Victor Stinner41a863c2012-02-24 00:37:51 +01009529 }
9530
Victor Stinner59423e32018-11-26 13:40:01 +01009531 Py_ssize_t count = 0;
9532 Py_ssize_t n_zeros;
9533 int loop_broken = 0;
9534 int use_separator = 0; /* First time through, don't append the
9535 separator. They only go between
9536 groups. */
9537 Py_ssize_t buffer_pos;
9538 Py_ssize_t digits_pos;
9539 Py_ssize_t len;
9540 Py_ssize_t n_chars;
9541 Py_ssize_t remaining = n_digits; /* Number of chars remaining to
9542 be looked at */
9543 /* A generator that returns all of the grouping widths, until it
9544 returns 0. */
9545 GroupGenerator groupgen;
9546 GroupGenerator_init(&groupgen, grouping);
9547 const Py_ssize_t thousands_sep_len = PyUnicode_GET_LENGTH(thousands_sep);
9548
9549 /* if digits are not grouped, thousands separator
9550 should be an empty string */
9551 assert(!(grouping[0] == CHAR_MAX && thousands_sep_len != 0));
9552
9553 digits_pos = d_pos + n_digits;
9554 if (writer) {
9555 buffer_pos = writer->pos + n_buffer;
9556 assert(buffer_pos <= PyUnicode_GET_LENGTH(writer->buffer));
9557 assert(digits_pos <= PyUnicode_GET_LENGTH(digits));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009558 }
Victor Stinner59423e32018-11-26 13:40:01 +01009559 else {
9560 buffer_pos = n_buffer;
Victor Stinner90f50d42012-02-24 01:44:47 +01009561 }
Victor Stinner59423e32018-11-26 13:40:01 +01009562
9563 if (!writer) {
Victor Stinner41a863c2012-02-24 00:37:51 +01009564 *maxchar = 127;
Victor Stinner41a863c2012-02-24 00:37:51 +01009565 }
Victor Stinner59423e32018-11-26 13:40:01 +01009566
9567 while ((len = GroupGenerator_next(&groupgen)) > 0) {
9568 len = Py_MIN(len, Py_MAX(Py_MAX(remaining, min_width), 1));
9569 n_zeros = Py_MAX(0, len - remaining);
9570 n_chars = Py_MAX(0, Py_MIN(remaining, len));
9571
9572 /* Use n_zero zero's and n_chars chars */
9573
9574 /* Count only, don't do anything. */
9575 count += (use_separator ? thousands_sep_len : 0) + n_zeros + n_chars;
9576
9577 /* Copy into the writer. */
9578 InsertThousandsGrouping_fill(writer, &buffer_pos,
9579 digits, &digits_pos,
9580 n_chars, n_zeros,
9581 use_separator ? thousands_sep : NULL,
9582 thousands_sep_len, maxchar);
9583
9584 /* Use a separator next time. */
9585 use_separator = 1;
9586
9587 remaining -= n_chars;
9588 min_width -= len;
9589
9590 if (remaining <= 0 && min_width <= 0) {
9591 loop_broken = 1;
9592 break;
9593 }
9594 min_width -= thousands_sep_len;
9595 }
9596 if (!loop_broken) {
9597 /* We left the loop without using a break statement. */
9598
9599 len = Py_MAX(Py_MAX(remaining, min_width), 1);
9600 n_zeros = Py_MAX(0, len - remaining);
9601 n_chars = Py_MAX(0, Py_MIN(remaining, len));
9602
9603 /* Use n_zero zero's and n_chars chars */
9604 count += (use_separator ? thousands_sep_len : 0) + n_zeros + n_chars;
9605
9606 /* Copy into the writer. */
9607 InsertThousandsGrouping_fill(writer, &buffer_pos,
9608 digits, &digits_pos,
9609 n_chars, n_zeros,
9610 use_separator ? thousands_sep : NULL,
9611 thousands_sep_len, maxchar);
9612 }
9613 return count;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009614}
9615
9616
Alexander Belopolsky40018472011-02-26 01:02:56 +00009617Py_ssize_t
9618PyUnicode_Count(PyObject *str,
9619 PyObject *substr,
9620 Py_ssize_t start,
9621 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009622{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009623 Py_ssize_t result;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009624 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009625 void *buf1 = NULL, *buf2 = NULL;
9626 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00009627
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009628 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009629 return -1;
Tim Petersced69f82003-09-16 20:30:58 +00009630
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009631 kind1 = PyUnicode_KIND(str);
9632 kind2 = PyUnicode_KIND(substr);
9633 if (kind1 < kind2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009634 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009635
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009636 len1 = PyUnicode_GET_LENGTH(str);
9637 len2 = PyUnicode_GET_LENGTH(substr);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009638 ADJUST_INDICES(start, end, len1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009639 if (end - start < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009640 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009641
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009642 buf1 = PyUnicode_DATA(str);
9643 buf2 = PyUnicode_DATA(substr);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009644 if (kind2 != kind1) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009645 buf2 = _PyUnicode_AsKind(substr, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009646 if (!buf2)
9647 goto onError;
9648 }
9649
9650 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009651 case PyUnicode_1BYTE_KIND:
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009652 if (PyUnicode_IS_ASCII(str) && PyUnicode_IS_ASCII(substr))
Victor Stinnerc3cec782011-10-05 21:24:08 +02009653 result = asciilib_count(
9654 ((Py_UCS1*)buf1) + start, end - start,
9655 buf2, len2, PY_SSIZE_T_MAX
9656 );
9657 else
9658 result = ucs1lib_count(
9659 ((Py_UCS1*)buf1) + start, end - start,
9660 buf2, len2, PY_SSIZE_T_MAX
9661 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009662 break;
9663 case PyUnicode_2BYTE_KIND:
9664 result = ucs2lib_count(
9665 ((Py_UCS2*)buf1) + start, end - start,
9666 buf2, len2, PY_SSIZE_T_MAX
9667 );
9668 break;
9669 case PyUnicode_4BYTE_KIND:
9670 result = ucs4lib_count(
9671 ((Py_UCS4*)buf1) + start, end - start,
9672 buf2, len2, PY_SSIZE_T_MAX
9673 );
9674 break;
9675 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07009676 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009677 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009678
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009679 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009680 PyMem_Free(buf2);
9681
Guido van Rossumd57fd912000-03-10 22:53:23 +00009682 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009683 onError:
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009684 if (kind2 != kind1 && buf2)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009685 PyMem_Free(buf2);
9686 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009687}
9688
Alexander Belopolsky40018472011-02-26 01:02:56 +00009689Py_ssize_t
9690PyUnicode_Find(PyObject *str,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009691 PyObject *substr,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009692 Py_ssize_t start,
9693 Py_ssize_t end,
9694 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009695{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009696 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009697 return -2;
Tim Petersced69f82003-09-16 20:30:58 +00009698
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009699 return any_find_slice(str, substr, start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009700}
9701
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009702Py_ssize_t
9703PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
9704 Py_ssize_t start, Py_ssize_t end,
9705 int direction)
9706{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009707 int kind;
Xiang Zhangb2110682016-12-20 22:52:33 +08009708 Py_ssize_t len, result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009709 if (PyUnicode_READY(str) == -1)
9710 return -2;
Xiang Zhangb2110682016-12-20 22:52:33 +08009711 len = PyUnicode_GET_LENGTH(str);
9712 ADJUST_INDICES(start, end, len);
9713 if (end - start < 1)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009714 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009715 kind = PyUnicode_KIND(str);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009716 result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
9717 kind, end-start, ch, direction);
9718 if (result == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009719 return -1;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009720 else
9721 return start + result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009722}
9723
Alexander Belopolsky40018472011-02-26 01:02:56 +00009724static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009725tailmatch(PyObject *self,
9726 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009727 Py_ssize_t start,
9728 Py_ssize_t end,
9729 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009730{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009731 int kind_self;
9732 int kind_sub;
9733 void *data_self;
9734 void *data_sub;
9735 Py_ssize_t offset;
9736 Py_ssize_t i;
9737 Py_ssize_t end_sub;
9738
9739 if (PyUnicode_READY(self) == -1 ||
9740 PyUnicode_READY(substring) == -1)
Victor Stinner18aa4472013-01-03 03:18:09 +01009741 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009742
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009743 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
9744 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009745 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00009746 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009747
Serhiy Storchakad4ea03c2015-05-31 09:15:51 +03009748 if (PyUnicode_GET_LENGTH(substring) == 0)
9749 return 1;
9750
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009751 kind_self = PyUnicode_KIND(self);
9752 data_self = PyUnicode_DATA(self);
9753 kind_sub = PyUnicode_KIND(substring);
9754 data_sub = PyUnicode_DATA(substring);
9755 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
9756
9757 if (direction > 0)
9758 offset = end;
9759 else
9760 offset = start;
9761
9762 if (PyUnicode_READ(kind_self, data_self, offset) ==
9763 PyUnicode_READ(kind_sub, data_sub, 0) &&
9764 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
9765 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
9766 /* If both are of the same kind, memcmp is sufficient */
9767 if (kind_self == kind_sub) {
9768 return ! memcmp((char *)data_self +
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009769 (offset * PyUnicode_KIND(substring)),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009770 data_sub,
9771 PyUnicode_GET_LENGTH(substring) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009772 PyUnicode_KIND(substring));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009773 }
Martin Pantere26da7c2016-06-02 10:07:09 +00009774 /* otherwise we have to compare each character by first accessing it */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009775 else {
9776 /* We do not need to compare 0 and len(substring)-1 because
9777 the if statement above ensured already that they are equal
9778 when we end up here. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009779 for (i = 1; i < end_sub; ++i) {
9780 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
9781 PyUnicode_READ(kind_sub, data_sub, i))
9782 return 0;
9783 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009784 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009785 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009786 }
9787
9788 return 0;
9789}
9790
Alexander Belopolsky40018472011-02-26 01:02:56 +00009791Py_ssize_t
9792PyUnicode_Tailmatch(PyObject *str,
9793 PyObject *substr,
9794 Py_ssize_t start,
9795 Py_ssize_t end,
9796 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009797{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009798 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009799 return -1;
Tim Petersced69f82003-09-16 20:30:58 +00009800
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009801 return tailmatch(str, substr, start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009802}
9803
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009804static PyObject *
9805ascii_upper_or_lower(PyObject *self, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009806{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009807 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9808 char *resdata, *data = PyUnicode_DATA(self);
9809 PyObject *res;
Tim Petersced69f82003-09-16 20:30:58 +00009810
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009811 res = PyUnicode_New(len, 127);
9812 if (res == NULL)
9813 return NULL;
9814 resdata = PyUnicode_DATA(res);
9815 if (lower)
9816 _Py_bytes_lower(resdata, data, len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009817 else
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009818 _Py_bytes_upper(resdata, data, len);
9819 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009820}
9821
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009822static Py_UCS4
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009823handle_capital_sigma(int kind, void *data, Py_ssize_t length, Py_ssize_t i)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009824{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009825 Py_ssize_t j;
9826 int final_sigma;
Victor Stinner0c39b1b2015-03-18 15:02:06 +01009827 Py_UCS4 c = 0; /* initialize to prevent gcc warning */
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009828 /* U+03A3 is in the Final_Sigma context when, it is found like this:
Tim Petersced69f82003-09-16 20:30:58 +00009829
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009830 \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased})
9831
9832 where ! is a negation and \p{xxx} is a character with property xxx.
9833 */
9834 for (j = i - 1; j >= 0; j--) {
9835 c = PyUnicode_READ(kind, data, j);
9836 if (!_PyUnicode_IsCaseIgnorable(c))
9837 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009838 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009839 final_sigma = j >= 0 && _PyUnicode_IsCased(c);
9840 if (final_sigma) {
9841 for (j = i + 1; j < length; j++) {
9842 c = PyUnicode_READ(kind, data, j);
9843 if (!_PyUnicode_IsCaseIgnorable(c))
9844 break;
9845 }
9846 final_sigma = j == length || !_PyUnicode_IsCased(c);
9847 }
9848 return (final_sigma) ? 0x3C2 : 0x3C3;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009849}
9850
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009851static int
9852lower_ucs4(int kind, void *data, Py_ssize_t length, Py_ssize_t i,
9853 Py_UCS4 c, Py_UCS4 *mapped)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009854{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009855 /* Obscure special case. */
9856 if (c == 0x3A3) {
9857 mapped[0] = handle_capital_sigma(kind, data, length, i);
9858 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009859 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009860 return _PyUnicode_ToLowerFull(c, mapped);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009861}
9862
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009863static Py_ssize_t
9864do_capitalize(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009865{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009866 Py_ssize_t i, k = 0;
9867 int n_res, j;
9868 Py_UCS4 c, mapped[3];
Tim Petersced69f82003-09-16 20:30:58 +00009869
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009870 c = PyUnicode_READ(kind, data, 0);
Kingsley Mb015fc82019-04-12 16:35:39 +01009871 n_res = _PyUnicode_ToTitleFull(c, mapped);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009872 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009873 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009874 res[k++] = mapped[j];
Guido van Rossumd57fd912000-03-10 22:53:23 +00009875 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009876 for (i = 1; i < length; i++) {
9877 c = PyUnicode_READ(kind, data, i);
9878 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9879 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009880 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009881 res[k++] = mapped[j];
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009882 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009883 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009884 return k;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009885}
9886
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009887static Py_ssize_t
9888do_swapcase(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) {
9889 Py_ssize_t i, k = 0;
9890
9891 for (i = 0; i < length; i++) {
9892 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9893 int n_res, j;
9894 if (Py_UNICODE_ISUPPER(c)) {
9895 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9896 }
9897 else if (Py_UNICODE_ISLOWER(c)) {
9898 n_res = _PyUnicode_ToUpperFull(c, mapped);
9899 }
9900 else {
9901 n_res = 1;
9902 mapped[0] = c;
9903 }
9904 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009905 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009906 res[k++] = mapped[j];
9907 }
9908 }
9909 return k;
9910}
9911
9912static Py_ssize_t
9913do_upper_or_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res,
9914 Py_UCS4 *maxchar, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009915{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009916 Py_ssize_t i, k = 0;
9917
9918 for (i = 0; i < length; i++) {
9919 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9920 int n_res, j;
9921 if (lower)
9922 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9923 else
9924 n_res = _PyUnicode_ToUpperFull(c, mapped);
9925 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009926 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009927 res[k++] = mapped[j];
9928 }
9929 }
9930 return k;
9931}
9932
9933static Py_ssize_t
9934do_upper(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9935{
9936 return do_upper_or_lower(kind, data, length, res, maxchar, 0);
9937}
9938
9939static Py_ssize_t
9940do_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9941{
9942 return do_upper_or_lower(kind, data, length, res, maxchar, 1);
9943}
9944
Benjamin Petersone51757f2012-01-12 21:10:29 -05009945static Py_ssize_t
Benjamin Petersond5890c82012-01-14 13:23:30 -05009946do_casefold(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9947{
9948 Py_ssize_t i, k = 0;
9949
9950 for (i = 0; i < length; i++) {
9951 Py_UCS4 c = PyUnicode_READ(kind, data, i);
9952 Py_UCS4 mapped[3];
9953 int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
9954 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009955 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersond5890c82012-01-14 13:23:30 -05009956 res[k++] = mapped[j];
9957 }
9958 }
9959 return k;
9960}
9961
9962static Py_ssize_t
Benjamin Petersone51757f2012-01-12 21:10:29 -05009963do_title(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9964{
9965 Py_ssize_t i, k = 0;
9966 int previous_is_cased;
9967
9968 previous_is_cased = 0;
9969 for (i = 0; i < length; i++) {
9970 const Py_UCS4 c = PyUnicode_READ(kind, data, i);
9971 Py_UCS4 mapped[3];
9972 int n_res, j;
9973
9974 if (previous_is_cased)
9975 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9976 else
9977 n_res = _PyUnicode_ToTitleFull(c, mapped);
9978
9979 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009980 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersone51757f2012-01-12 21:10:29 -05009981 res[k++] = mapped[j];
9982 }
9983
9984 previous_is_cased = _PyUnicode_IsCased(c);
9985 }
9986 return k;
9987}
9988
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009989static PyObject *
9990case_operation(PyObject *self,
9991 Py_ssize_t (*perform)(int, void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *))
9992{
9993 PyObject *res = NULL;
9994 Py_ssize_t length, newlength = 0;
9995 int kind, outkind;
9996 void *data, *outdata;
9997 Py_UCS4 maxchar = 0, *tmp, *tmpend;
9998
Benjamin Petersoneea48462012-01-16 14:28:50 -05009999 assert(PyUnicode_IS_READY(self));
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010000
10001 kind = PyUnicode_KIND(self);
10002 data = PyUnicode_DATA(self);
10003 length = PyUnicode_GET_LENGTH(self);
Antoine Pitrou4e334242014-10-15 23:14:53 +020010004 if ((size_t) length > PY_SSIZE_T_MAX / (3 * sizeof(Py_UCS4))) {
Benjamin Petersone1bd38c2014-10-15 11:47:36 -040010005 PyErr_SetString(PyExc_OverflowError, "string is too long");
10006 return NULL;
10007 }
Benjamin Peterson1e211ff2014-10-15 12:17:21 -040010008 tmp = PyMem_MALLOC(sizeof(Py_UCS4) * 3 * length);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010009 if (tmp == NULL)
10010 return PyErr_NoMemory();
10011 newlength = perform(kind, data, length, tmp, &maxchar);
10012 res = PyUnicode_New(newlength, maxchar);
10013 if (res == NULL)
10014 goto leave;
10015 tmpend = tmp + newlength;
10016 outdata = PyUnicode_DATA(res);
10017 outkind = PyUnicode_KIND(res);
10018 switch (outkind) {
10019 case PyUnicode_1BYTE_KIND:
10020 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata);
10021 break;
10022 case PyUnicode_2BYTE_KIND:
10023 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata);
10024 break;
10025 case PyUnicode_4BYTE_KIND:
10026 memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength);
10027 break;
10028 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010029 Py_UNREACHABLE();
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010030 }
10031 leave:
10032 PyMem_FREE(tmp);
10033 return res;
10034}
10035
Tim Peters8ce9f162004-08-27 01:49:32 +000010036PyObject *
10037PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010038{
Serhiy Storchakaea525a22016-09-06 22:07:53 +030010039 PyObject *res;
10040 PyObject *fseq;
10041 Py_ssize_t seqlen;
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010042 PyObject **items;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010043
Benjamin Peterson9743b2c2014-02-15 13:02:52 -050010044 fseq = PySequence_Fast(seq, "can only join an iterable");
Tim Peters05eba1f2004-08-27 21:32:02 +000010045 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010046 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +000010047 }
10048
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010049 /* NOTE: the following code can't call back into Python code,
10050 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +000010051 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010052
Serhiy Storchakaea525a22016-09-06 22:07:53 +030010053 items = PySequence_Fast_ITEMS(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +000010054 seqlen = PySequence_Fast_GET_SIZE(fseq);
Serhiy Storchakaea525a22016-09-06 22:07:53 +030010055 res = _PyUnicode_JoinArray(separator, items, seqlen);
10056 Py_DECREF(fseq);
10057 return res;
10058}
10059
10060PyObject *
Serhiy Storchakaa5552f02017-12-15 13:11:11 +020010061_PyUnicode_JoinArray(PyObject *separator, PyObject *const *items, Py_ssize_t seqlen)
Serhiy Storchakaea525a22016-09-06 22:07:53 +030010062{
10063 PyObject *res = NULL; /* the result */
10064 PyObject *sep = NULL;
10065 Py_ssize_t seplen;
10066 PyObject *item;
10067 Py_ssize_t sz, i, res_offset;
10068 Py_UCS4 maxchar;
10069 Py_UCS4 item_maxchar;
10070 int use_memcpy;
10071 unsigned char *res_data = NULL, *sep_data = NULL;
10072 PyObject *last_obj;
10073 unsigned int kind = 0;
10074
Tim Peters05eba1f2004-08-27 21:32:02 +000010075 /* If empty sequence, return u"". */
10076 if (seqlen == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020010077 _Py_RETURN_UNICODE_EMPTY();
Tim Peters05eba1f2004-08-27 21:32:02 +000010078 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020010079
Tim Peters05eba1f2004-08-27 21:32:02 +000010080 /* If singleton sequence with an exact Unicode, return that. */
Victor Stinnerdd077322011-10-07 17:02:31 +020010081 last_obj = NULL;
Victor Stinneracf47b82011-10-06 12:32:37 +020010082 if (seqlen == 1) {
10083 if (PyUnicode_CheckExact(items[0])) {
10084 res = items[0];
10085 Py_INCREF(res);
Victor Stinneracf47b82011-10-06 12:32:37 +020010086 return res;
10087 }
Victor Stinnerdd077322011-10-07 17:02:31 +020010088 seplen = 0;
Victor Stinnerc6f0df72011-10-06 15:58:54 +020010089 maxchar = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +000010090 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010091 else {
Victor Stinneracf47b82011-10-06 12:32:37 +020010092 /* Set up sep and seplen */
10093 if (separator == NULL) {
10094 /* fall back to a blank space separator */
10095 sep = PyUnicode_FromOrdinal(' ');
10096 if (!sep)
10097 goto onError;
Victor Stinnerdd077322011-10-07 17:02:31 +020010098 seplen = 1;
Victor Stinneracf47b82011-10-06 12:32:37 +020010099 maxchar = 32;
Tim Peters05eba1f2004-08-27 21:32:02 +000010100 }
Victor Stinneracf47b82011-10-06 12:32:37 +020010101 else {
10102 if (!PyUnicode_Check(separator)) {
10103 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020010104 "separator: expected str instance,"
10105 " %.80s found",
10106 Py_TYPE(separator)->tp_name);
Victor Stinneracf47b82011-10-06 12:32:37 +020010107 goto onError;
10108 }
10109 if (PyUnicode_READY(separator))
10110 goto onError;
10111 sep = separator;
10112 seplen = PyUnicode_GET_LENGTH(separator);
10113 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
10114 /* inc refcount to keep this code path symmetric with the
10115 above case of a blank separator */
10116 Py_INCREF(sep);
10117 }
Victor Stinnerdd077322011-10-07 17:02:31 +020010118 last_obj = sep;
Tim Peters05eba1f2004-08-27 21:32:02 +000010119 }
10120
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010121 /* There are at least two things to join, or else we have a subclass
10122 * of str in the sequence.
10123 * Do a pre-pass to figure out the total amount of space we'll
10124 * need (sz), and see whether all argument are strings.
10125 */
10126 sz = 0;
Victor Stinnerdd077322011-10-07 17:02:31 +020010127#ifdef Py_DEBUG
10128 use_memcpy = 0;
10129#else
10130 use_memcpy = 1;
10131#endif
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010132 for (i = 0; i < seqlen; i++) {
Xiang Zhangb0541f42017-01-10 10:52:00 +080010133 size_t add_sz;
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010134 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +000010135 if (!PyUnicode_Check(item)) {
10136 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020010137 "sequence item %zd: expected str instance,"
10138 " %.80s found",
10139 i, Py_TYPE(item)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000010140 goto onError;
10141 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010142 if (PyUnicode_READY(item) == -1)
10143 goto onError;
Xiang Zhangb0541f42017-01-10 10:52:00 +080010144 add_sz = PyUnicode_GET_LENGTH(item);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010145 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010146 maxchar = Py_MAX(maxchar, item_maxchar);
Xiang Zhangb0541f42017-01-10 10:52:00 +080010147 if (i != 0) {
10148 add_sz += seplen;
10149 }
10150 if (add_sz > (size_t)(PY_SSIZE_T_MAX - sz)) {
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010151 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010152 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010153 goto onError;
10154 }
Xiang Zhangb0541f42017-01-10 10:52:00 +080010155 sz += add_sz;
Victor Stinnerdd077322011-10-07 17:02:31 +020010156 if (use_memcpy && last_obj != NULL) {
10157 if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
10158 use_memcpy = 0;
10159 }
10160 last_obj = item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010161 }
Tim Petersced69f82003-09-16 20:30:58 +000010162
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010163 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010164 if (res == NULL)
10165 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +000010166
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010167 /* Catenate everything. */
Victor Stinnerdd077322011-10-07 17:02:31 +020010168#ifdef Py_DEBUG
10169 use_memcpy = 0;
10170#else
10171 if (use_memcpy) {
10172 res_data = PyUnicode_1BYTE_DATA(res);
10173 kind = PyUnicode_KIND(res);
10174 if (seplen != 0)
10175 sep_data = PyUnicode_1BYTE_DATA(sep);
10176 }
10177#endif
Victor Stinner4560f9c2013-04-14 18:56:46 +020010178 if (use_memcpy) {
10179 for (i = 0; i < seqlen; ++i) {
10180 Py_ssize_t itemlen;
10181 item = items[i];
10182
10183 /* Copy item, and maybe the separator. */
10184 if (i && seplen != 0) {
Christian Heimesf051e432016-09-13 20:22:02 +020010185 memcpy(res_data,
Victor Stinnerdd077322011-10-07 17:02:31 +020010186 sep_data,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010187 kind * seplen);
10188 res_data += kind * seplen;
Victor Stinnerdd077322011-10-07 17:02:31 +020010189 }
Victor Stinner4560f9c2013-04-14 18:56:46 +020010190
10191 itemlen = PyUnicode_GET_LENGTH(item);
10192 if (itemlen != 0) {
Christian Heimesf051e432016-09-13 20:22:02 +020010193 memcpy(res_data,
Victor Stinnerdd077322011-10-07 17:02:31 +020010194 PyUnicode_DATA(item),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010195 kind * itemlen);
10196 res_data += kind * itemlen;
Victor Stinnerdd077322011-10-07 17:02:31 +020010197 }
Victor Stinner4560f9c2013-04-14 18:56:46 +020010198 }
10199 assert(res_data == PyUnicode_1BYTE_DATA(res)
10200 + kind * PyUnicode_GET_LENGTH(res));
10201 }
10202 else {
10203 for (i = 0, res_offset = 0; i < seqlen; ++i) {
10204 Py_ssize_t itemlen;
10205 item = items[i];
10206
10207 /* Copy item, and maybe the separator. */
10208 if (i && seplen != 0) {
10209 _PyUnicode_FastCopyCharacters(res, res_offset, sep, 0, seplen);
10210 res_offset += seplen;
10211 }
10212
10213 itemlen = PyUnicode_GET_LENGTH(item);
10214 if (itemlen != 0) {
Victor Stinnerd3f08822012-05-29 12:57:52 +020010215 _PyUnicode_FastCopyCharacters(res, res_offset, item, 0, itemlen);
Victor Stinnerdd077322011-10-07 17:02:31 +020010216 res_offset += itemlen;
10217 }
Victor Stinner9ce5a832011-10-03 23:36:02 +020010218 }
Victor Stinnerdd077322011-10-07 17:02:31 +020010219 assert(res_offset == PyUnicode_GET_LENGTH(res));
Victor Stinner4560f9c2013-04-14 18:56:46 +020010220 }
Tim Peters8ce9f162004-08-27 01:49:32 +000010221
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010222 Py_XDECREF(sep);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010223 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010224 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010225
Benjamin Peterson29060642009-01-31 22:14:21 +000010226 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010227 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +000010228 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010229 return NULL;
10230}
10231
Victor Stinnerd3f08822012-05-29 12:57:52 +020010232void
10233_PyUnicode_FastFill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10234 Py_UCS4 fill_char)
10235{
10236 const enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
Victor Stinner163403a2018-11-27 12:41:17 +010010237 void *data = PyUnicode_DATA(unicode);
Victor Stinnerd3f08822012-05-29 12:57:52 +020010238 assert(PyUnicode_IS_READY(unicode));
10239 assert(unicode_modifiable(unicode));
10240 assert(fill_char <= PyUnicode_MAX_CHAR_VALUE(unicode));
10241 assert(start >= 0);
10242 assert(start + length <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner59423e32018-11-26 13:40:01 +010010243 unicode_fill(kind, data, fill_char, start, length);
Victor Stinnerd3f08822012-05-29 12:57:52 +020010244}
10245
Victor Stinner3fe55312012-01-04 00:33:50 +010010246Py_ssize_t
10247PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10248 Py_UCS4 fill_char)
10249{
10250 Py_ssize_t maxlen;
Victor Stinner3fe55312012-01-04 00:33:50 +010010251
10252 if (!PyUnicode_Check(unicode)) {
10253 PyErr_BadInternalCall();
10254 return -1;
10255 }
10256 if (PyUnicode_READY(unicode) == -1)
10257 return -1;
10258 if (unicode_check_modifiable(unicode))
10259 return -1;
10260
Victor Stinnerd3f08822012-05-29 12:57:52 +020010261 if (start < 0) {
10262 PyErr_SetString(PyExc_IndexError, "string index out of range");
10263 return -1;
10264 }
Victor Stinner3fe55312012-01-04 00:33:50 +010010265 if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) {
10266 PyErr_SetString(PyExc_ValueError,
10267 "fill character is bigger than "
10268 "the string maximum character");
10269 return -1;
10270 }
10271
10272 maxlen = PyUnicode_GET_LENGTH(unicode) - start;
10273 length = Py_MIN(maxlen, length);
10274 if (length <= 0)
10275 return 0;
10276
Victor Stinnerd3f08822012-05-29 12:57:52 +020010277 _PyUnicode_FastFill(unicode, start, length, fill_char);
Victor Stinner3fe55312012-01-04 00:33:50 +010010278 return length;
10279}
10280
Victor Stinner9310abb2011-10-05 00:59:23 +020010281static PyObject *
10282pad(PyObject *self,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010283 Py_ssize_t left,
10284 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010285 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010286{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010287 PyObject *u;
10288 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010289 int kind;
10290 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010291
10292 if (left < 0)
10293 left = 0;
10294 if (right < 0)
10295 right = 0;
10296
Victor Stinnerc4b49542011-12-11 22:44:26 +010010297 if (left == 0 && right == 0)
10298 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010299
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010300 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
10301 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +000010302 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
10303 return NULL;
10304 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010305 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010306 maxchar = Py_MAX(maxchar, fill);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010307 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010308 if (!u)
10309 return NULL;
10310
10311 kind = PyUnicode_KIND(u);
10312 data = PyUnicode_DATA(u);
10313 if (left)
Victor Stinner59423e32018-11-26 13:40:01 +010010314 unicode_fill(kind, data, fill, 0, left);
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010315 if (right)
Victor Stinner59423e32018-11-26 13:40:01 +010010316 unicode_fill(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinnerd3f08822012-05-29 12:57:52 +020010317 _PyUnicode_FastCopyCharacters(u, left, self, 0, _PyUnicode_LENGTH(self));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010318 assert(_PyUnicode_CheckConsistency(u, 1));
10319 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010320}
10321
Alexander Belopolsky40018472011-02-26 01:02:56 +000010322PyObject *
10323PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010324{
Guido van Rossumd57fd912000-03-10 22:53:23 +000010325 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010326
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010327 if (ensure_unicode(string) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010328 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010329
Benjamin Petersonead6b532011-12-20 17:23:42 -060010330 switch (PyUnicode_KIND(string)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010331 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010332 if (PyUnicode_IS_ASCII(string))
10333 list = asciilib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010334 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010335 PyUnicode_GET_LENGTH(string), keepends);
10336 else
10337 list = ucs1lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010338 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010339 PyUnicode_GET_LENGTH(string), keepends);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010340 break;
10341 case PyUnicode_2BYTE_KIND:
10342 list = ucs2lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010343 string, PyUnicode_2BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010344 PyUnicode_GET_LENGTH(string), keepends);
10345 break;
10346 case PyUnicode_4BYTE_KIND:
10347 list = ucs4lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010348 string, PyUnicode_4BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010349 PyUnicode_GET_LENGTH(string), keepends);
10350 break;
10351 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010352 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010353 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010354 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010355}
10356
Alexander Belopolsky40018472011-02-26 01:02:56 +000010357static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010358split(PyObject *self,
10359 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010360 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010361{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010362 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010363 void *buf1, *buf2;
10364 Py_ssize_t len1, len2;
10365 PyObject* out;
10366
Guido van Rossumd57fd912000-03-10 22:53:23 +000010367 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010368 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010369
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010370 if (PyUnicode_READY(self) == -1)
10371 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010372
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010373 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010374 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010375 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010376 if (PyUnicode_IS_ASCII(self))
10377 return asciilib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010378 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010379 PyUnicode_GET_LENGTH(self), maxcount
10380 );
10381 else
10382 return ucs1lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010383 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010384 PyUnicode_GET_LENGTH(self), maxcount
10385 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010386 case PyUnicode_2BYTE_KIND:
10387 return ucs2lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010388 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010389 PyUnicode_GET_LENGTH(self), maxcount
10390 );
10391 case PyUnicode_4BYTE_KIND:
10392 return ucs4lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010393 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010394 PyUnicode_GET_LENGTH(self), maxcount
10395 );
10396 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010397 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010398 }
10399
10400 if (PyUnicode_READY(substring) == -1)
10401 return NULL;
10402
10403 kind1 = PyUnicode_KIND(self);
10404 kind2 = PyUnicode_KIND(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010405 len1 = PyUnicode_GET_LENGTH(self);
10406 len2 = PyUnicode_GET_LENGTH(substring);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010407 if (kind1 < kind2 || len1 < len2) {
10408 out = PyList_New(1);
10409 if (out == NULL)
10410 return NULL;
10411 Py_INCREF(self);
10412 PyList_SET_ITEM(out, 0, self);
10413 return out;
10414 }
10415 buf1 = PyUnicode_DATA(self);
10416 buf2 = PyUnicode_DATA(substring);
10417 if (kind2 != kind1) {
10418 buf2 = _PyUnicode_AsKind(substring, kind1);
10419 if (!buf2)
10420 return NULL;
10421 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010422
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010423 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010424 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010425 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10426 out = asciilib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010427 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010428 else
10429 out = ucs1lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010430 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010431 break;
10432 case PyUnicode_2BYTE_KIND:
10433 out = ucs2lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010434 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010435 break;
10436 case PyUnicode_4BYTE_KIND:
10437 out = ucs4lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010438 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010439 break;
10440 default:
10441 out = NULL;
10442 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010443 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010444 PyMem_Free(buf2);
10445 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010446}
10447
Alexander Belopolsky40018472011-02-26 01:02:56 +000010448static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010449rsplit(PyObject *self,
10450 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010451 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010452{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010453 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010454 void *buf1, *buf2;
10455 Py_ssize_t len1, len2;
10456 PyObject* out;
10457
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010458 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010459 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010460
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010461 if (PyUnicode_READY(self) == -1)
10462 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010463
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010464 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010465 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010466 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010467 if (PyUnicode_IS_ASCII(self))
10468 return asciilib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010469 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010470 PyUnicode_GET_LENGTH(self), maxcount
10471 );
10472 else
10473 return ucs1lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010474 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010475 PyUnicode_GET_LENGTH(self), maxcount
10476 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010477 case PyUnicode_2BYTE_KIND:
10478 return ucs2lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010479 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010480 PyUnicode_GET_LENGTH(self), maxcount
10481 );
10482 case PyUnicode_4BYTE_KIND:
10483 return ucs4lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010484 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010485 PyUnicode_GET_LENGTH(self), maxcount
10486 );
10487 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010488 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010489 }
10490
10491 if (PyUnicode_READY(substring) == -1)
10492 return NULL;
10493
10494 kind1 = PyUnicode_KIND(self);
10495 kind2 = PyUnicode_KIND(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010496 len1 = PyUnicode_GET_LENGTH(self);
10497 len2 = PyUnicode_GET_LENGTH(substring);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010498 if (kind1 < kind2 || len1 < len2) {
10499 out = PyList_New(1);
10500 if (out == NULL)
10501 return NULL;
10502 Py_INCREF(self);
10503 PyList_SET_ITEM(out, 0, self);
10504 return out;
10505 }
10506 buf1 = PyUnicode_DATA(self);
10507 buf2 = PyUnicode_DATA(substring);
10508 if (kind2 != kind1) {
10509 buf2 = _PyUnicode_AsKind(substring, kind1);
10510 if (!buf2)
10511 return NULL;
10512 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010513
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010514 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010515 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010516 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10517 out = asciilib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010518 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010519 else
10520 out = ucs1lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010521 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010522 break;
10523 case PyUnicode_2BYTE_KIND:
10524 out = ucs2lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010525 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010526 break;
10527 case PyUnicode_4BYTE_KIND:
10528 out = ucs4lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010529 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010530 break;
10531 default:
10532 out = NULL;
10533 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010534 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010535 PyMem_Free(buf2);
10536 return out;
10537}
10538
10539static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010540anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1,
10541 PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010542{
Benjamin Petersonead6b532011-12-20 17:23:42 -060010543 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010544 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010545 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
10546 return asciilib_find(buf1, len1, buf2, len2, offset);
10547 else
10548 return ucs1lib_find(buf1, len1, buf2, len2, offset);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010549 case PyUnicode_2BYTE_KIND:
10550 return ucs2lib_find(buf1, len1, buf2, len2, offset);
10551 case PyUnicode_4BYTE_KIND:
10552 return ucs4lib_find(buf1, len1, buf2, len2, offset);
10553 }
Barry Warsawb2e57942017-09-14 18:13:16 -070010554 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010555}
10556
10557static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010558anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen,
10559 PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010560{
Benjamin Petersonc0b95d12011-12-20 17:24:05 -060010561 switch (kind) {
10562 case PyUnicode_1BYTE_KIND:
10563 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
10564 return asciilib_count(sbuf, slen, buf1, len1, maxcount);
10565 else
10566 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
10567 case PyUnicode_2BYTE_KIND:
10568 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
10569 case PyUnicode_4BYTE_KIND:
10570 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
10571 }
Barry Warsawb2e57942017-09-14 18:13:16 -070010572 Py_UNREACHABLE();
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010573}
10574
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010575static void
10576replace_1char_inplace(PyObject *u, Py_ssize_t pos,
10577 Py_UCS4 u1, Py_UCS4 u2, Py_ssize_t maxcount)
10578{
10579 int kind = PyUnicode_KIND(u);
10580 void *data = PyUnicode_DATA(u);
10581 Py_ssize_t len = PyUnicode_GET_LENGTH(u);
10582 if (kind == PyUnicode_1BYTE_KIND) {
10583 ucs1lib_replace_1char_inplace((Py_UCS1 *)data + pos,
10584 (Py_UCS1 *)data + len,
10585 u1, u2, maxcount);
10586 }
10587 else if (kind == PyUnicode_2BYTE_KIND) {
10588 ucs2lib_replace_1char_inplace((Py_UCS2 *)data + pos,
10589 (Py_UCS2 *)data + len,
10590 u1, u2, maxcount);
10591 }
10592 else {
10593 assert(kind == PyUnicode_4BYTE_KIND);
10594 ucs4lib_replace_1char_inplace((Py_UCS4 *)data + pos,
10595 (Py_UCS4 *)data + len,
10596 u1, u2, maxcount);
10597 }
10598}
10599
Alexander Belopolsky40018472011-02-26 01:02:56 +000010600static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010601replace(PyObject *self, PyObject *str1,
10602 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010603{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010604 PyObject *u;
10605 char *sbuf = PyUnicode_DATA(self);
10606 char *buf1 = PyUnicode_DATA(str1);
10607 char *buf2 = PyUnicode_DATA(str2);
10608 int srelease = 0, release1 = 0, release2 = 0;
10609 int skind = PyUnicode_KIND(self);
10610 int kind1 = PyUnicode_KIND(str1);
10611 int kind2 = PyUnicode_KIND(str2);
10612 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
10613 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
10614 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010615 int mayshrink;
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010616 Py_UCS4 maxchar, maxchar_str1, maxchar_str2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010617
Serhiy Storchaka865c3b22019-10-30 12:03:53 +020010618 if (slen < len1)
10619 goto nothing;
10620
Guido van Rossumd57fd912000-03-10 22:53:23 +000010621 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010622 maxcount = PY_SSIZE_T_MAX;
Serhiy Storchaka865c3b22019-10-30 12:03:53 +020010623 else if (maxcount == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010624 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010625
Victor Stinner59de0ee2011-10-07 10:01:28 +020010626 if (str1 == str2)
10627 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010628
Victor Stinner49a0a212011-10-12 23:46:10 +020010629 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010630 maxchar_str1 = PyUnicode_MAX_CHAR_VALUE(str1);
10631 if (maxchar < maxchar_str1)
10632 /* substring too wide to be present */
10633 goto nothing;
Victor Stinner49a0a212011-10-12 23:46:10 +020010634 maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
10635 /* Replacing str1 with str2 may cause a maxchar reduction in the
10636 result string. */
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010637 mayshrink = (maxchar_str2 < maxchar_str1) && (maxchar == maxchar_str1);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010638 maxchar = Py_MAX(maxchar, maxchar_str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010639
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010640 if (len1 == len2) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010641 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010642 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010643 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010644 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010645 /* replace characters */
Victor Stinner49a0a212011-10-12 23:46:10 +020010646 Py_UCS4 u1, u2;
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010647 Py_ssize_t pos;
Victor Stinnerf6441102011-12-18 02:43:08 +010010648
Victor Stinner69ed0f42013-04-09 21:48:24 +020010649 u1 = PyUnicode_READ(kind1, buf1, 0);
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010650 pos = findchar(sbuf, skind, slen, u1, 1);
Victor Stinnerf6441102011-12-18 02:43:08 +010010651 if (pos < 0)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010652 goto nothing;
Victor Stinner69ed0f42013-04-09 21:48:24 +020010653 u2 = PyUnicode_READ(kind2, buf2, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010654 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010655 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010656 goto error;
Victor Stinnerf6441102011-12-18 02:43:08 +010010657
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010658 _PyUnicode_FastCopyCharacters(u, 0, self, 0, slen);
10659 replace_1char_inplace(u, pos, u1, u2, maxcount);
Victor Stinner49a0a212011-10-12 23:46:10 +020010660 }
10661 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010662 int rkind = skind;
10663 char *res;
Victor Stinnerf6441102011-12-18 02:43:08 +010010664 Py_ssize_t i;
Victor Stinner25a4b292011-10-06 12:31:55 +020010665
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010666 if (kind1 < rkind) {
10667 /* widen substring */
10668 buf1 = _PyUnicode_AsKind(str1, rkind);
10669 if (!buf1) goto error;
10670 release1 = 1;
10671 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010672 i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010673 if (i < 0)
10674 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010675 if (rkind > kind2) {
10676 /* widen replacement */
10677 buf2 = _PyUnicode_AsKind(str2, rkind);
10678 if (!buf2) goto error;
10679 release2 = 1;
10680 }
10681 else if (rkind < kind2) {
10682 /* widen self and buf1 */
10683 rkind = kind2;
10684 if (release1) PyMem_Free(buf1);
Antoine Pitrou6d5ad222012-11-17 23:28:17 +010010685 release1 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010686 sbuf = _PyUnicode_AsKind(self, rkind);
10687 if (!sbuf) goto error;
10688 srelease = 1;
10689 buf1 = _PyUnicode_AsKind(str1, rkind);
10690 if (!buf1) goto error;
10691 release1 = 1;
10692 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010693 u = PyUnicode_New(slen, maxchar);
10694 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010695 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010696 assert(PyUnicode_KIND(u) == rkind);
10697 res = PyUnicode_DATA(u);
Victor Stinner25a4b292011-10-06 12:31:55 +020010698
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010699 memcpy(res, sbuf, rkind * slen);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010700 /* change everything in-place, starting with this one */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010701 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010702 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010703 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010704 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010705
10706 while ( --maxcount > 0) {
Victor Stinnerc3cec782011-10-05 21:24:08 +020010707 i = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010708 sbuf+rkind*i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010709 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010710 if (i == -1)
10711 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010712 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010713 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010714 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010715 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010716 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010717 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010718 }
10719 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010720 Py_ssize_t n, i, j, ires;
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010721 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010722 int rkind = skind;
10723 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010724
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010725 if (kind1 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010726 /* widen substring */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010727 buf1 = _PyUnicode_AsKind(str1, rkind);
10728 if (!buf1) goto error;
10729 release1 = 1;
10730 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010731 n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010732 if (n == 0)
10733 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010734 if (kind2 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010735 /* widen replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010736 buf2 = _PyUnicode_AsKind(str2, rkind);
10737 if (!buf2) goto error;
10738 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010739 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010740 else if (kind2 > rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010741 /* widen self and buf1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010742 rkind = kind2;
10743 sbuf = _PyUnicode_AsKind(self, rkind);
10744 if (!sbuf) goto error;
10745 srelease = 1;
10746 if (release1) PyMem_Free(buf1);
Antoine Pitrou6d5ad222012-11-17 23:28:17 +010010747 release1 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010748 buf1 = _PyUnicode_AsKind(str1, rkind);
10749 if (!buf1) goto error;
10750 release1 = 1;
10751 }
10752 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10753 PyUnicode_GET_LENGTH(str1))); */
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010754 if (len1 < len2 && len2 - len1 > (PY_SSIZE_T_MAX - slen) / n) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010755 PyErr_SetString(PyExc_OverflowError,
10756 "replace string is too long");
10757 goto error;
10758 }
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010759 new_size = slen + n * (len2 - len1);
Victor Stinner49a0a212011-10-12 23:46:10 +020010760 if (new_size == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020010761 _Py_INCREF_UNICODE_EMPTY();
10762 if (!unicode_empty)
10763 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010764 u = unicode_empty;
10765 goto done;
10766 }
Xiang Zhangb0541f42017-01-10 10:52:00 +080010767 if (new_size > (PY_SSIZE_T_MAX / rkind)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010768 PyErr_SetString(PyExc_OverflowError,
10769 "replace string is too long");
10770 goto error;
10771 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010772 u = PyUnicode_New(new_size, maxchar);
10773 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010774 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010775 assert(PyUnicode_KIND(u) == rkind);
10776 res = PyUnicode_DATA(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010777 ires = i = 0;
10778 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010779 while (n-- > 0) {
10780 /* look for next match */
Victor Stinnerc3cec782011-10-05 21:24:08 +020010781 j = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010782 sbuf + rkind * i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010783 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010784 if (j == -1)
10785 break;
10786 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010787 /* copy unchanged part [i:j] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010788 memcpy(res + rkind * ires,
10789 sbuf + rkind * i,
10790 rkind * (j-i));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010791 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010792 }
10793 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010794 if (len2 > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010795 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010796 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010797 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010798 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010799 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010800 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010801 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010802 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010803 /* copy tail [i:] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010804 memcpy(res + rkind * ires,
10805 sbuf + rkind * i,
10806 rkind * (slen-i));
Victor Stinner49a0a212011-10-12 23:46:10 +020010807 }
10808 else {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010809 /* interleave */
10810 while (n > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010811 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010812 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010813 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010814 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010815 if (--n <= 0)
10816 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010817 memcpy(res + rkind * ires,
10818 sbuf + rkind * i,
10819 rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010820 ires++;
10821 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010822 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010823 memcpy(res + rkind * ires,
10824 sbuf + rkind * i,
10825 rkind * (slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +000010826 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010827 }
10828
10829 if (mayshrink) {
Victor Stinner25a4b292011-10-06 12:31:55 +020010830 unicode_adjust_maxchar(&u);
10831 if (u == NULL)
10832 goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010833 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010834
10835 done:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010836 if (srelease)
10837 PyMem_FREE(sbuf);
10838 if (release1)
10839 PyMem_FREE(buf1);
10840 if (release2)
10841 PyMem_FREE(buf2);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010842 assert(_PyUnicode_CheckConsistency(u, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010843 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010844
Benjamin Peterson29060642009-01-31 22:14:21 +000010845 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +000010846 /* nothing to replace; return original string (when possible) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010847 if (srelease)
10848 PyMem_FREE(sbuf);
10849 if (release1)
10850 PyMem_FREE(buf1);
10851 if (release2)
10852 PyMem_FREE(buf2);
Victor Stinnerc4b49542011-12-11 22:44:26 +010010853 return unicode_result_unchanged(self);
10854
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010855 error:
10856 if (srelease && sbuf)
10857 PyMem_FREE(sbuf);
10858 if (release1 && buf1)
10859 PyMem_FREE(buf1);
10860 if (release2 && buf2)
10861 PyMem_FREE(buf2);
10862 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010863}
10864
10865/* --- Unicode Object Methods --------------------------------------------- */
10866
INADA Naoki3ae20562017-01-16 20:41:20 +090010867/*[clinic input]
10868str.title as unicode_title
Guido van Rossumd57fd912000-03-10 22:53:23 +000010869
INADA Naoki3ae20562017-01-16 20:41:20 +090010870Return a version of the string where each word is titlecased.
10871
10872More specifically, words start with uppercased characters and all remaining
10873cased characters have lower case.
10874[clinic start generated code]*/
10875
10876static PyObject *
10877unicode_title_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090010878/*[clinic end generated code: output=c75ae03809574902 input=fa945d669b26e683]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000010879{
Benjamin Petersoneea48462012-01-16 14:28:50 -050010880 if (PyUnicode_READY(self) == -1)
10881 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010882 return case_operation(self, do_title);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010883}
10884
INADA Naoki3ae20562017-01-16 20:41:20 +090010885/*[clinic input]
10886str.capitalize as unicode_capitalize
Guido van Rossumd57fd912000-03-10 22:53:23 +000010887
INADA Naoki3ae20562017-01-16 20:41:20 +090010888Return a capitalized version of the string.
10889
10890More specifically, make the first character have upper case and the rest lower
10891case.
10892[clinic start generated code]*/
10893
10894static PyObject *
10895unicode_capitalize_impl(PyObject *self)
10896/*[clinic end generated code: output=e49a4c333cdb7667 input=f4cbf1016938da6d]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000010897{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010898 if (PyUnicode_READY(self) == -1)
10899 return NULL;
10900 if (PyUnicode_GET_LENGTH(self) == 0)
10901 return unicode_result_unchanged(self);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010902 return case_operation(self, do_capitalize);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010903}
10904
INADA Naoki3ae20562017-01-16 20:41:20 +090010905/*[clinic input]
10906str.casefold as unicode_casefold
10907
10908Return a version of the string suitable for caseless comparisons.
10909[clinic start generated code]*/
Benjamin Petersond5890c82012-01-14 13:23:30 -050010910
10911static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090010912unicode_casefold_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090010913/*[clinic end generated code: output=0120daf657ca40af input=384d66cc2ae30daf]*/
Benjamin Petersond5890c82012-01-14 13:23:30 -050010914{
10915 if (PyUnicode_READY(self) == -1)
10916 return NULL;
10917 if (PyUnicode_IS_ASCII(self))
10918 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010919 return case_operation(self, do_casefold);
Benjamin Petersond5890c82012-01-14 13:23:30 -050010920}
10921
10922
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010923/* Argument converter. Accepts a single Unicode character. */
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010924
10925static int
10926convert_uc(PyObject *obj, void *addr)
10927{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010928 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010929
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010930 if (!PyUnicode_Check(obj)) {
10931 PyErr_Format(PyExc_TypeError,
10932 "The fill character must be a unicode character, "
Victor Stinner998b8062018-09-12 00:23:25 +020010933 "not %.100s", Py_TYPE(obj)->tp_name);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010934 return 0;
10935 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010936 if (PyUnicode_READY(obj) < 0)
10937 return 0;
10938 if (PyUnicode_GET_LENGTH(obj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010939 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010940 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010941 return 0;
10942 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010943 *fillcharloc = PyUnicode_READ_CHAR(obj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010944 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010945}
10946
INADA Naoki3ae20562017-01-16 20:41:20 +090010947/*[clinic input]
10948str.center as unicode_center
10949
10950 width: Py_ssize_t
10951 fillchar: Py_UCS4 = ' '
10952 /
10953
10954Return a centered string of length width.
10955
10956Padding is done using the specified fill character (default is a space).
10957[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000010958
10959static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090010960unicode_center_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
10961/*[clinic end generated code: output=420c8859effc7c0c input=b42b247eb26e6519]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000010962{
Martin v. Löwis18e16552006-02-15 17:27:45 +000010963 Py_ssize_t marg, left;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010964
Benjamin Petersonbac79492012-01-14 13:34:47 -050010965 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010966 return NULL;
10967
Victor Stinnerc4b49542011-12-11 22:44:26 +010010968 if (PyUnicode_GET_LENGTH(self) >= width)
10969 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010970
Victor Stinnerc4b49542011-12-11 22:44:26 +010010971 marg = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010972 left = marg / 2 + (marg & width & 1);
10973
Victor Stinner9310abb2011-10-05 00:59:23 +020010974 return pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010975}
10976
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010977/* This function assumes that str1 and str2 are readied by the caller. */
10978
Marc-André Lemburge5034372000-08-08 08:04:29 +000010979static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010980unicode_compare(PyObject *str1, PyObject *str2)
Marc-André Lemburge5034372000-08-08 08:04:29 +000010981{
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010982#define COMPARE(TYPE1, TYPE2) \
10983 do { \
10984 TYPE1* p1 = (TYPE1 *)data1; \
10985 TYPE2* p2 = (TYPE2 *)data2; \
10986 TYPE1* end = p1 + len; \
10987 Py_UCS4 c1, c2; \
10988 for (; p1 != end; p1++, p2++) { \
10989 c1 = *p1; \
10990 c2 = *p2; \
10991 if (c1 != c2) \
10992 return (c1 < c2) ? -1 : 1; \
10993 } \
10994 } \
10995 while (0)
10996
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010997 int kind1, kind2;
10998 void *data1, *data2;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010999 Py_ssize_t len1, len2, len;
Marc-André Lemburge5034372000-08-08 08:04:29 +000011000
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011001 kind1 = PyUnicode_KIND(str1);
11002 kind2 = PyUnicode_KIND(str2);
11003 data1 = PyUnicode_DATA(str1);
11004 data2 = PyUnicode_DATA(str2);
11005 len1 = PyUnicode_GET_LENGTH(str1);
11006 len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner770e19e2012-10-04 22:59:45 +020011007 len = Py_MIN(len1, len2);
Marc-André Lemburge5034372000-08-08 08:04:29 +000011008
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011009 switch(kind1) {
11010 case PyUnicode_1BYTE_KIND:
11011 {
11012 switch(kind2) {
11013 case PyUnicode_1BYTE_KIND:
11014 {
11015 int cmp = memcmp(data1, data2, len);
11016 /* normalize result of memcmp() into the range [-1; 1] */
11017 if (cmp < 0)
11018 return -1;
11019 if (cmp > 0)
11020 return 1;
11021 break;
Victor Stinner770e19e2012-10-04 22:59:45 +020011022 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011023 case PyUnicode_2BYTE_KIND:
11024 COMPARE(Py_UCS1, Py_UCS2);
11025 break;
11026 case PyUnicode_4BYTE_KIND:
11027 COMPARE(Py_UCS1, Py_UCS4);
11028 break;
11029 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011030 Py_UNREACHABLE();
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011031 }
11032 break;
11033 }
11034 case PyUnicode_2BYTE_KIND:
11035 {
11036 switch(kind2) {
11037 case PyUnicode_1BYTE_KIND:
11038 COMPARE(Py_UCS2, Py_UCS1);
11039 break;
11040 case PyUnicode_2BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020011041 {
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011042 COMPARE(Py_UCS2, Py_UCS2);
11043 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020011044 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011045 case PyUnicode_4BYTE_KIND:
11046 COMPARE(Py_UCS2, Py_UCS4);
11047 break;
11048 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011049 Py_UNREACHABLE();
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011050 }
11051 break;
11052 }
11053 case PyUnicode_4BYTE_KIND:
11054 {
11055 switch(kind2) {
11056 case PyUnicode_1BYTE_KIND:
11057 COMPARE(Py_UCS4, Py_UCS1);
11058 break;
11059 case PyUnicode_2BYTE_KIND:
11060 COMPARE(Py_UCS4, Py_UCS2);
11061 break;
11062 case PyUnicode_4BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020011063 {
11064#if defined(HAVE_WMEMCMP) && SIZEOF_WCHAR_T == 4
11065 int cmp = wmemcmp((wchar_t *)data1, (wchar_t *)data2, len);
11066 /* normalize result of wmemcmp() into the range [-1; 1] */
11067 if (cmp < 0)
11068 return -1;
11069 if (cmp > 0)
11070 return 1;
11071#else
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011072 COMPARE(Py_UCS4, Py_UCS4);
Victor Stinnercd777ea2013-04-08 22:43:44 +020011073#endif
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011074 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020011075 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011076 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011077 Py_UNREACHABLE();
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011078 }
11079 break;
11080 }
11081 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011082 Py_UNREACHABLE();
Marc-André Lemburge5034372000-08-08 08:04:29 +000011083 }
11084
Victor Stinner770e19e2012-10-04 22:59:45 +020011085 if (len1 == len2)
11086 return 0;
11087 if (len1 < len2)
11088 return -1;
11089 else
11090 return 1;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011091
11092#undef COMPARE
Marc-André Lemburge5034372000-08-08 08:04:29 +000011093}
11094
Benjamin Peterson621b4302016-09-09 13:54:34 -070011095static int
Victor Stinnere5567ad2012-10-23 02:48:49 +020011096unicode_compare_eq(PyObject *str1, PyObject *str2)
11097{
11098 int kind;
11099 void *data1, *data2;
11100 Py_ssize_t len;
11101 int cmp;
11102
Victor Stinnere5567ad2012-10-23 02:48:49 +020011103 len = PyUnicode_GET_LENGTH(str1);
11104 if (PyUnicode_GET_LENGTH(str2) != len)
11105 return 0;
11106 kind = PyUnicode_KIND(str1);
11107 if (PyUnicode_KIND(str2) != kind)
11108 return 0;
11109 data1 = PyUnicode_DATA(str1);
11110 data2 = PyUnicode_DATA(str2);
11111
11112 cmp = memcmp(data1, data2, len * kind);
11113 return (cmp == 0);
11114}
11115
11116
Alexander Belopolsky40018472011-02-26 01:02:56 +000011117int
11118PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011119{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011120 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
11121 if (PyUnicode_READY(left) == -1 ||
11122 PyUnicode_READY(right) == -1)
11123 return -1;
Victor Stinnerf0c7b2a2013-11-04 11:27:14 +010011124
11125 /* a string is equal to itself */
11126 if (left == right)
11127 return 0;
11128
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011129 return unicode_compare(left, right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011130 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +000011131 PyErr_Format(PyExc_TypeError,
11132 "Can't compare %.100s and %.100s",
Victor Stinner58ac7002020-02-07 03:04:21 +010011133 Py_TYPE(left)->tp_name,
11134 Py_TYPE(right)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011135 return -1;
11136}
11137
Martin v. Löwis5b222132007-06-10 09:51:05 +000011138int
11139PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
11140{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011141 Py_ssize_t i;
11142 int kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011143 Py_UCS4 chr;
Serhiy Storchaka419967b2016-12-06 00:13:34 +020011144 const unsigned char *ustr = (const unsigned char *)str;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011145
Victor Stinner910337b2011-10-03 03:20:16 +020011146 assert(_PyUnicode_CHECK(uni));
Serhiy Storchaka419967b2016-12-06 00:13:34 +020011147 if (!PyUnicode_IS_READY(uni)) {
11148 const wchar_t *ws = _PyUnicode_WSTR(uni);
11149 /* Compare Unicode string and source character set string */
11150 for (i = 0; (chr = ws[i]) && ustr[i]; i++) {
11151 if (chr != ustr[i])
11152 return (chr < ustr[i]) ? -1 : 1;
11153 }
11154 /* This check keeps Python strings that end in '\0' from comparing equal
11155 to C strings identical up to that point. */
11156 if (_PyUnicode_WSTR_LENGTH(uni) != i || chr)
11157 return 1; /* uni is longer */
11158 if (ustr[i])
11159 return -1; /* str is longer */
11160 return 0;
11161 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011162 kind = PyUnicode_KIND(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010011163 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnera6b9b072013-10-30 18:27:13 +010011164 const void *data = PyUnicode_1BYTE_DATA(uni);
Victor Stinnere1b15922013-11-03 13:53:12 +010011165 size_t len1 = (size_t)PyUnicode_GET_LENGTH(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010011166 size_t len, len2 = strlen(str);
11167 int cmp;
11168
11169 len = Py_MIN(len1, len2);
11170 cmp = memcmp(data, str, len);
Victor Stinner21ea21e2013-11-04 11:28:26 +010011171 if (cmp != 0) {
11172 if (cmp < 0)
11173 return -1;
11174 else
11175 return 1;
11176 }
Victor Stinner602f7cf2013-10-29 23:31:50 +010011177 if (len1 > len2)
11178 return 1; /* uni is longer */
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011179 if (len1 < len2)
Victor Stinner602f7cf2013-10-29 23:31:50 +010011180 return -1; /* str is longer */
11181 return 0;
11182 }
11183 else {
11184 void *data = PyUnicode_DATA(uni);
11185 /* Compare Unicode string and source character set string */
11186 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
Victor Stinner12174a52014-08-15 23:17:38 +020011187 if (chr != (unsigned char)str[i])
Victor Stinner602f7cf2013-10-29 23:31:50 +010011188 return (chr < (unsigned char)(str[i])) ? -1 : 1;
11189 /* This check keeps Python strings that end in '\0' from comparing equal
11190 to C strings identical up to that point. */
11191 if (PyUnicode_GET_LENGTH(uni) != i || chr)
11192 return 1; /* uni is longer */
11193 if (str[i])
11194 return -1; /* str is longer */
11195 return 0;
11196 }
Martin v. Löwis5b222132007-06-10 09:51:05 +000011197}
11198
Serhiy Storchakaf4934ea2016-11-16 10:17:58 +020011199static int
11200non_ready_unicode_equal_to_ascii_string(PyObject *unicode, const char *str)
11201{
11202 size_t i, len;
11203 const wchar_t *p;
11204 len = (size_t)_PyUnicode_WSTR_LENGTH(unicode);
11205 if (strlen(str) != len)
11206 return 0;
11207 p = _PyUnicode_WSTR(unicode);
11208 assert(p);
11209 for (i = 0; i < len; i++) {
11210 unsigned char c = (unsigned char)str[i];
Serhiy Storchaka292dd1b2016-11-16 16:12:34 +020011211 if (c >= 128 || p[i] != (wchar_t)c)
Serhiy Storchakaf4934ea2016-11-16 10:17:58 +020011212 return 0;
11213 }
11214 return 1;
11215}
11216
11217int
11218_PyUnicode_EqualToASCIIString(PyObject *unicode, const char *str)
11219{
11220 size_t len;
11221 assert(_PyUnicode_CHECK(unicode));
Serhiy Storchakaa83a6a32016-11-16 20:02:44 +020011222 assert(str);
11223#ifndef NDEBUG
11224 for (const char *p = str; *p; p++) {
11225 assert((unsigned char)*p < 128);
11226 }
11227#endif
Serhiy Storchakaf4934ea2016-11-16 10:17:58 +020011228 if (PyUnicode_READY(unicode) == -1) {
11229 /* Memory error or bad data */
11230 PyErr_Clear();
11231 return non_ready_unicode_equal_to_ascii_string(unicode, str);
11232 }
11233 if (!PyUnicode_IS_ASCII(unicode))
11234 return 0;
11235 len = (size_t)PyUnicode_GET_LENGTH(unicode);
11236 return strlen(str) == len &&
11237 memcmp(PyUnicode_1BYTE_DATA(unicode), str, len) == 0;
11238}
11239
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011240int
11241_PyUnicode_EqualToASCIIId(PyObject *left, _Py_Identifier *right)
11242{
11243 PyObject *right_uni;
11244 Py_hash_t hash;
11245
11246 assert(_PyUnicode_CHECK(left));
11247 assert(right->string);
Serhiy Storchakaa83a6a32016-11-16 20:02:44 +020011248#ifndef NDEBUG
11249 for (const char *p = right->string; *p; p++) {
11250 assert((unsigned char)*p < 128);
11251 }
11252#endif
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011253
11254 if (PyUnicode_READY(left) == -1) {
11255 /* memory error or bad data */
11256 PyErr_Clear();
11257 return non_ready_unicode_equal_to_ascii_string(left, right->string);
11258 }
11259
11260 if (!PyUnicode_IS_ASCII(left))
11261 return 0;
11262
11263 right_uni = _PyUnicode_FromId(right); /* borrowed */
11264 if (right_uni == NULL) {
11265 /* memory error or bad data */
11266 PyErr_Clear();
11267 return _PyUnicode_EqualToASCIIString(left, right->string);
11268 }
11269
11270 if (left == right_uni)
11271 return 1;
11272
11273 if (PyUnicode_CHECK_INTERNED(left))
11274 return 0;
11275
INADA Naoki7cc95f52018-01-28 02:07:09 +090011276 assert(_PyUnicode_HASH(right_uni) != -1);
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011277 hash = _PyUnicode_HASH(left);
11278 if (hash != -1 && hash != _PyUnicode_HASH(right_uni))
11279 return 0;
11280
11281 return unicode_compare_eq(left, right_uni);
11282}
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000011283
Alexander Belopolsky40018472011-02-26 01:02:56 +000011284PyObject *
11285PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011286{
11287 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011288
Victor Stinnere5567ad2012-10-23 02:48:49 +020011289 if (!PyUnicode_Check(left) || !PyUnicode_Check(right))
11290 Py_RETURN_NOTIMPLEMENTED;
11291
11292 if (PyUnicode_READY(left) == -1 ||
11293 PyUnicode_READY(right) == -1)
11294 return NULL;
11295
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010011296 if (left == right) {
11297 switch (op) {
11298 case Py_EQ:
11299 case Py_LE:
11300 case Py_GE:
11301 /* a string is equal to itself */
stratakise8b19652017-11-02 11:32:54 +010011302 Py_RETURN_TRUE;
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010011303 case Py_NE:
11304 case Py_LT:
11305 case Py_GT:
stratakise8b19652017-11-02 11:32:54 +010011306 Py_RETURN_FALSE;
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010011307 default:
11308 PyErr_BadArgument();
11309 return NULL;
11310 }
11311 }
11312 else if (op == Py_EQ || op == Py_NE) {
Victor Stinnere5567ad2012-10-23 02:48:49 +020011313 result = unicode_compare_eq(left, right);
Victor Stinnerc8bc5372013-11-04 11:08:10 +010011314 result ^= (op == Py_NE);
stratakise8b19652017-11-02 11:32:54 +010011315 return PyBool_FromLong(result);
Victor Stinnere5567ad2012-10-23 02:48:49 +020011316 }
11317 else {
Victor Stinner90db9c42012-10-04 21:53:50 +020011318 result = unicode_compare(left, right);
stratakise8b19652017-11-02 11:32:54 +010011319 Py_RETURN_RICHCOMPARE(result, 0, op);
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011320 }
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011321}
11322
Alexander Belopolsky40018472011-02-26 01:02:56 +000011323int
Raymond Hettingerac2ef652015-07-04 16:04:44 -070011324_PyUnicode_EQ(PyObject *aa, PyObject *bb)
11325{
11326 return unicode_eq(aa, bb);
11327}
11328
11329int
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011330PyUnicode_Contains(PyObject *str, PyObject *substr)
Guido van Rossum403d68b2000-03-13 15:55:09 +000011331{
Victor Stinner77282cb2013-04-14 19:22:47 +020011332 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011333 void *buf1, *buf2;
11334 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011335 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011336
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011337 if (!PyUnicode_Check(substr)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011338 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020011339 "'in <string>' requires string as left operand, not %.100s",
11340 Py_TYPE(substr)->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011341 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011342 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011343 if (PyUnicode_READY(substr) == -1)
Thomas Wouters477c8d52006-05-27 19:21:47 +000011344 return -1;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011345 if (ensure_unicode(str) < 0)
11346 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011347
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011348 kind1 = PyUnicode_KIND(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011349 kind2 = PyUnicode_KIND(substr);
11350 if (kind1 < kind2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011351 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011352 len1 = PyUnicode_GET_LENGTH(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011353 len2 = PyUnicode_GET_LENGTH(substr);
11354 if (len1 < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011355 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011356 buf1 = PyUnicode_DATA(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011357 buf2 = PyUnicode_DATA(substr);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011358 if (len2 == 1) {
11359 Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
11360 result = findchar((const char *)buf1, kind1, len1, ch, 1) != -1;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011361 return result;
11362 }
11363 if (kind2 != kind1) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011364 buf2 = _PyUnicode_AsKind(substr, kind1);
11365 if (!buf2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011366 return -1;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011367 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011368
Victor Stinner77282cb2013-04-14 19:22:47 +020011369 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011370 case PyUnicode_1BYTE_KIND:
11371 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
11372 break;
11373 case PyUnicode_2BYTE_KIND:
11374 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
11375 break;
11376 case PyUnicode_4BYTE_KIND:
11377 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
11378 break;
11379 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011380 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011381 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011382
Victor Stinner77282cb2013-04-14 19:22:47 +020011383 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011384 PyMem_Free(buf2);
11385
Guido van Rossum403d68b2000-03-13 15:55:09 +000011386 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011387}
11388
Guido van Rossumd57fd912000-03-10 22:53:23 +000011389/* Concat to string or Unicode object giving a new Unicode object. */
11390
Alexander Belopolsky40018472011-02-26 01:02:56 +000011391PyObject *
11392PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011393{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011394 PyObject *result;
Victor Stinner127226b2011-10-13 01:12:34 +020011395 Py_UCS4 maxchar, maxchar2;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011396 Py_ssize_t left_len, right_len, new_len;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011397
Serhiy Storchaka004e03f2017-03-19 19:38:42 +020011398 if (ensure_unicode(left) < 0)
11399 return NULL;
11400
11401 if (!PyUnicode_Check(right)) {
11402 PyErr_Format(PyExc_TypeError,
11403 "can only concatenate str (not \"%.200s\") to str",
Victor Stinner58ac7002020-02-07 03:04:21 +010011404 Py_TYPE(right)->tp_name);
Serhiy Storchaka004e03f2017-03-19 19:38:42 +020011405 return NULL;
11406 }
11407 if (PyUnicode_READY(right) < 0)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011408 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011409
11410 /* Shortcuts */
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011411 if (left == unicode_empty)
11412 return PyUnicode_FromObject(right);
11413 if (right == unicode_empty)
11414 return PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011415
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011416 left_len = PyUnicode_GET_LENGTH(left);
11417 right_len = PyUnicode_GET_LENGTH(right);
11418 if (left_len > PY_SSIZE_T_MAX - right_len) {
Victor Stinner488fa492011-12-12 00:01:39 +010011419 PyErr_SetString(PyExc_OverflowError,
11420 "strings are too large to concat");
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011421 return NULL;
Victor Stinner488fa492011-12-12 00:01:39 +010011422 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011423 new_len = left_len + right_len;
Victor Stinner488fa492011-12-12 00:01:39 +010011424
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011425 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11426 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Benjamin Peterson7e303732013-06-10 09:19:46 -070011427 maxchar = Py_MAX(maxchar, maxchar2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011428
Guido van Rossumd57fd912000-03-10 22:53:23 +000011429 /* Concat the two Unicode strings */
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011430 result = PyUnicode_New(new_len, maxchar);
11431 if (result == NULL)
11432 return NULL;
11433 _PyUnicode_FastCopyCharacters(result, 0, left, 0, left_len);
11434 _PyUnicode_FastCopyCharacters(result, left_len, right, 0, right_len);
11435 assert(_PyUnicode_CheckConsistency(result, 1));
11436 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011437}
11438
Walter Dörwald1ab83302007-05-18 17:15:44 +000011439void
Victor Stinner23e56682011-10-03 03:54:37 +020011440PyUnicode_Append(PyObject **p_left, PyObject *right)
Walter Dörwald1ab83302007-05-18 17:15:44 +000011441{
Victor Stinner23e56682011-10-03 03:54:37 +020011442 PyObject *left, *res;
Victor Stinner488fa492011-12-12 00:01:39 +010011443 Py_UCS4 maxchar, maxchar2;
11444 Py_ssize_t left_len, right_len, new_len;
Victor Stinner23e56682011-10-03 03:54:37 +020011445
11446 if (p_left == NULL) {
11447 if (!PyErr_Occurred())
11448 PyErr_BadInternalCall();
Benjamin Peterson14339b62009-01-31 16:36:08 +000011449 return;
11450 }
Victor Stinner23e56682011-10-03 03:54:37 +020011451 left = *p_left;
Victor Stinnerf0335102013-04-14 19:13:03 +020011452 if (right == NULL || left == NULL
11453 || !PyUnicode_Check(left) || !PyUnicode_Check(right)) {
Victor Stinner23e56682011-10-03 03:54:37 +020011454 if (!PyErr_Occurred())
11455 PyErr_BadInternalCall();
11456 goto error;
11457 }
11458
Benjamin Petersonbac79492012-01-14 13:34:47 -050011459 if (PyUnicode_READY(left) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011460 goto error;
Benjamin Petersonbac79492012-01-14 13:34:47 -050011461 if (PyUnicode_READY(right) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011462 goto error;
11463
Victor Stinner488fa492011-12-12 00:01:39 +010011464 /* Shortcuts */
11465 if (left == unicode_empty) {
11466 Py_DECREF(left);
11467 Py_INCREF(right);
11468 *p_left = right;
11469 return;
11470 }
11471 if (right == unicode_empty)
11472 return;
11473
11474 left_len = PyUnicode_GET_LENGTH(left);
11475 right_len = PyUnicode_GET_LENGTH(right);
11476 if (left_len > PY_SSIZE_T_MAX - right_len) {
11477 PyErr_SetString(PyExc_OverflowError,
11478 "strings are too large to concat");
11479 goto error;
11480 }
11481 new_len = left_len + right_len;
11482
11483 if (unicode_modifiable(left)
11484 && PyUnicode_CheckExact(right)
11485 && PyUnicode_KIND(right) <= PyUnicode_KIND(left)
Victor Stinnerb0923652011-10-04 01:17:31 +020011486 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
11487 to change the structure size, but characters are stored just after
Georg Brandl7597add2011-10-05 16:36:47 +020011488 the structure, and so it requires to move all characters which is
Victor Stinnerb0923652011-10-04 01:17:31 +020011489 not so different than duplicating the string. */
Victor Stinner488fa492011-12-12 00:01:39 +010011490 && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
11491 {
11492 /* append inplace */
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011493 if (unicode_resize(p_left, new_len) != 0)
Victor Stinner488fa492011-12-12 00:01:39 +010011494 goto error;
Victor Stinnerf0335102013-04-14 19:13:03 +020011495
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011496 /* copy 'right' into the newly allocated area of 'left' */
11497 _PyUnicode_FastCopyCharacters(*p_left, left_len, right, 0, right_len);
Victor Stinner23e56682011-10-03 03:54:37 +020011498 }
Victor Stinner488fa492011-12-12 00:01:39 +010011499 else {
11500 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11501 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Benjamin Peterson7e303732013-06-10 09:19:46 -070011502 maxchar = Py_MAX(maxchar, maxchar2);
Victor Stinner23e56682011-10-03 03:54:37 +020011503
Victor Stinner488fa492011-12-12 00:01:39 +010011504 /* Concat the two Unicode strings */
11505 res = PyUnicode_New(new_len, maxchar);
11506 if (res == NULL)
11507 goto error;
Victor Stinnerd3f08822012-05-29 12:57:52 +020011508 _PyUnicode_FastCopyCharacters(res, 0, left, 0, left_len);
11509 _PyUnicode_FastCopyCharacters(res, left_len, right, 0, right_len);
Victor Stinner488fa492011-12-12 00:01:39 +010011510 Py_DECREF(left);
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011511 *p_left = res;
Victor Stinner488fa492011-12-12 00:01:39 +010011512 }
11513 assert(_PyUnicode_CheckConsistency(*p_left, 1));
Victor Stinner23e56682011-10-03 03:54:37 +020011514 return;
11515
11516error:
Victor Stinner488fa492011-12-12 00:01:39 +010011517 Py_CLEAR(*p_left);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011518}
11519
11520void
11521PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
11522{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011523 PyUnicode_Append(pleft, right);
11524 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011525}
11526
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011527/*
11528Wraps stringlib_parse_args_finds() and additionally ensures that the
11529first argument is a unicode object.
11530*/
11531
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070011532static inline int
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011533parse_args_finds_unicode(const char * function_name, PyObject *args,
11534 PyObject **substring,
11535 Py_ssize_t *start, Py_ssize_t *end)
11536{
11537 if(stringlib_parse_args_finds(function_name, args, substring,
11538 start, end)) {
11539 if (ensure_unicode(*substring) < 0)
11540 return 0;
11541 return 1;
11542 }
11543 return 0;
11544}
11545
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011546PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011547 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011548\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011549Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011550string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011551interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011552
11553static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011554unicode_count(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011555{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011556 PyObject *substring = NULL; /* initialize to fix a compiler warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +000011557 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011558 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011559 PyObject *result;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011560 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011561 void *buf1, *buf2;
11562 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011563
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011564 if (!parse_args_finds_unicode("count", args, &substring, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000011565 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000011566
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011567 kind1 = PyUnicode_KIND(self);
11568 kind2 = PyUnicode_KIND(substring);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011569 if (kind1 < kind2)
Benjamin Petersonb63f49f2012-05-03 18:31:07 -040011570 return PyLong_FromLong(0);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011571
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011572 len1 = PyUnicode_GET_LENGTH(self);
11573 len2 = PyUnicode_GET_LENGTH(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011574 ADJUST_INDICES(start, end, len1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011575 if (end - start < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011576 return PyLong_FromLong(0);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011577
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011578 buf1 = PyUnicode_DATA(self);
11579 buf2 = PyUnicode_DATA(substring);
11580 if (kind2 != kind1) {
11581 buf2 = _PyUnicode_AsKind(substring, kind1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011582 if (!buf2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011583 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011584 }
11585 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011586 case PyUnicode_1BYTE_KIND:
11587 iresult = ucs1lib_count(
11588 ((Py_UCS1*)buf1) + start, end - start,
11589 buf2, len2, PY_SSIZE_T_MAX
11590 );
11591 break;
11592 case PyUnicode_2BYTE_KIND:
11593 iresult = ucs2lib_count(
11594 ((Py_UCS2*)buf1) + start, end - start,
11595 buf2, len2, PY_SSIZE_T_MAX
11596 );
11597 break;
11598 case PyUnicode_4BYTE_KIND:
11599 iresult = ucs4lib_count(
11600 ((Py_UCS4*)buf1) + start, end - start,
11601 buf2, len2, PY_SSIZE_T_MAX
11602 );
11603 break;
11604 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011605 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011606 }
11607
11608 result = PyLong_FromSsize_t(iresult);
11609
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011610 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011611 PyMem_Free(buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011612
Guido van Rossumd57fd912000-03-10 22:53:23 +000011613 return result;
11614}
11615
INADA Naoki3ae20562017-01-16 20:41:20 +090011616/*[clinic input]
11617str.encode as unicode_encode
11618
11619 encoding: str(c_default="NULL") = 'utf-8'
11620 The encoding in which to encode the string.
11621 errors: str(c_default="NULL") = 'strict'
11622 The error handling scheme to use for encoding errors.
11623 The default is 'strict' meaning that encoding errors raise a
11624 UnicodeEncodeError. Other possible values are 'ignore', 'replace' and
11625 'xmlcharrefreplace' as well as any other name registered with
11626 codecs.register_error that can handle UnicodeEncodeErrors.
11627
11628Encode the string using the codec registered for encoding.
11629[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011630
11631static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090011632unicode_encode_impl(PyObject *self, const char *encoding, const char *errors)
INADA Naoki15f94592017-01-16 21:49:13 +090011633/*[clinic end generated code: output=bf78b6e2a9470e3c input=f0a9eb293d08fe02]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011634{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011635 return PyUnicode_AsEncodedString(self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +000011636}
11637
INADA Naoki3ae20562017-01-16 20:41:20 +090011638/*[clinic input]
11639str.expandtabs as unicode_expandtabs
Guido van Rossumd57fd912000-03-10 22:53:23 +000011640
INADA Naoki3ae20562017-01-16 20:41:20 +090011641 tabsize: int = 8
11642
11643Return a copy where all tab characters are expanded using spaces.
11644
11645If tabsize is not given, a tab size of 8 characters is assumed.
11646[clinic start generated code]*/
11647
11648static PyObject *
11649unicode_expandtabs_impl(PyObject *self, int tabsize)
11650/*[clinic end generated code: output=3457c5dcee26928f input=8a01914034af4c85]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011651{
Antoine Pitroue71d5742011-10-04 15:55:09 +020011652 Py_ssize_t i, j, line_pos, src_len, incr;
11653 Py_UCS4 ch;
11654 PyObject *u;
11655 void *src_data, *dest_data;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011656 int kind;
Antoine Pitroue19aa382011-10-04 16:04:01 +020011657 int found;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011658
Antoine Pitrou22425222011-10-04 19:10:51 +020011659 if (PyUnicode_READY(self) == -1)
11660 return NULL;
11661
Thomas Wouters7e474022000-07-16 12:04:32 +000011662 /* First pass: determine size of output string */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011663 src_len = PyUnicode_GET_LENGTH(self);
11664 i = j = line_pos = 0;
11665 kind = PyUnicode_KIND(self);
11666 src_data = PyUnicode_DATA(self);
Antoine Pitroue19aa382011-10-04 16:04:01 +020011667 found = 0;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011668 for (; i < src_len; i++) {
11669 ch = PyUnicode_READ(kind, src_data, i);
11670 if (ch == '\t') {
Antoine Pitroue19aa382011-10-04 16:04:01 +020011671 found = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000011672 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011673 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
Benjamin Peterson29060642009-01-31 22:14:21 +000011674 if (j > PY_SSIZE_T_MAX - incr)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011675 goto overflow;
11676 line_pos += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011677 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011678 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011679 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011680 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000011681 if (j > PY_SSIZE_T_MAX - 1)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011682 goto overflow;
11683 line_pos++;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011684 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011685 if (ch == '\n' || ch == '\r')
11686 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011687 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011688 }
Victor Stinnerc4b49542011-12-11 22:44:26 +010011689 if (!found)
11690 return unicode_result_unchanged(self);
Guido van Rossumcd16bf62007-06-13 18:07:49 +000011691
Guido van Rossumd57fd912000-03-10 22:53:23 +000011692 /* Second pass: create output string and fill it */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011693 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011694 if (!u)
11695 return NULL;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011696 dest_data = PyUnicode_DATA(u);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011697
Antoine Pitroue71d5742011-10-04 15:55:09 +020011698 i = j = line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011699
Antoine Pitroue71d5742011-10-04 15:55:09 +020011700 for (; i < src_len; i++) {
11701 ch = PyUnicode_READ(kind, src_data, i);
11702 if (ch == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000011703 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011704 incr = tabsize - (line_pos % tabsize);
11705 line_pos += incr;
Victor Stinner59423e32018-11-26 13:40:01 +010011706 unicode_fill(kind, dest_data, ' ', j, incr);
Victor Stinnerda79e632012-02-22 13:37:04 +010011707 j += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011708 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011709 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011710 else {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011711 line_pos++;
11712 PyUnicode_WRITE(kind, dest_data, j, ch);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011713 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011714 if (ch == '\n' || ch == '\r')
11715 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011716 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011717 }
11718 assert (j == PyUnicode_GET_LENGTH(u));
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010011719 return unicode_result(u);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011720
Antoine Pitroue71d5742011-10-04 15:55:09 +020011721 overflow:
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011722 PyErr_SetString(PyExc_OverflowError, "new string is too long");
11723 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011724}
11725
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011726PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011727 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011728\n\
11729Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080011730such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011731arguments start and end are interpreted as in slice notation.\n\
11732\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011733Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011734
11735static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011736unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011737{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011738 /* initialize variables to prevent gcc warning */
11739 PyObject *substring = NULL;
11740 Py_ssize_t start = 0;
11741 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011742 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011743
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011744 if (!parse_args_finds_unicode("find", args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011745 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011746
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011747 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011748 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011749
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011750 result = any_find_slice(self, substring, start, end, 1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011751
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011752 if (result == -2)
11753 return NULL;
11754
Christian Heimes217cfd12007-12-02 14:31:20 +000011755 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011756}
11757
11758static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011759unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011760{
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011761 void *data;
11762 enum PyUnicode_Kind kind;
11763 Py_UCS4 ch;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011764
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +030011765 if (!PyUnicode_Check(self)) {
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011766 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011767 return NULL;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011768 }
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +030011769 if (PyUnicode_READY(self) == -1) {
11770 return NULL;
11771 }
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011772 if (index < 0 || index >= PyUnicode_GET_LENGTH(self)) {
11773 PyErr_SetString(PyExc_IndexError, "string index out of range");
11774 return NULL;
11775 }
11776 kind = PyUnicode_KIND(self);
11777 data = PyUnicode_DATA(self);
11778 ch = PyUnicode_READ(kind, data, index);
Victor Stinner985a82a2014-01-03 12:53:47 +010011779 return unicode_char(ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011780}
11781
Guido van Rossumc2504932007-09-18 19:42:40 +000011782/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +010011783 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +000011784static Py_hash_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011785unicode_hash(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011786{
Gregory P. Smith27cbcd62012-12-10 18:15:46 -080011787 Py_uhash_t x; /* Unsigned for defined overflow behavior. */
Guido van Rossumc2504932007-09-18 19:42:40 +000011788
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011789#ifdef Py_DEBUG
Benjamin Peterson69e97272012-02-21 11:08:50 -050011790 assert(_Py_HashSecret_Initialized);
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011791#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011792 if (_PyUnicode_HASH(self) != -1)
11793 return _PyUnicode_HASH(self);
11794 if (PyUnicode_READY(self) == -1)
11795 return -1;
animalizea1d14252019-01-02 20:16:06 +080011796
Christian Heimes985ecdc2013-11-20 11:46:18 +010011797 x = _Py_HashBytes(PyUnicode_DATA(self),
11798 PyUnicode_GET_LENGTH(self) * PyUnicode_KIND(self));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011799 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +000011800 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011801}
11802
/* Docstring text for S.index(); it mirrors the S.find() wording except that a
   missing substring is reported by raising ValueError.  The string below is
   runtime-visible text -- do not reflow it or change its spacing. */
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011803PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011804 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011805\n\
oldkaa0735f2018-02-02 16:52:55 +080011806Return the lowest index in S where substring sub is found,\n\
Lisa Roach43ba8862017-04-04 22:36:22 -070011807such that sub is contained within S[start:end]. Optional\n\
11808arguments start and end are interpreted as in slice notation.\n\
11809\n\
11810Raises ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011811
11812static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011813unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011814{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011815 /* initialize variables to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +000011816 Py_ssize_t result;
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011817 PyObject *substring = NULL;
11818 Py_ssize_t start = 0;
11819 Py_ssize_t end = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011820
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011821 if (!parse_args_finds_unicode("index", args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011822 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011823
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011824 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011825 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011826
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011827 result = any_find_slice(self, substring, start, end, 1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011828
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011829 if (result == -2)
11830 return NULL;
11831
Guido van Rossumd57fd912000-03-10 22:53:23 +000011832 if (result < 0) {
11833 PyErr_SetString(PyExc_ValueError, "substring not found");
11834 return NULL;
11835 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011836
Christian Heimes217cfd12007-12-02 14:31:20 +000011837 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011838}
11839
INADA Naoki3ae20562017-01-16 20:41:20 +090011840/*[clinic input]
INADA Naokia49ac992018-01-27 14:06:21 +090011841str.isascii as unicode_isascii
11842
11843Return True if all characters in the string are ASCII, False otherwise.
11844
11845ASCII characters have code points in the range U+0000-U+007F.
11846Empty string is ASCII too.
11847[clinic start generated code]*/
11848
11849static PyObject *
11850unicode_isascii_impl(PyObject *self)
11851/*[clinic end generated code: output=c5910d64b5a8003f input=5a43cbc6399621d5]*/
11852{
11853 if (PyUnicode_READY(self) == -1) {
11854 return NULL;
11855 }
11856 return PyBool_FromLong(PyUnicode_IS_ASCII(self));
11857}
11858
11859/*[clinic input]
INADA Naoki3ae20562017-01-16 20:41:20 +090011860str.islower as unicode_islower
Guido van Rossumd57fd912000-03-10 22:53:23 +000011861
INADA Naoki3ae20562017-01-16 20:41:20 +090011862Return True if the string is a lowercase string, False otherwise.
11863
11864A string is lowercase if all cased characters in the string are lowercase and
11865there is at least one cased character in the string.
11866[clinic start generated code]*/
11867
11868static PyObject *
11869unicode_islower_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011870/*[clinic end generated code: output=dbd41995bd005b81 input=acec65ac6821ae47]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011871{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011872 Py_ssize_t i, length;
11873 int kind;
11874 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011875 int cased;
11876
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011877 if (PyUnicode_READY(self) == -1)
11878 return NULL;
11879 length = PyUnicode_GET_LENGTH(self);
11880 kind = PyUnicode_KIND(self);
11881 data = PyUnicode_DATA(self);
11882
Guido van Rossumd57fd912000-03-10 22:53:23 +000011883 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011884 if (length == 1)
11885 return PyBool_FromLong(
11886 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011887
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011888 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011889 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011890 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011891
Guido van Rossumd57fd912000-03-10 22:53:23 +000011892 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011893 for (i = 0; i < length; i++) {
11894 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011895
Benjamin Peterson29060642009-01-31 22:14:21 +000011896 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011897 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000011898 else if (!cased && Py_UNICODE_ISLOWER(ch))
11899 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011900 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011901 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011902}
11903
INADA Naoki3ae20562017-01-16 20:41:20 +090011904/*[clinic input]
11905str.isupper as unicode_isupper
Guido van Rossumd57fd912000-03-10 22:53:23 +000011906
INADA Naoki3ae20562017-01-16 20:41:20 +090011907Return True if the string is an uppercase string, False otherwise.
11908
11909A string is uppercase if all cased characters in the string are uppercase and
11910there is at least one cased character in the string.
11911[clinic start generated code]*/
11912
11913static PyObject *
11914unicode_isupper_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011915/*[clinic end generated code: output=049209c8e7f15f59 input=e9b1feda5d17f2d3]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011916{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011917 Py_ssize_t i, length;
11918 int kind;
11919 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011920 int cased;
11921
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011922 if (PyUnicode_READY(self) == -1)
11923 return NULL;
11924 length = PyUnicode_GET_LENGTH(self);
11925 kind = PyUnicode_KIND(self);
11926 data = PyUnicode_DATA(self);
11927
Guido van Rossumd57fd912000-03-10 22:53:23 +000011928 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011929 if (length == 1)
11930 return PyBool_FromLong(
11931 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011932
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011933 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011934 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011935 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011936
Guido van Rossumd57fd912000-03-10 22:53:23 +000011937 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011938 for (i = 0; i < length; i++) {
11939 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011940
Benjamin Peterson29060642009-01-31 22:14:21 +000011941 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011942 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000011943 else if (!cased && Py_UNICODE_ISUPPER(ch))
11944 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011945 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011946 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011947}
11948
INADA Naoki3ae20562017-01-16 20:41:20 +090011949/*[clinic input]
11950str.istitle as unicode_istitle
Guido van Rossumd57fd912000-03-10 22:53:23 +000011951
INADA Naoki3ae20562017-01-16 20:41:20 +090011952Return True if the string is a title-cased string, False otherwise.
11953
11954In a title-cased string, upper- and title-case characters may only
11955follow uncased characters and lowercase characters only cased ones.
11956[clinic start generated code]*/
11957
11958static PyObject *
11959unicode_istitle_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011960/*[clinic end generated code: output=e9bf6eb91f5d3f0e input=98d32bd2e1f06f8c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011961{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011962 Py_ssize_t i, length;
11963 int kind;
11964 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011965 int cased, previous_is_cased;
11966
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011967 if (PyUnicode_READY(self) == -1)
11968 return NULL;
11969 length = PyUnicode_GET_LENGTH(self);
11970 kind = PyUnicode_KIND(self);
11971 data = PyUnicode_DATA(self);
11972
Guido van Rossumd57fd912000-03-10 22:53:23 +000011973 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011974 if (length == 1) {
11975 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11976 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
11977 (Py_UNICODE_ISUPPER(ch) != 0));
11978 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011979
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011980 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011981 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011982 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011983
Guido van Rossumd57fd912000-03-10 22:53:23 +000011984 cased = 0;
11985 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011986 for (i = 0; i < length; i++) {
11987 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011988
Benjamin Peterson29060642009-01-31 22:14:21 +000011989 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
11990 if (previous_is_cased)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011991 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000011992 previous_is_cased = 1;
11993 cased = 1;
11994 }
11995 else if (Py_UNICODE_ISLOWER(ch)) {
11996 if (!previous_is_cased)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011997 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000011998 previous_is_cased = 1;
11999 cased = 1;
12000 }
12001 else
12002 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012003 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000012004 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012005}
12006
INADA Naoki3ae20562017-01-16 20:41:20 +090012007/*[clinic input]
12008str.isspace as unicode_isspace
Guido van Rossumd57fd912000-03-10 22:53:23 +000012009
INADA Naoki3ae20562017-01-16 20:41:20 +090012010Return True if the string is a whitespace string, False otherwise.
12011
12012A string is whitespace if all characters in the string are whitespace and there
12013is at least one character in the string.
12014[clinic start generated code]*/
12015
12016static PyObject *
12017unicode_isspace_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012018/*[clinic end generated code: output=163a63bfa08ac2b9 input=fe462cb74f8437d8]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012019{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012020 Py_ssize_t i, length;
12021 int kind;
12022 void *data;
12023
12024 if (PyUnicode_READY(self) == -1)
12025 return NULL;
12026 length = PyUnicode_GET_LENGTH(self);
12027 kind = PyUnicode_KIND(self);
12028 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012029
Guido van Rossumd57fd912000-03-10 22:53:23 +000012030 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012031 if (length == 1)
12032 return PyBool_FromLong(
12033 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012034
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012035 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012036 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012037 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012038
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012039 for (i = 0; i < length; i++) {
12040 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030012041 if (!Py_UNICODE_ISSPACE(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012042 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012043 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012044 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012045}
12046
INADA Naoki3ae20562017-01-16 20:41:20 +090012047/*[clinic input]
12048str.isalpha as unicode_isalpha
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012049
INADA Naoki3ae20562017-01-16 20:41:20 +090012050Return True if the string is an alphabetic string, False otherwise.
12051
12052A string is alphabetic if all characters in the string are alphabetic and there
12053is at least one character in the string.
12054[clinic start generated code]*/
12055
12056static PyObject *
12057unicode_isalpha_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012058/*[clinic end generated code: output=cc81b9ac3883ec4f input=d0fd18a96cbca5eb]*/
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012059{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012060 Py_ssize_t i, length;
12061 int kind;
12062 void *data;
12063
12064 if (PyUnicode_READY(self) == -1)
12065 return NULL;
12066 length = PyUnicode_GET_LENGTH(self);
12067 kind = PyUnicode_KIND(self);
12068 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012069
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012070 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012071 if (length == 1)
12072 return PyBool_FromLong(
12073 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012074
12075 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012076 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012077 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012078
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012079 for (i = 0; i < length; i++) {
12080 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012081 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012082 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012083 Py_RETURN_TRUE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012084}
12085
INADA Naoki3ae20562017-01-16 20:41:20 +090012086/*[clinic input]
12087str.isalnum as unicode_isalnum
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012088
INADA Naoki3ae20562017-01-16 20:41:20 +090012089Return True if the string is an alpha-numeric string, False otherwise.
12090
12091A string is alpha-numeric if all characters in the string are alpha-numeric and
12092there is at least one character in the string.
12093[clinic start generated code]*/
12094
12095static PyObject *
12096unicode_isalnum_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012097/*[clinic end generated code: output=a5a23490ffc3660c input=5c6579bf2e04758c]*/
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012098{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012099 int kind;
12100 void *data;
12101 Py_ssize_t len, i;
12102
12103 if (PyUnicode_READY(self) == -1)
12104 return NULL;
12105
12106 kind = PyUnicode_KIND(self);
12107 data = PyUnicode_DATA(self);
12108 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012109
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012110 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012111 if (len == 1) {
12112 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
12113 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
12114 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012115
12116 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012117 if (len == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012118 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012119
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012120 for (i = 0; i < len; i++) {
12121 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030012122 if (!Py_UNICODE_ISALNUM(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012123 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012124 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012125 Py_RETURN_TRUE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012126}
12127
INADA Naoki3ae20562017-01-16 20:41:20 +090012128/*[clinic input]
12129str.isdecimal as unicode_isdecimal
Guido van Rossumd57fd912000-03-10 22:53:23 +000012130
INADA Naoki3ae20562017-01-16 20:41:20 +090012131Return True if the string is a decimal string, False otherwise.
12132
12133A string is a decimal string if all characters in the string are decimal and
12134there is at least one character in the string.
12135[clinic start generated code]*/
12136
12137static PyObject *
12138unicode_isdecimal_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012139/*[clinic end generated code: output=fb2dcdb62d3fc548 input=336bc97ab4c8268f]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012140{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012141 Py_ssize_t i, length;
12142 int kind;
12143 void *data;
12144
12145 if (PyUnicode_READY(self) == -1)
12146 return NULL;
12147 length = PyUnicode_GET_LENGTH(self);
12148 kind = PyUnicode_KIND(self);
12149 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012150
Guido van Rossumd57fd912000-03-10 22:53:23 +000012151 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012152 if (length == 1)
12153 return PyBool_FromLong(
12154 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012155
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012156 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012157 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012158 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012159
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012160 for (i = 0; i < length; i++) {
12161 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012162 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012163 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012164 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012165}
12166
INADA Naoki3ae20562017-01-16 20:41:20 +090012167/*[clinic input]
12168str.isdigit as unicode_isdigit
Guido van Rossumd57fd912000-03-10 22:53:23 +000012169
INADA Naoki3ae20562017-01-16 20:41:20 +090012170Return True if the string is a digit string, False otherwise.
12171
12172A string is a digit string if all characters in the string are digits and there
12173is at least one character in the string.
12174[clinic start generated code]*/
12175
12176static PyObject *
12177unicode_isdigit_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012178/*[clinic end generated code: output=10a6985311da6858 input=901116c31deeea4c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012179{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012180 Py_ssize_t i, length;
12181 int kind;
12182 void *data;
12183
12184 if (PyUnicode_READY(self) == -1)
12185 return NULL;
12186 length = PyUnicode_GET_LENGTH(self);
12187 kind = PyUnicode_KIND(self);
12188 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012189
Guido van Rossumd57fd912000-03-10 22:53:23 +000012190 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012191 if (length == 1) {
12192 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
12193 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
12194 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012195
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012196 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012197 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012198 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012199
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012200 for (i = 0; i < length; i++) {
12201 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012202 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012203 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012204 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012205}
12206
INADA Naoki3ae20562017-01-16 20:41:20 +090012207/*[clinic input]
12208str.isnumeric as unicode_isnumeric
Guido van Rossumd57fd912000-03-10 22:53:23 +000012209
INADA Naoki3ae20562017-01-16 20:41:20 +090012210Return True if the string is a numeric string, False otherwise.
12211
12212A string is numeric if all characters in the string are numeric and there is at
12213least one character in the string.
12214[clinic start generated code]*/
12215
12216static PyObject *
12217unicode_isnumeric_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012218/*[clinic end generated code: output=9172a32d9013051a input=722507db976f826c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012219{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012220 Py_ssize_t i, length;
12221 int kind;
12222 void *data;
12223
12224 if (PyUnicode_READY(self) == -1)
12225 return NULL;
12226 length = PyUnicode_GET_LENGTH(self);
12227 kind = PyUnicode_KIND(self);
12228 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012229
Guido van Rossumd57fd912000-03-10 22:53:23 +000012230 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012231 if (length == 1)
12232 return PyBool_FromLong(
12233 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012234
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012235 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012236 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012237 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012238
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012239 for (i = 0; i < length; i++) {
12240 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012241 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012242 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012243 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012244}
12245
Martin v. Löwis47383402007-08-15 07:32:56 +000012246int
12247PyUnicode_IsIdentifier(PyObject *self)
12248{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012249 Py_ssize_t i;
Victor Stinnerf3e7ea52020-02-11 14:29:33 +010012250 int ready = PyUnicode_IS_READY(self);
Martin v. Löwis47383402007-08-15 07:32:56 +000012251
Victor Stinnerf3e7ea52020-02-11 14:29:33 +010012252 Py_ssize_t len = ready ? PyUnicode_GET_LENGTH(self) : PyUnicode_GET_SIZE(self);
12253 if (len == 0) {
12254 /* an empty string is not a valid identifier */
Benjamin Peterson29060642009-01-31 22:14:21 +000012255 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012256 }
12257
Hai Shi3d235f52020-02-17 21:41:15 +080012258 int kind = 0;
12259 void *data = NULL;
Andy Lester933fc53f2020-02-20 22:51:47 -060012260 const wchar_t *wstr = NULL;
12261 Py_UCS4 ch;
Victor Stinnerf3e7ea52020-02-11 14:29:33 +010012262 if (ready) {
12263 kind = PyUnicode_KIND(self);
12264 data = PyUnicode_DATA(self);
Victor Stinnerf3e7ea52020-02-11 14:29:33 +010012265 ch = PyUnicode_READ(kind, data, 0);
12266 }
12267 else {
Andy Lester933fc53f2020-02-20 22:51:47 -060012268 wstr = _PyUnicode_WSTR(self);
Victor Stinnerf3e7ea52020-02-11 14:29:33 +010012269 ch = wstr[0];
12270 }
Martin v. Löwis47383402007-08-15 07:32:56 +000012271 /* PEP 3131 says that the first character must be in
12272 XID_Start and subsequent characters in XID_Continue,
12273 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000012274 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000012275 letters, digits, underscore). However, given the current
12276 definition of XID_Start and XID_Continue, it is sufficient
12277 to check just for these, except that _ must be allowed
12278 as starting an identifier. */
Victor Stinnerf3e7ea52020-02-11 14:29:33 +010012279 if (!_PyUnicode_IsXidStart(ch) && ch != 0x5F /* LOW LINE */) {
Martin v. Löwis47383402007-08-15 07:32:56 +000012280 return 0;
Victor Stinnerf3e7ea52020-02-11 14:29:33 +010012281 }
Martin v. Löwis47383402007-08-15 07:32:56 +000012282
Victor Stinnerf3e7ea52020-02-11 14:29:33 +010012283 for (i = 1; i < len; i++) {
12284 if (ready) {
12285 ch = PyUnicode_READ(kind, data, i);
12286 }
12287 else {
12288 ch = wstr[i];
12289 }
12290 if (!_PyUnicode_IsXidContinue(ch)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012291 return 0;
Victor Stinnerf3e7ea52020-02-11 14:29:33 +010012292 }
12293 }
Martin v. Löwis47383402007-08-15 07:32:56 +000012294 return 1;
12295}
12296
INADA Naoki3ae20562017-01-16 20:41:20 +090012297/*[clinic input]
12298str.isidentifier as unicode_isidentifier
Martin v. Löwis47383402007-08-15 07:32:56 +000012299
INADA Naoki3ae20562017-01-16 20:41:20 +090012300Return True if the string is a valid Python identifier, False otherwise.
12301
Sanyam Khuranaffc5a142018-10-08 12:23:32 +053012302Call keyword.iskeyword(s) to test whether string s is a reserved identifier,
Emanuele Gaifasfc8205c2018-10-08 12:44:47 +020012303such as "def" or "class".
INADA Naoki3ae20562017-01-16 20:41:20 +090012304[clinic start generated code]*/
12305
12306static PyObject *
12307unicode_isidentifier_impl(PyObject *self)
Emanuele Gaifasfc8205c2018-10-08 12:44:47 +020012308/*[clinic end generated code: output=fe585a9666572905 input=2d807a104f21c0c5]*/
Martin v. Löwis47383402007-08-15 07:32:56 +000012309{
12310 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
12311}
12312
INADA Naoki3ae20562017-01-16 20:41:20 +090012313/*[clinic input]
12314str.isprintable as unicode_isprintable
Georg Brandl559e5d72008-06-11 18:37:52 +000012315
INADA Naoki3ae20562017-01-16 20:41:20 +090012316Return True if the string is printable, False otherwise.
12317
12318A string is printable if all of its characters are considered printable in
12319repr() or if it is empty.
12320[clinic start generated code]*/
12321
12322static PyObject *
12323unicode_isprintable_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012324/*[clinic end generated code: output=3ab9626cd32dd1a0 input=98a0e1c2c1813209]*/
Georg Brandl559e5d72008-06-11 18:37:52 +000012325{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012326 Py_ssize_t i, length;
12327 int kind;
12328 void *data;
12329
12330 if (PyUnicode_READY(self) == -1)
12331 return NULL;
12332 length = PyUnicode_GET_LENGTH(self);
12333 kind = PyUnicode_KIND(self);
12334 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000012335
12336 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012337 if (length == 1)
12338 return PyBool_FromLong(
12339 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000012340
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012341 for (i = 0; i < length; i++) {
12342 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000012343 Py_RETURN_FALSE;
12344 }
12345 }
12346 Py_RETURN_TRUE;
12347}
12348
INADA Naoki3ae20562017-01-16 20:41:20 +090012349/*[clinic input]
12350str.join as unicode_join
Guido van Rossumd57fd912000-03-10 22:53:23 +000012351
INADA Naoki3ae20562017-01-16 20:41:20 +090012352 iterable: object
12353 /
12354
12355Concatenate any number of strings.
12356
Martin Panter91a88662017-01-24 00:30:06 +000012357The string whose method is called is inserted in between each given string.
INADA Naoki3ae20562017-01-16 20:41:20 +090012358The result is returned as a new string.
12359
12360Example: '.'.join(['ab', 'pq', 'rs']) -> 'ab.pq.rs'
12361[clinic start generated code]*/
12362
12363static PyObject *
12364unicode_join(PyObject *self, PyObject *iterable)
Martin Panter91a88662017-01-24 00:30:06 +000012365/*[clinic end generated code: output=6857e7cecfe7bf98 input=2f70422bfb8fa189]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012366{
INADA Naoki3ae20562017-01-16 20:41:20 +090012367 return PyUnicode_Join(self, iterable);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012368}
12369
Martin v. Löwis18e16552006-02-15 17:27:45 +000012370static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012371unicode_length(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012372{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012373 if (PyUnicode_READY(self) == -1)
12374 return -1;
12375 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012376}
12377
INADA Naoki3ae20562017-01-16 20:41:20 +090012378/*[clinic input]
12379str.ljust as unicode_ljust
12380
12381 width: Py_ssize_t
12382 fillchar: Py_UCS4 = ' '
12383 /
12384
12385Return a left-justified string of length width.
12386
12387Padding is done using the specified fill character (default is a space).
12388[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012389
12390static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012391unicode_ljust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
12392/*[clinic end generated code: output=1cce0e0e0a0b84b3 input=3ab599e335e60a32]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012393{
Benjamin Petersonbac79492012-01-14 13:34:47 -050012394 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012395 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012396
Victor Stinnerc4b49542011-12-11 22:44:26 +010012397 if (PyUnicode_GET_LENGTH(self) >= width)
12398 return unicode_result_unchanged(self);
12399
12400 return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012401}
12402
INADA Naoki3ae20562017-01-16 20:41:20 +090012403/*[clinic input]
12404str.lower as unicode_lower
Guido van Rossumd57fd912000-03-10 22:53:23 +000012405
INADA Naoki3ae20562017-01-16 20:41:20 +090012406Return a copy of the string converted to lowercase.
12407[clinic start generated code]*/
12408
12409static PyObject *
12410unicode_lower_impl(PyObject *self)
12411/*[clinic end generated code: output=84ef9ed42efad663 input=60a2984b8beff23a]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012412{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050012413 if (PyUnicode_READY(self) == -1)
12414 return NULL;
12415 if (PyUnicode_IS_ASCII(self))
12416 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010012417 return case_operation(self, do_lower);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012418}
12419
/* Strip modes understood by the strip helpers.  The values double as
   indexes into stripfuncnames[] below, so keep both in sync. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Method names used in error messages, indexed by the strip modes above.
   Both the strings and the table itself are read-only. */
static const char * const stripfuncnames[] = {"lstrip", "rstrip", "strip"};

/* Name of the str method for strip mode i.  i must be one of the
   constants above; the index is not range-checked. */
#define STRIPNAME(i) (stripfuncnames[i])
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012428
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012429/* externally visible for str.strip(unicode) */
12430PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012431_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012432{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012433 void *data;
12434 int kind;
12435 Py_ssize_t i, j, len;
12436 BLOOM_MASK sepmask;
Victor Stinnerb3a60142013-04-09 22:19:21 +020012437 Py_ssize_t seplen;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012438
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012439 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
12440 return NULL;
12441
12442 kind = PyUnicode_KIND(self);
12443 data = PyUnicode_DATA(self);
12444 len = PyUnicode_GET_LENGTH(self);
Victor Stinnerb3a60142013-04-09 22:19:21 +020012445 seplen = PyUnicode_GET_LENGTH(sepobj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012446 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
12447 PyUnicode_DATA(sepobj),
Victor Stinnerb3a60142013-04-09 22:19:21 +020012448 seplen);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012449
Benjamin Peterson14339b62009-01-31 16:36:08 +000012450 i = 0;
12451 if (striptype != RIGHTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020012452 while (i < len) {
12453 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12454 if (!BLOOM(sepmask, ch))
12455 break;
12456 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12457 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000012458 i++;
12459 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012460 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012461
Benjamin Peterson14339b62009-01-31 16:36:08 +000012462 j = len;
12463 if (striptype != LEFTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020012464 j--;
12465 while (j >= i) {
12466 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12467 if (!BLOOM(sepmask, ch))
12468 break;
12469 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12470 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000012471 j--;
Victor Stinnerb3a60142013-04-09 22:19:21 +020012472 }
12473
Benjamin Peterson29060642009-01-31 22:14:21 +000012474 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012475 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012476
Victor Stinner7931d9a2011-11-04 00:22:48 +010012477 return PyUnicode_Substring(self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012478}
12479
12480PyObject*
12481PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
12482{
12483 unsigned char *data;
12484 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020012485 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012486
Victor Stinnerde636f32011-10-01 03:55:54 +020012487 if (PyUnicode_READY(self) == -1)
12488 return NULL;
12489
Victor Stinner684d5fd2012-05-03 02:32:34 +020012490 length = PyUnicode_GET_LENGTH(self);
12491 end = Py_MIN(end, length);
Victor Stinnerde636f32011-10-01 03:55:54 +020012492
Victor Stinner684d5fd2012-05-03 02:32:34 +020012493 if (start == 0 && end == length)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012494 return unicode_result_unchanged(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012495
Victor Stinnerde636f32011-10-01 03:55:54 +020012496 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020012497 PyErr_SetString(PyExc_IndexError, "string index out of range");
12498 return NULL;
12499 }
Serhiy Storchaka678db842013-01-26 12:16:36 +020012500 if (start >= length || end < start)
12501 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner12bab6d2011-10-01 01:53:49 +020012502
Victor Stinner684d5fd2012-05-03 02:32:34 +020012503 length = end - start;
Victor Stinnerb9275c12011-10-05 14:01:42 +020012504 if (PyUnicode_IS_ASCII(self)) {
Victor Stinnerb9275c12011-10-05 14:01:42 +020012505 data = PyUnicode_1BYTE_DATA(self);
Victor Stinnerd3f08822012-05-29 12:57:52 +020012506 return _PyUnicode_FromASCII((char*)(data + start), length);
Victor Stinnerb9275c12011-10-05 14:01:42 +020012507 }
12508 else {
12509 kind = PyUnicode_KIND(self);
12510 data = PyUnicode_1BYTE_DATA(self);
12511 return PyUnicode_FromKindAndData(kind,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012512 data + kind * start,
Victor Stinnerb9275c12011-10-05 14:01:42 +020012513 length);
12514 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012515}
Guido van Rossumd57fd912000-03-10 22:53:23 +000012516
12517static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012518do_strip(PyObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012519{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012520 Py_ssize_t len, i, j;
12521
12522 if (PyUnicode_READY(self) == -1)
12523 return NULL;
12524
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012525 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012526
Victor Stinnercc7af722013-04-09 22:39:24 +020012527 if (PyUnicode_IS_ASCII(self)) {
12528 Py_UCS1 *data = PyUnicode_1BYTE_DATA(self);
12529
12530 i = 0;
12531 if (striptype != RIGHTSTRIP) {
12532 while (i < len) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020012533 Py_UCS1 ch = data[i];
Victor Stinnercc7af722013-04-09 22:39:24 +020012534 if (!_Py_ascii_whitespace[ch])
12535 break;
12536 i++;
12537 }
12538 }
12539
12540 j = len;
12541 if (striptype != LEFTSTRIP) {
12542 j--;
12543 while (j >= i) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020012544 Py_UCS1 ch = data[j];
Victor Stinnercc7af722013-04-09 22:39:24 +020012545 if (!_Py_ascii_whitespace[ch])
12546 break;
12547 j--;
12548 }
12549 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012550 }
12551 }
Victor Stinnercc7af722013-04-09 22:39:24 +020012552 else {
12553 int kind = PyUnicode_KIND(self);
12554 void *data = PyUnicode_DATA(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012555
Victor Stinnercc7af722013-04-09 22:39:24 +020012556 i = 0;
12557 if (striptype != RIGHTSTRIP) {
12558 while (i < len) {
12559 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12560 if (!Py_UNICODE_ISSPACE(ch))
12561 break;
12562 i++;
12563 }
Victor Stinner9c79e412013-04-09 22:21:08 +020012564 }
Victor Stinnercc7af722013-04-09 22:39:24 +020012565
12566 j = len;
12567 if (striptype != LEFTSTRIP) {
12568 j--;
12569 while (j >= i) {
12570 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12571 if (!Py_UNICODE_ISSPACE(ch))
12572 break;
12573 j--;
12574 }
12575 j++;
12576 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012577 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012578
Victor Stinner7931d9a2011-11-04 00:22:48 +010012579 return PyUnicode_Substring(self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012580}
12581
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012582
12583static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012584do_argstrip(PyObject *self, int striptype, PyObject *sep)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012585{
Serhiy Storchaka279f4462019-09-14 12:24:05 +030012586 if (sep != Py_None) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012587 if (PyUnicode_Check(sep))
12588 return _PyUnicode_XStrip(self, striptype, sep);
12589 else {
12590 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000012591 "%s arg must be None or str",
12592 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000012593 return NULL;
12594 }
12595 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012596
Benjamin Peterson14339b62009-01-31 16:36:08 +000012597 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012598}
12599
12600
INADA Naoki3ae20562017-01-16 20:41:20 +090012601/*[clinic input]
12602str.strip as unicode_strip
12603
12604 chars: object = None
12605 /
12606
Zachary Ware09895c22019-10-09 16:09:00 -050012607Return a copy of the string with leading and trailing whitespace removed.
INADA Naoki3ae20562017-01-16 20:41:20 +090012608
12609If chars is given and not None, remove characters in chars instead.
12610[clinic start generated code]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012611
12612static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012613unicode_strip_impl(PyObject *self, PyObject *chars)
Zachary Ware09895c22019-10-09 16:09:00 -050012614/*[clinic end generated code: output=ca19018454345d57 input=385289c6f423b954]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012615{
INADA Naoki3ae20562017-01-16 20:41:20 +090012616 return do_argstrip(self, BOTHSTRIP, chars);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012617}
12618
12619
INADA Naoki3ae20562017-01-16 20:41:20 +090012620/*[clinic input]
12621str.lstrip as unicode_lstrip
12622
Serhiy Storchaka279f4462019-09-14 12:24:05 +030012623 chars: object = None
INADA Naoki3ae20562017-01-16 20:41:20 +090012624 /
12625
12626Return a copy of the string with leading whitespace removed.
12627
12628If chars is given and not None, remove characters in chars instead.
12629[clinic start generated code]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012630
12631static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012632unicode_lstrip_impl(PyObject *self, PyObject *chars)
Serhiy Storchaka279f4462019-09-14 12:24:05 +030012633/*[clinic end generated code: output=3b43683251f79ca7 input=529f9f3834448671]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012634{
INADA Naoki3ae20562017-01-16 20:41:20 +090012635 return do_argstrip(self, LEFTSTRIP, chars);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012636}
12637
12638
INADA Naoki3ae20562017-01-16 20:41:20 +090012639/*[clinic input]
12640str.rstrip as unicode_rstrip
12641
Serhiy Storchaka279f4462019-09-14 12:24:05 +030012642 chars: object = None
INADA Naoki3ae20562017-01-16 20:41:20 +090012643 /
12644
12645Return a copy of the string with trailing whitespace removed.
12646
12647If chars is given and not None, remove characters in chars instead.
12648[clinic start generated code]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012649
12650static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012651unicode_rstrip_impl(PyObject *self, PyObject *chars)
Serhiy Storchaka279f4462019-09-14 12:24:05 +030012652/*[clinic end generated code: output=4a59230017cc3b7a input=62566c627916557f]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012653{
INADA Naoki3ae20562017-01-16 20:41:20 +090012654 return do_argstrip(self, RIGHTSTRIP, chars);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012655}
12656
12657
Guido van Rossumd57fd912000-03-10 22:53:23 +000012658static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012659unicode_repeat(PyObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012660{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012661 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012662 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012663
Serhiy Storchaka05997252013-01-26 12:14:02 +020012664 if (len < 1)
12665 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +000012666
Victor Stinnerc4b49542011-12-11 22:44:26 +010012667 /* no repeat, return original string */
12668 if (len == 1)
12669 return unicode_result_unchanged(str);
Tim Peters8f422462000-09-09 06:13:41 +000012670
Benjamin Petersonbac79492012-01-14 13:34:47 -050012671 if (PyUnicode_READY(str) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012672 return NULL;
12673
Victor Stinnerc759f3e2011-10-01 03:09:58 +020012674 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020012675 PyErr_SetString(PyExc_OverflowError,
12676 "repeated string is too long");
12677 return NULL;
12678 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012679 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012680
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012681 u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012682 if (!u)
12683 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012684 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012685
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012686 if (PyUnicode_GET_LENGTH(str) == 1) {
12687 const int kind = PyUnicode_KIND(str);
12688 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
Victor Stinner73f53b52011-12-18 03:26:31 +010012689 if (kind == PyUnicode_1BYTE_KIND) {
12690 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012691 memset(to, (unsigned char)fill_char, len);
Victor Stinner73f53b52011-12-18 03:26:31 +010012692 }
12693 else if (kind == PyUnicode_2BYTE_KIND) {
12694 Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012695 for (n = 0; n < len; ++n)
Victor Stinner73f53b52011-12-18 03:26:31 +010012696 ucs2[n] = fill_char;
12697 } else {
12698 Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u);
12699 assert(kind == PyUnicode_4BYTE_KIND);
12700 for (n = 0; n < len; ++n)
12701 ucs4[n] = fill_char;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012702 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012703 }
12704 else {
12705 /* number of characters copied this far */
12706 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012707 const Py_ssize_t char_size = PyUnicode_KIND(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012708 char *to = (char *) PyUnicode_DATA(u);
Christian Heimesf051e432016-09-13 20:22:02 +020012709 memcpy(to, PyUnicode_DATA(str),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012710 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000012711 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012712 n = (done <= nchars-done) ? done : nchars-done;
Christian Heimesf051e432016-09-13 20:22:02 +020012713 memcpy(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012714 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000012715 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012716 }
12717
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012718 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012719 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012720}
12721
Alexander Belopolsky40018472011-02-26 01:02:56 +000012722PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012723PyUnicode_Replace(PyObject *str,
12724 PyObject *substr,
12725 PyObject *replstr,
Alexander Belopolsky40018472011-02-26 01:02:56 +000012726 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012727{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012728 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0 ||
12729 ensure_unicode(replstr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012730 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012731 return replace(str, substr, replstr, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012732}
12733
INADA Naoki3ae20562017-01-16 20:41:20 +090012734/*[clinic input]
12735str.replace as unicode_replace
Guido van Rossumd57fd912000-03-10 22:53:23 +000012736
INADA Naoki3ae20562017-01-16 20:41:20 +090012737 old: unicode
12738 new: unicode
12739 count: Py_ssize_t = -1
12740 Maximum number of occurrences to replace.
12741 -1 (the default value) means replace all occurrences.
12742 /
12743
12744Return a copy with all occurrences of substring old replaced by new.
12745
12746If the optional argument count is given, only the first count occurrences are
12747replaced.
12748[clinic start generated code]*/
12749
12750static PyObject *
12751unicode_replace_impl(PyObject *self, PyObject *old, PyObject *new,
12752 Py_ssize_t count)
12753/*[clinic end generated code: output=b63f1a8b5eebf448 input=147d12206276ebeb]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012754{
Benjamin Peterson22a29702012-01-02 09:00:30 -060012755 if (PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000012756 return NULL;
INADA Naoki3ae20562017-01-16 20:41:20 +090012757 return replace(self, old, new, count);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012758}
12759
Alexander Belopolsky40018472011-02-26 01:02:56 +000012760static PyObject *
12761unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012762{
Walter Dörwald79e913e2007-05-12 11:08:06 +000012763 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012764 Py_ssize_t isize;
12765 Py_ssize_t osize, squote, dquote, i, o;
12766 Py_UCS4 max, quote;
Victor Stinner55c08782013-04-14 18:45:39 +020012767 int ikind, okind, unchanged;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012768 void *idata, *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000012769
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012770 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000012771 return NULL;
12772
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012773 isize = PyUnicode_GET_LENGTH(unicode);
12774 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012775
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012776 /* Compute length of output, quote characters, and
12777 maximum character */
Victor Stinner55c08782013-04-14 18:45:39 +020012778 osize = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012779 max = 127;
12780 squote = dquote = 0;
12781 ikind = PyUnicode_KIND(unicode);
12782 for (i = 0; i < isize; i++) {
12783 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Benjamin Peterson736b8012014-09-29 23:02:15 -040012784 Py_ssize_t incr = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012785 switch (ch) {
Benjamin Peterson736b8012014-09-29 23:02:15 -040012786 case '\'': squote++; break;
12787 case '"': dquote++; break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012788 case '\\': case '\t': case '\r': case '\n':
Benjamin Peterson736b8012014-09-29 23:02:15 -040012789 incr = 2;
12790 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012791 default:
12792 /* Fast-path ASCII */
12793 if (ch < ' ' || ch == 0x7f)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012794 incr = 4; /* \xHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012795 else if (ch < 0x7f)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012796 ;
12797 else if (Py_UNICODE_ISPRINTABLE(ch))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012798 max = ch > max ? ch : max;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012799 else if (ch < 0x100)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012800 incr = 4; /* \xHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012801 else if (ch < 0x10000)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012802 incr = 6; /* \uHHHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012803 else
Benjamin Peterson736b8012014-09-29 23:02:15 -040012804 incr = 10; /* \uHHHHHHHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012805 }
Benjamin Peterson736b8012014-09-29 23:02:15 -040012806 if (osize > PY_SSIZE_T_MAX - incr) {
12807 PyErr_SetString(PyExc_OverflowError,
12808 "string is too long to generate repr");
12809 return NULL;
12810 }
12811 osize += incr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012812 }
12813
12814 quote = '\'';
Victor Stinner55c08782013-04-14 18:45:39 +020012815 unchanged = (osize == isize);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012816 if (squote) {
Victor Stinner55c08782013-04-14 18:45:39 +020012817 unchanged = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012818 if (dquote)
12819 /* Both squote and dquote present. Use squote,
12820 and escape them */
12821 osize += squote;
12822 else
12823 quote = '"';
12824 }
Victor Stinner55c08782013-04-14 18:45:39 +020012825 osize += 2; /* quotes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012826
12827 repr = PyUnicode_New(osize, max);
12828 if (repr == NULL)
12829 return NULL;
12830 okind = PyUnicode_KIND(repr);
12831 odata = PyUnicode_DATA(repr);
12832
12833 PyUnicode_WRITE(okind, odata, 0, quote);
12834 PyUnicode_WRITE(okind, odata, osize-1, quote);
Victor Stinner55c08782013-04-14 18:45:39 +020012835 if (unchanged) {
12836 _PyUnicode_FastCopyCharacters(repr, 1,
12837 unicode, 0,
12838 isize);
12839 }
12840 else {
12841 for (i = 0, o = 1; i < isize; i++) {
12842 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012843
Victor Stinner55c08782013-04-14 18:45:39 +020012844 /* Escape quotes and backslashes */
12845 if ((ch == quote) || (ch == '\\')) {
Kristján Valur Jónsson55e5dc82012-06-06 21:58:08 +000012846 PyUnicode_WRITE(okind, odata, o++, '\\');
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012847 PyUnicode_WRITE(okind, odata, o++, ch);
Victor Stinner55c08782013-04-14 18:45:39 +020012848 continue;
12849 }
12850
12851 /* Map special whitespace to '\t', \n', '\r' */
12852 if (ch == '\t') {
12853 PyUnicode_WRITE(okind, odata, o++, '\\');
12854 PyUnicode_WRITE(okind, odata, o++, 't');
12855 }
12856 else if (ch == '\n') {
12857 PyUnicode_WRITE(okind, odata, o++, '\\');
12858 PyUnicode_WRITE(okind, odata, o++, 'n');
12859 }
12860 else if (ch == '\r') {
12861 PyUnicode_WRITE(okind, odata, o++, '\\');
12862 PyUnicode_WRITE(okind, odata, o++, 'r');
12863 }
12864
12865 /* Map non-printable US ASCII to '\xhh' */
12866 else if (ch < ' ' || ch == 0x7F) {
12867 PyUnicode_WRITE(okind, odata, o++, '\\');
12868 PyUnicode_WRITE(okind, odata, o++, 'x');
12869 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12870 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12871 }
12872
12873 /* Copy ASCII characters as-is */
12874 else if (ch < 0x7F) {
12875 PyUnicode_WRITE(okind, odata, o++, ch);
12876 }
12877
12878 /* Non-ASCII characters */
12879 else {
12880 /* Map Unicode whitespace and control characters
12881 (categories Z* and C* except ASCII space)
12882 */
12883 if (!Py_UNICODE_ISPRINTABLE(ch)) {
12884 PyUnicode_WRITE(okind, odata, o++, '\\');
12885 /* Map 8-bit characters to '\xhh' */
12886 if (ch <= 0xff) {
12887 PyUnicode_WRITE(okind, odata, o++, 'x');
12888 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12889 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12890 }
12891 /* Map 16-bit characters to '\uxxxx' */
12892 else if (ch <= 0xffff) {
12893 PyUnicode_WRITE(okind, odata, o++, 'u');
12894 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12895 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12896 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12897 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12898 }
12899 /* Map 21-bit characters to '\U00xxxxxx' */
12900 else {
12901 PyUnicode_WRITE(okind, odata, o++, 'U');
12902 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
12903 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
12904 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
12905 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
12906 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12907 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12908 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12909 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12910 }
12911 }
12912 /* Copy characters as-is */
12913 else {
12914 PyUnicode_WRITE(okind, odata, o++, ch);
12915 }
Georg Brandl559e5d72008-06-11 18:37:52 +000012916 }
12917 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000012918 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012919 /* Closing quote already added at the beginning */
Victor Stinner05d11892011-10-06 01:13:58 +020012920 assert(_PyUnicode_CheckConsistency(repr, 1));
Walter Dörwald79e913e2007-05-12 11:08:06 +000012921 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012922}
12923
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012924PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012925 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012926\n\
12927Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080012928such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012929arguments start and end are interpreted as in slice notation.\n\
12930\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012931Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012932
12933static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012934unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012935{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010012936 /* initialize variables to prevent gcc warning */
12937 PyObject *substring = NULL;
12938 Py_ssize_t start = 0;
12939 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012940 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012941
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030012942 if (!parse_args_finds_unicode("rfind", args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012943 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012944
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012945 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012946 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012947
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012948 result = any_find_slice(self, substring, start, end, -1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012949
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012950 if (result == -2)
12951 return NULL;
12952
Christian Heimes217cfd12007-12-02 14:31:20 +000012953 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012954}
12955
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012956PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012957 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012958\n\
Lisa Roach43ba8862017-04-04 22:36:22 -070012959Return the highest index in S where substring sub is found,\n\
12960such that sub is contained within S[start:end]. Optional\n\
12961arguments start and end are interpreted as in slice notation.\n\
12962\n\
12963Raises ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012964
12965static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012966unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012967{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010012968 /* initialize variables to prevent gcc warning */
12969 PyObject *substring = NULL;
12970 Py_ssize_t start = 0;
12971 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012972 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012973
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030012974 if (!parse_args_finds_unicode("rindex", args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012975 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012976
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012977 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012978 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012979
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012980 result = any_find_slice(self, substring, start, end, -1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012981
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012982 if (result == -2)
12983 return NULL;
12984
Guido van Rossumd57fd912000-03-10 22:53:23 +000012985 if (result < 0) {
12986 PyErr_SetString(PyExc_ValueError, "substring not found");
12987 return NULL;
12988 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012989
Christian Heimes217cfd12007-12-02 14:31:20 +000012990 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012991}
12992
INADA Naoki3ae20562017-01-16 20:41:20 +090012993/*[clinic input]
12994str.rjust as unicode_rjust
12995
12996 width: Py_ssize_t
12997 fillchar: Py_UCS4 = ' '
12998 /
12999
13000Return a right-justified string of length width.
13001
13002Padding is done using the specified fill character (default is a space).
13003[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013004
13005static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090013006unicode_rjust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
13007/*[clinic end generated code: output=804a1a57fbe8d5cf input=d05f550b5beb1f72]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013008{
Benjamin Petersonbac79492012-01-14 13:34:47 -050013009 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013010 return NULL;
13011
Victor Stinnerc4b49542011-12-11 22:44:26 +010013012 if (PyUnicode_GET_LENGTH(self) >= width)
13013 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013014
Victor Stinnerc4b49542011-12-11 22:44:26 +010013015 return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013016}
13017
Alexander Belopolsky40018472011-02-26 01:02:56 +000013018PyObject *
13019PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013020{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013021 if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
Benjamin Peterson14339b62009-01-31 16:36:08 +000013022 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013023
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013024 return split(s, sep, maxsplit);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013025}
13026
INADA Naoki3ae20562017-01-16 20:41:20 +090013027/*[clinic input]
13028str.split as unicode_split
Guido van Rossumd57fd912000-03-10 22:53:23 +000013029
INADA Naoki3ae20562017-01-16 20:41:20 +090013030 sep: object = None
13031 The delimiter according which to split the string.
13032 None (the default value) means split according to any whitespace,
13033 and discard empty strings from the result.
13034 maxsplit: Py_ssize_t = -1
13035 Maximum number of splits to do.
13036 -1 (the default value) means no limit.
13037
13038Return a list of the words in the string, using sep as the delimiter string.
13039[clinic start generated code]*/
13040
13041static PyObject *
13042unicode_split_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
13043/*[clinic end generated code: output=3a65b1db356948dc input=606e750488a82359]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013044{
INADA Naoki3ae20562017-01-16 20:41:20 +090013045 if (sep == Py_None)
13046 return split(self, NULL, maxsplit);
13047 if (PyUnicode_Check(sep))
13048 return split(self, sep, maxsplit);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013049
Victor Stinner998b8062018-09-12 00:23:25 +020013050 PyErr_Format(PyExc_TypeError,
13051 "must be str or None, not %.100s",
13052 Py_TYPE(sep)->tp_name);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013053 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013054}
13055
Thomas Wouters477c8d52006-05-27 19:21:47 +000013056PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013057PyUnicode_Partition(PyObject *str_obj, PyObject *sep_obj)
Thomas Wouters477c8d52006-05-27 19:21:47 +000013058{
Thomas Wouters477c8d52006-05-27 19:21:47 +000013059 PyObject* out;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013060 int kind1, kind2;
13061 void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013062 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000013063
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013064 if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013065 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000013066
Victor Stinner14f8f022011-10-05 20:58:25 +020013067 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013068 kind2 = PyUnicode_KIND(sep_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013069 len1 = PyUnicode_GET_LENGTH(str_obj);
13070 len2 = PyUnicode_GET_LENGTH(sep_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013071 if (kind1 < kind2 || len1 < len2) {
13072 _Py_INCREF_UNICODE_EMPTY();
13073 if (!unicode_empty)
13074 out = NULL;
13075 else {
13076 out = PyTuple_Pack(3, str_obj, unicode_empty, unicode_empty);
13077 Py_DECREF(unicode_empty);
13078 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013079 return out;
13080 }
13081 buf1 = PyUnicode_DATA(str_obj);
13082 buf2 = PyUnicode_DATA(sep_obj);
13083 if (kind2 != kind1) {
13084 buf2 = _PyUnicode_AsKind(sep_obj, kind1);
13085 if (!buf2)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013086 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013087 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013088
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013089 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013090 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020013091 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
13092 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
13093 else
13094 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013095 break;
13096 case PyUnicode_2BYTE_KIND:
13097 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
13098 break;
13099 case PyUnicode_4BYTE_KIND:
13100 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
13101 break;
13102 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070013103 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013104 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000013105
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013106 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013107 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013108
13109 return out;
13110}
13111
13112
13113PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013114PyUnicode_RPartition(PyObject *str_obj, PyObject *sep_obj)
Thomas Wouters477c8d52006-05-27 19:21:47 +000013115{
Thomas Wouters477c8d52006-05-27 19:21:47 +000013116 PyObject* out;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013117 int kind1, kind2;
13118 void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013119 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000013120
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013121 if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013122 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000013123
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013124 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013125 kind2 = PyUnicode_KIND(sep_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013126 len1 = PyUnicode_GET_LENGTH(str_obj);
13127 len2 = PyUnicode_GET_LENGTH(sep_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013128 if (kind1 < kind2 || len1 < len2) {
13129 _Py_INCREF_UNICODE_EMPTY();
13130 if (!unicode_empty)
13131 out = NULL;
13132 else {
13133 out = PyTuple_Pack(3, unicode_empty, unicode_empty, str_obj);
13134 Py_DECREF(unicode_empty);
13135 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013136 return out;
13137 }
13138 buf1 = PyUnicode_DATA(str_obj);
13139 buf2 = PyUnicode_DATA(sep_obj);
13140 if (kind2 != kind1) {
13141 buf2 = _PyUnicode_AsKind(sep_obj, kind1);
13142 if (!buf2)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013143 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013144 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013145
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013146 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013147 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020013148 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
13149 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13150 else
13151 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013152 break;
13153 case PyUnicode_2BYTE_KIND:
13154 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13155 break;
13156 case PyUnicode_4BYTE_KIND:
13157 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13158 break;
13159 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070013160 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013161 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000013162
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013163 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013164 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013165
13166 return out;
13167}
13168
INADA Naoki3ae20562017-01-16 20:41:20 +090013169/*[clinic input]
13170str.partition as unicode_partition
Thomas Wouters477c8d52006-05-27 19:21:47 +000013171
INADA Naoki3ae20562017-01-16 20:41:20 +090013172 sep: object
13173 /
13174
13175Partition the string into three parts using the given separator.
13176
13177This will search for the separator in the string. If the separator is found,
13178returns a 3-tuple containing the part before the separator, the separator
13179itself, and the part after it.
13180
13181If the separator is not found, returns a 3-tuple containing the original string
13182and two empty strings.
13183[clinic start generated code]*/
13184
13185static PyObject *
13186unicode_partition(PyObject *self, PyObject *sep)
13187/*[clinic end generated code: output=e4ced7bd253ca3c4 input=f29b8d06c63e50be]*/
Thomas Wouters477c8d52006-05-27 19:21:47 +000013188{
INADA Naoki3ae20562017-01-16 20:41:20 +090013189 return PyUnicode_Partition(self, sep);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013190}
13191
INADA Naoki3ae20562017-01-16 20:41:20 +090013192/*[clinic input]
13193str.rpartition as unicode_rpartition = str.partition
Thomas Wouters477c8d52006-05-27 19:21:47 +000013194
INADA Naoki3ae20562017-01-16 20:41:20 +090013195Partition the string into three parts using the given separator.
13196
Serhiy Storchakaa2314282017-10-29 02:11:54 +030013197This will search for the separator in the string, starting at the end. If
INADA Naoki3ae20562017-01-16 20:41:20 +090013198the separator is found, returns a 3-tuple containing the part before the
13199separator, the separator itself, and the part after it.
13200
13201If the separator is not found, returns a 3-tuple containing two empty strings
13202and the original string.
13203[clinic start generated code]*/
13204
13205static PyObject *
13206unicode_rpartition(PyObject *self, PyObject *sep)
Serhiy Storchakaa2314282017-10-29 02:11:54 +030013207/*[clinic end generated code: output=1aa13cf1156572aa input=c4b7db3ef5cf336a]*/
Thomas Wouters477c8d52006-05-27 19:21:47 +000013208{
INADA Naoki3ae20562017-01-16 20:41:20 +090013209 return PyUnicode_RPartition(self, sep);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013210}
13211
Alexander Belopolsky40018472011-02-26 01:02:56 +000013212PyObject *
13213PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013214{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013215 if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
Benjamin Peterson14339b62009-01-31 16:36:08 +000013216 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013217
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013218 return rsplit(s, sep, maxsplit);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013219}
13220
INADA Naoki3ae20562017-01-16 20:41:20 +090013221/*[clinic input]
13222str.rsplit as unicode_rsplit = str.split
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013223
INADA Naoki3ae20562017-01-16 20:41:20 +090013224Return a list of the words in the string, using sep as the delimiter string.
13225
13226Splits are done starting at the end of the string and working to the front.
13227[clinic start generated code]*/
13228
13229static PyObject *
13230unicode_rsplit_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
13231/*[clinic end generated code: output=c2b815c63bcabffc input=12ad4bf57dd35f15]*/
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013232{
INADA Naoki3ae20562017-01-16 20:41:20 +090013233 if (sep == Py_None)
13234 return rsplit(self, NULL, maxsplit);
13235 if (PyUnicode_Check(sep))
13236 return rsplit(self, sep, maxsplit);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013237
Victor Stinner998b8062018-09-12 00:23:25 +020013238 PyErr_Format(PyExc_TypeError,
13239 "must be str or None, not %.100s",
13240 Py_TYPE(sep)->tp_name);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013241 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013242}
13243
INADA Naoki3ae20562017-01-16 20:41:20 +090013244/*[clinic input]
13245str.splitlines as unicode_splitlines
Guido van Rossumd57fd912000-03-10 22:53:23 +000013246
Serhiy Storchaka202fda52017-03-12 10:10:47 +020013247 keepends: bool(accept={int}) = False
INADA Naoki3ae20562017-01-16 20:41:20 +090013248
13249Return a list of the lines in the string, breaking at line boundaries.
13250
13251Line breaks are not included in the resulting list unless keepends is given and
13252true.
13253[clinic start generated code]*/
13254
13255static PyObject *
13256unicode_splitlines_impl(PyObject *self, int keepends)
Serhiy Storchaka202fda52017-03-12 10:10:47 +020013257/*[clinic end generated code: output=f664dcdad153ec40 input=b508e180459bdd8b]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013258{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013259 return PyUnicode_Splitlines(self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013260}
13261
13262static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000013263PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013264{
Victor Stinnerc4b49542011-12-11 22:44:26 +010013265 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013266}
13267
INADA Naoki3ae20562017-01-16 20:41:20 +090013268/*[clinic input]
13269str.swapcase as unicode_swapcase
Guido van Rossumd57fd912000-03-10 22:53:23 +000013270
INADA Naoki3ae20562017-01-16 20:41:20 +090013271Convert uppercase characters to lowercase and lowercase characters to uppercase.
13272[clinic start generated code]*/
13273
13274static PyObject *
13275unicode_swapcase_impl(PyObject *self)
13276/*[clinic end generated code: output=5d28966bf6d7b2af input=3f3ef96d5798a7bb]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013277{
Benjamin Petersoneea48462012-01-16 14:28:50 -050013278 if (PyUnicode_READY(self) == -1)
13279 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010013280 return case_operation(self, do_swapcase);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013281}
13282
Larry Hastings61272b72014-01-07 12:41:53 -080013283/*[clinic input]
Georg Brandlceee0772007-11-27 23:48:05 +000013284
Larry Hastings31826802013-10-19 00:09:25 -070013285@staticmethod
13286str.maketrans as unicode_maketrans
13287
13288 x: object
13289
13290 y: unicode=NULL
13291
13292 z: unicode=NULL
13293
13294 /
13295
13296Return a translation table usable for str.translate().
13297
13298If there is only one argument, it must be a dictionary mapping Unicode
13299ordinals (integers) or characters to Unicode ordinals, strings or None.
13300Character keys will be then converted to ordinals.
13301If there are two arguments, they must be strings of equal length, and
13302in the resulting dictionary, each character in x will be mapped to the
13303character at the same position in y. If there is a third argument, it
13304must be a string, whose characters will be mapped to None in the result.
Larry Hastings61272b72014-01-07 12:41:53 -080013305[clinic start generated code]*/
Larry Hastings31826802013-10-19 00:09:25 -070013306
Larry Hastings31826802013-10-19 00:09:25 -070013307static PyObject *
Larry Hastings5c661892014-01-24 06:17:25 -080013308unicode_maketrans_impl(PyObject *x, PyObject *y, PyObject *z)
Serhiy Storchaka1009bf12015-04-03 23:53:51 +030013309/*[clinic end generated code: output=a925c89452bd5881 input=7bfbf529a293c6c5]*/
Larry Hastings31826802013-10-19 00:09:25 -070013310{
Georg Brandlceee0772007-11-27 23:48:05 +000013311 PyObject *new = NULL, *key, *value;
13312 Py_ssize_t i = 0;
13313 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013314
Georg Brandlceee0772007-11-27 23:48:05 +000013315 new = PyDict_New();
13316 if (!new)
13317 return NULL;
13318 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013319 int x_kind, y_kind, z_kind;
13320 void *x_data, *y_data, *z_data;
13321
Georg Brandlceee0772007-11-27 23:48:05 +000013322 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000013323 if (!PyUnicode_Check(x)) {
13324 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
13325 "be a string if there is a second argument");
13326 goto err;
13327 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013328 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013329 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
13330 "arguments must have equal length");
13331 goto err;
13332 }
13333 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013334 x_kind = PyUnicode_KIND(x);
13335 y_kind = PyUnicode_KIND(y);
13336 x_data = PyUnicode_DATA(x);
13337 y_data = PyUnicode_DATA(y);
13338 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
13339 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060013340 if (!key)
Georg Brandlceee0772007-11-27 23:48:05 +000013341 goto err;
Benjamin Peterson822c7902011-12-20 13:32:50 -060013342 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060013343 if (!value) {
13344 Py_DECREF(key);
13345 goto err;
13346 }
Georg Brandlceee0772007-11-27 23:48:05 +000013347 res = PyDict_SetItem(new, key, value);
13348 Py_DECREF(key);
13349 Py_DECREF(value);
13350 if (res < 0)
13351 goto err;
13352 }
13353 /* create entries for deleting chars in z */
13354 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013355 z_kind = PyUnicode_KIND(z);
13356 z_data = PyUnicode_DATA(z);
Victor Stinnerc4f281e2011-10-11 22:11:42 +020013357 for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013358 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000013359 if (!key)
13360 goto err;
13361 res = PyDict_SetItem(new, key, Py_None);
13362 Py_DECREF(key);
13363 if (res < 0)
13364 goto err;
13365 }
13366 }
13367 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013368 int kind;
13369 void *data;
13370
Georg Brandlceee0772007-11-27 23:48:05 +000013371 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000013372 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013373 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
13374 "to maketrans it must be a dict");
13375 goto err;
13376 }
13377 /* copy entries into the new dict, converting string keys to int keys */
13378 while (PyDict_Next(x, &i, &key, &value)) {
13379 if (PyUnicode_Check(key)) {
13380 /* convert string keys to integer keys */
13381 PyObject *newkey;
Victor Stinnerc4f281e2011-10-11 22:11:42 +020013382 if (PyUnicode_GET_LENGTH(key) != 1) {
Georg Brandlceee0772007-11-27 23:48:05 +000013383 PyErr_SetString(PyExc_ValueError, "string keys in translate "
13384 "table must be of length 1");
13385 goto err;
13386 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013387 kind = PyUnicode_KIND(key);
13388 data = PyUnicode_DATA(key);
13389 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000013390 if (!newkey)
13391 goto err;
13392 res = PyDict_SetItem(new, newkey, value);
13393 Py_DECREF(newkey);
13394 if (res < 0)
13395 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000013396 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013397 /* just keep integer keys */
13398 if (PyDict_SetItem(new, key, value) < 0)
13399 goto err;
13400 } else {
13401 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
13402 "be strings or integers");
13403 goto err;
13404 }
13405 }
13406 }
13407 return new;
13408 err:
13409 Py_DECREF(new);
13410 return NULL;
13411}
13412
INADA Naoki3ae20562017-01-16 20:41:20 +090013413/*[clinic input]
13414str.translate as unicode_translate
Guido van Rossumd57fd912000-03-10 22:53:23 +000013415
INADA Naoki3ae20562017-01-16 20:41:20 +090013416 table: object
13417 Translation table, which must be a mapping of Unicode ordinals to
13418 Unicode ordinals, strings, or None.
13419 /
13420
13421Replace each character in the string using the given translation table.
13422
13423The table must implement lookup/indexing via __getitem__, for instance a
13424dictionary or list. If this operation raises LookupError, the character is
13425left untouched. Characters mapped to None are deleted.
13426[clinic start generated code]*/
13427
13428static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013429unicode_translate(PyObject *self, PyObject *table)
INADA Naoki3ae20562017-01-16 20:41:20 +090013430/*[clinic end generated code: output=3cb448ff2fd96bf3 input=6d38343db63d8eb0]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013431{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013432 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013433}
13434
INADA Naoki3ae20562017-01-16 20:41:20 +090013435/*[clinic input]
13436str.upper as unicode_upper
Guido van Rossumd57fd912000-03-10 22:53:23 +000013437
INADA Naoki3ae20562017-01-16 20:41:20 +090013438Return a copy of the string converted to uppercase.
13439[clinic start generated code]*/
13440
13441static PyObject *
13442unicode_upper_impl(PyObject *self)
13443/*[clinic end generated code: output=1b7ddd16bbcdc092 input=db3d55682dfe2e6c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013444{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050013445 if (PyUnicode_READY(self) == -1)
13446 return NULL;
13447 if (PyUnicode_IS_ASCII(self))
13448 return ascii_upper_or_lower(self, 0);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010013449 return case_operation(self, do_upper);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013450}
13451
INADA Naoki3ae20562017-01-16 20:41:20 +090013452/*[clinic input]
13453str.zfill as unicode_zfill
13454
13455 width: Py_ssize_t
13456 /
13457
13458Pad a numeric string with zeros on the left, to fill a field of the given width.
13459
13460The string is never truncated.
13461[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013462
13463static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090013464unicode_zfill_impl(PyObject *self, Py_ssize_t width)
INADA Naoki15f94592017-01-16 21:49:13 +090013465/*[clinic end generated code: output=e13fb6bdf8e3b9df input=c6b2f772c6f27799]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013466{
Martin v. Löwis18e16552006-02-15 17:27:45 +000013467 Py_ssize_t fill;
Victor Stinner9310abb2011-10-05 00:59:23 +020013468 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013469 int kind;
13470 void *data;
13471 Py_UCS4 chr;
13472
Benjamin Petersonbac79492012-01-14 13:34:47 -050013473 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010013474 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013475
Victor Stinnerc4b49542011-12-11 22:44:26 +010013476 if (PyUnicode_GET_LENGTH(self) >= width)
13477 return unicode_result_unchanged(self);
13478
13479 fill = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013480
13481 u = pad(self, fill, 0, '0');
13482
Walter Dörwald068325e2002-04-15 13:36:47 +000013483 if (u == NULL)
13484 return NULL;
13485
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013486 kind = PyUnicode_KIND(u);
13487 data = PyUnicode_DATA(u);
13488 chr = PyUnicode_READ(kind, data, fill);
13489
13490 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000013491 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013492 PyUnicode_WRITE(kind, data, 0, chr);
13493 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000013494 }
13495
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013496 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +010013497 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013498}
Guido van Rossumd57fd912000-03-10 22:53:23 +000013499
#if 0
/* Compiled out by the surrounding #if 0.  Forwards self to
   PyUnicode_TransformDecimalAndSpaceToASCII(). */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *converted = PyUnicode_TransformDecimalAndSpaceToASCII(self);

    return converted;
}
#endif
13507
/* Help text for S.startswith(), declared under the name startswith__doc__.
   The continuation lines start at column 0 because any leading whitespace
   would become part of the string. */
PyDoc_STRVAR(startswith__doc__,
             "S.startswith(prefix[, start[, end]]) -> bool\n\
\n\
Return True if S starts with the specified prefix, False otherwise.\n\
With optional start, test S beginning at that position.\n\
With optional end, stop comparing S at that position.\n\
prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013515
13516static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013517unicode_startswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013518 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013519{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013520 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013521 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013522 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013523 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013524 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013525
Jesus Ceaac451502011-04-20 17:09:23 +020013526 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013527 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013528 if (PyTuple_Check(subobj)) {
13529 Py_ssize_t i;
13530 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013531 substring = PyTuple_GET_ITEM(subobj, i);
13532 if (!PyUnicode_Check(substring)) {
13533 PyErr_Format(PyExc_TypeError,
13534 "tuple for startswith must only contain str, "
Victor Stinner998b8062018-09-12 00:23:25 +020013535 "not %.100s",
13536 Py_TYPE(substring)->tp_name);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013537 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013538 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013539 result = tailmatch(self, substring, start, end, -1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013540 if (result == -1)
13541 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013542 if (result) {
13543 Py_RETURN_TRUE;
13544 }
13545 }
13546 /* nothing matched */
13547 Py_RETURN_FALSE;
13548 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013549 if (!PyUnicode_Check(subobj)) {
13550 PyErr_Format(PyExc_TypeError,
13551 "startswith first arg must be str or "
Victor Stinner998b8062018-09-12 00:23:25 +020013552 "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013553 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013554 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013555 result = tailmatch(self, subobj, start, end, -1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013556 if (result == -1)
13557 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013558 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013559}
13560
13561
/* Help text for S.endswith(), declared under the name endswith__doc__.
   The continuation lines start at column 0 because any leading whitespace
   would become part of the string. */
PyDoc_STRVAR(endswith__doc__,
             "S.endswith(suffix[, start[, end]]) -> bool\n\
\n\
Return True if S ends with the specified suffix, False otherwise.\n\
With optional start, test S beginning at that position.\n\
With optional end, stop comparing S at that position.\n\
suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013569
13570static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013571unicode_endswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013572 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013573{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013574 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013575 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013576 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013577 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013578 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013579
Jesus Ceaac451502011-04-20 17:09:23 +020013580 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013581 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013582 if (PyTuple_Check(subobj)) {
13583 Py_ssize_t i;
13584 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013585 substring = PyTuple_GET_ITEM(subobj, i);
13586 if (!PyUnicode_Check(substring)) {
13587 PyErr_Format(PyExc_TypeError,
13588 "tuple for endswith must only contain str, "
Victor Stinner998b8062018-09-12 00:23:25 +020013589 "not %.100s",
13590 Py_TYPE(substring)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013591 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013592 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013593 result = tailmatch(self, substring, start, end, +1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013594 if (result == -1)
13595 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013596 if (result) {
13597 Py_RETURN_TRUE;
13598 }
13599 }
13600 Py_RETURN_FALSE;
13601 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013602 if (!PyUnicode_Check(subobj)) {
13603 PyErr_Format(PyExc_TypeError,
13604 "endswith first arg must be str or "
Victor Stinner998b8062018-09-12 00:23:25 +020013605 "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013606 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013607 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013608 result = tailmatch(self, subobj, start, end, +1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013609 if (result == -1)
13610 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013611 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013612}
13613
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070013614static inline void
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013615_PyUnicodeWriter_Update(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013616{
Victor Stinnereb36fda2015-10-03 01:55:51 +020013617 writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer);
13618 writer->data = PyUnicode_DATA(writer->buffer);
13619
13620 if (!writer->readonly) {
13621 writer->kind = PyUnicode_KIND(writer->buffer);
Victor Stinner8f674cc2013-04-17 23:02:17 +020013622 writer->size = PyUnicode_GET_LENGTH(writer->buffer);
Victor Stinnereb36fda2015-10-03 01:55:51 +020013623 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013624 else {
Victor Stinnereb36fda2015-10-03 01:55:51 +020013625 /* use a value smaller than PyUnicode_1BYTE_KIND() so
13626 _PyUnicodeWriter_PrepareKind() will copy the buffer. */
13627 writer->kind = PyUnicode_WCHAR_KIND;
13628 assert(writer->kind <= PyUnicode_1BYTE_KIND);
13629
Victor Stinner8f674cc2013-04-17 23:02:17 +020013630 /* Copy-on-write mode: set buffer size to 0 so
13631 * _PyUnicodeWriter_Prepare() will copy (and enlarge) the buffer on
13632 * next write. */
13633 writer->size = 0;
13634 }
Victor Stinner202fdca2012-05-07 12:47:02 +020013635}
13636
Victor Stinnerd3f08822012-05-29 12:57:52 +020013637void
Victor Stinner8f674cc2013-04-17 23:02:17 +020013638_PyUnicodeWriter_Init(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013639{
Victor Stinnerd3f08822012-05-29 12:57:52 +020013640 memset(writer, 0, sizeof(*writer));
Victor Stinnereb36fda2015-10-03 01:55:51 +020013641
13642 /* ASCII is the bare minimum */
Victor Stinner8f674cc2013-04-17 23:02:17 +020013643 writer->min_char = 127;
Victor Stinnereb36fda2015-10-03 01:55:51 +020013644
13645 /* use a value smaller than PyUnicode_1BYTE_KIND() so
13646 _PyUnicodeWriter_PrepareKind() will copy the buffer. */
13647 writer->kind = PyUnicode_WCHAR_KIND;
13648 assert(writer->kind <= PyUnicode_1BYTE_KIND);
Victor Stinner202fdca2012-05-07 12:47:02 +020013649}
13650
Inada Naoki770847a2019-06-24 12:30:24 +090013651// Initialize _PyUnicodeWriter with initial buffer
13652static inline void
13653_PyUnicodeWriter_InitWithBuffer(_PyUnicodeWriter *writer, PyObject *buffer)
13654{
13655 memset(writer, 0, sizeof(*writer));
13656 writer->buffer = buffer;
13657 _PyUnicodeWriter_Update(writer);
13658 writer->min_length = writer->size;
13659}
13660
/* Make sure the writer's buffer can hold `length` more characters starting
   at writer->pos, with characters up to `maxchar`.

   Depending on the writer state this allocates the first buffer, grows the
   existing one (optionally widening it), or only widens it.  The cached
   writer fields are refreshed before returning.

   Return 0 on success, -1 on error.  On error writer->buffer is left
   unchanged. */
int
_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
                                 Py_ssize_t length, Py_UCS4 maxchar)
{
    Py_ssize_t newlen;
    PyObject *newbuffer;

    assert(maxchar <= MAX_UNICODE);

    /* ensure that the _PyUnicodeWriter_Prepare macro was used */
    assert((maxchar > writer->maxchar && length >= 0)
           || length > 0);

    /* Guard the addition below against Py_ssize_t overflow. */
    if (length > PY_SSIZE_T_MAX - writer->pos) {
        PyErr_NoMemory();
        return -1;
    }
    newlen = writer->pos + length;

    /* Never allocate narrower than the writer's configured minimum. */
    maxchar = Py_MAX(maxchar, writer->min_char);

    if (writer->buffer == NULL) {
        /* Case 1: no buffer yet, allocate the first one. */
        assert(!writer->readonly);
        if (writer->overallocate
            && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
            /* overallocate to limit the number of realloc() */
            newlen += newlen / OVERALLOCATE_FACTOR;
        }
        if (newlen < writer->min_length)
            newlen = writer->min_length;

        writer->buffer = PyUnicode_New(newlen, maxchar);
        if (writer->buffer == NULL)
            return -1;
    }
    else if (newlen > writer->size) {
        /* Case 2: the buffer is too short (a read-only buffer always lands
           here because its cached size is 0). */
        if (writer->overallocate
            && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
            /* overallocate to limit the number of realloc() */
            newlen += newlen / OVERALLOCATE_FACTOR;
        }
        if (newlen < writer->min_length)
            newlen = writer->min_length;

        if (maxchar > writer->maxchar || writer->readonly) {
            /* resize + widen */
            /* A fresh object is needed: copy what was written so far, then
               drop the old buffer. */
            maxchar = Py_MAX(maxchar, writer->maxchar);
            newbuffer = PyUnicode_New(newlen, maxchar);
            if (newbuffer == NULL)
                return -1;
            _PyUnicode_FastCopyCharacters(newbuffer, 0,
                                          writer->buffer, 0, writer->pos);
            Py_DECREF(writer->buffer);
            writer->readonly = 0;
        }
        else {
            /* Same character width: resize the existing object. */
            newbuffer = resize_compact(writer->buffer, newlen);
            if (newbuffer == NULL)
                return -1;
        }
        writer->buffer = newbuffer;
    }
    else if (maxchar > writer->maxchar) {
        /* Case 3: enough room, but the buffer must be widened. */
        assert(!writer->readonly);
        newbuffer = PyUnicode_New(writer->size, maxchar);
        if (newbuffer == NULL)
            return -1;
        _PyUnicode_FastCopyCharacters(newbuffer, 0,
                                      writer->buffer, 0, writer->pos);
        Py_SETREF(writer->buffer, newbuffer);
    }
    /* Reload maxchar/data/kind/size from the (possibly new) buffer. */
    _PyUnicodeWriter_Update(writer);
    return 0;

#undef OVERALLOCATE_FACTOR
}
13737
Victor Stinnerca9381e2015-09-22 00:58:32 +020013738int
13739_PyUnicodeWriter_PrepareKindInternal(_PyUnicodeWriter *writer,
13740 enum PyUnicode_Kind kind)
13741{
13742 Py_UCS4 maxchar;
13743
13744 /* ensure that the _PyUnicodeWriter_PrepareKind macro was used */
13745 assert(writer->kind < kind);
13746
13747 switch (kind)
13748 {
13749 case PyUnicode_1BYTE_KIND: maxchar = 0xff; break;
13750 case PyUnicode_2BYTE_KIND: maxchar = 0xffff; break;
13751 case PyUnicode_4BYTE_KIND: maxchar = 0x10ffff; break;
13752 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070013753 Py_UNREACHABLE();
Victor Stinnerca9381e2015-09-22 00:58:32 +020013754 }
13755
13756 return _PyUnicodeWriter_PrepareInternal(writer, 0, maxchar);
13757}
13758
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070013759static inline int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020013760_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch)
Victor Stinnera0dd0212013-04-11 22:09:04 +020013761{
Victor Stinner2740e462016-09-06 16:58:36 -070013762 assert(ch <= MAX_UNICODE);
Victor Stinnera0dd0212013-04-11 22:09:04 +020013763 if (_PyUnicodeWriter_Prepare(writer, 1, ch) < 0)
13764 return -1;
13765 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, ch);
13766 writer->pos++;
13767 return 0;
13768}
13769
13770int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020013771_PyUnicodeWriter_WriteChar(_PyUnicodeWriter *writer, Py_UCS4 ch)
13772{
13773 return _PyUnicodeWriter_WriteCharInline(writer, ch);
13774}
13775
13776int
Victor Stinnerd3f08822012-05-29 12:57:52 +020013777_PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str)
13778{
13779 Py_UCS4 maxchar;
13780 Py_ssize_t len;
13781
13782 if (PyUnicode_READY(str) == -1)
13783 return -1;
13784 len = PyUnicode_GET_LENGTH(str);
13785 if (len == 0)
13786 return 0;
13787 maxchar = PyUnicode_MAX_CHAR_VALUE(str);
13788 if (maxchar > writer->maxchar || len > writer->size - writer->pos) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013789 if (writer->buffer == NULL && !writer->overallocate) {
Victor Stinner1912b392015-03-26 09:37:23 +010013790 assert(_PyUnicode_CheckConsistency(str, 1));
Victor Stinner8f674cc2013-04-17 23:02:17 +020013791 writer->readonly = 1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013792 Py_INCREF(str);
13793 writer->buffer = str;
13794 _PyUnicodeWriter_Update(writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013795 writer->pos += len;
13796 return 0;
13797 }
13798 if (_PyUnicodeWriter_PrepareInternal(writer, len, maxchar) == -1)
13799 return -1;
13800 }
13801 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13802 str, 0, len);
13803 writer->pos += len;
13804 return 0;
13805}
13806
Victor Stinnere215d962012-10-06 23:03:36 +020013807int
Victor Stinnercfc4c132013-04-03 01:48:39 +020013808_PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter *writer, PyObject *str,
13809 Py_ssize_t start, Py_ssize_t end)
13810{
13811 Py_UCS4 maxchar;
13812 Py_ssize_t len;
13813
13814 if (PyUnicode_READY(str) == -1)
13815 return -1;
13816
13817 assert(0 <= start);
13818 assert(end <= PyUnicode_GET_LENGTH(str));
13819 assert(start <= end);
13820
13821 if (end == 0)
13822 return 0;
13823
13824 if (start == 0 && end == PyUnicode_GET_LENGTH(str))
13825 return _PyUnicodeWriter_WriteStr(writer, str);
13826
13827 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
13828 maxchar = _PyUnicode_FindMaxChar(str, start, end);
13829 else
13830 maxchar = writer->maxchar;
13831 len = end - start;
13832
13833 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) < 0)
13834 return -1;
13835
13836 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13837 str, start, len);
13838 writer->pos += len;
13839 return 0;
13840}
13841
13842int
Victor Stinner4a587072013-11-19 12:54:53 +010013843_PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter *writer,
13844 const char *ascii, Py_ssize_t len)
13845{
13846 if (len == -1)
13847 len = strlen(ascii);
13848
Andy Lestere6be9b52020-02-11 20:28:35 -060013849 assert(ucs1lib_find_max_char((const Py_UCS1*)ascii, (const Py_UCS1*)ascii + len) < 128);
Victor Stinner4a587072013-11-19 12:54:53 +010013850
13851 if (writer->buffer == NULL && !writer->overallocate) {
13852 PyObject *str;
13853
13854 str = _PyUnicode_FromASCII(ascii, len);
13855 if (str == NULL)
13856 return -1;
13857
13858 writer->readonly = 1;
13859 writer->buffer = str;
13860 _PyUnicodeWriter_Update(writer);
13861 writer->pos += len;
13862 return 0;
13863 }
13864
13865 if (_PyUnicodeWriter_Prepare(writer, len, 127) == -1)
13866 return -1;
13867
13868 switch (writer->kind)
13869 {
13870 case PyUnicode_1BYTE_KIND:
13871 {
13872 const Py_UCS1 *str = (const Py_UCS1 *)ascii;
13873 Py_UCS1 *data = writer->data;
13874
Christian Heimesf051e432016-09-13 20:22:02 +020013875 memcpy(data + writer->pos, str, len);
Victor Stinner4a587072013-11-19 12:54:53 +010013876 break;
13877 }
13878 case PyUnicode_2BYTE_KIND:
13879 {
13880 _PyUnicode_CONVERT_BYTES(
13881 Py_UCS1, Py_UCS2,
13882 ascii, ascii + len,
13883 (Py_UCS2 *)writer->data + writer->pos);
13884 break;
13885 }
13886 case PyUnicode_4BYTE_KIND:
13887 {
13888 _PyUnicode_CONVERT_BYTES(
13889 Py_UCS1, Py_UCS4,
13890 ascii, ascii + len,
13891 (Py_UCS4 *)writer->data + writer->pos);
13892 break;
13893 }
13894 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070013895 Py_UNREACHABLE();
Victor Stinner4a587072013-11-19 12:54:53 +010013896 }
13897
13898 writer->pos += len;
13899 return 0;
13900}
13901
13902int
13903_PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter *writer,
13904 const char *str, Py_ssize_t len)
Victor Stinnere215d962012-10-06 23:03:36 +020013905{
13906 Py_UCS4 maxchar;
13907
Andy Lestere6be9b52020-02-11 20:28:35 -060013908 maxchar = ucs1lib_find_max_char((const Py_UCS1*)str, (const Py_UCS1*)str + len);
Victor Stinnere215d962012-10-06 23:03:36 +020013909 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) == -1)
13910 return -1;
13911 unicode_write_cstr(writer->buffer, writer->pos, str, len);
13912 writer->pos += len;
13913 return 0;
13914}
13915
Victor Stinnerd3f08822012-05-29 12:57:52 +020013916PyObject *
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013917_PyUnicodeWriter_Finish(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013918{
Victor Stinner15a0bd32013-07-08 22:29:55 +020013919 PyObject *str;
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030013920
Victor Stinnerd3f08822012-05-29 12:57:52 +020013921 if (writer->pos == 0) {
Victor Stinner9e6b4d72013-07-09 00:37:24 +020013922 Py_CLEAR(writer->buffer);
Serhiy Storchaka678db842013-01-26 12:16:36 +020013923 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3f08822012-05-29 12:57:52 +020013924 }
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030013925
13926 str = writer->buffer;
13927 writer->buffer = NULL;
13928
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013929 if (writer->readonly) {
Victor Stinner9e6b4d72013-07-09 00:37:24 +020013930 assert(PyUnicode_GET_LENGTH(str) == writer->pos);
13931 return str;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013932 }
Victor Stinner6c2cdae2015-10-12 13:29:43 +020013933
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030013934 if (PyUnicode_GET_LENGTH(str) != writer->pos) {
13935 PyObject *str2;
13936 str2 = resize_compact(str, writer->pos);
13937 if (str2 == NULL) {
13938 Py_DECREF(str);
13939 return NULL;
Victor Stinner6c2cdae2015-10-12 13:29:43 +020013940 }
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030013941 str = str2;
Victor Stinner6c2cdae2015-10-12 13:29:43 +020013942 }
13943
Victor Stinner15a0bd32013-07-08 22:29:55 +020013944 assert(_PyUnicode_CheckConsistency(str, 1));
13945 return unicode_result_ready(str);
Victor Stinner202fdca2012-05-07 12:47:02 +020013946}
13947
Victor Stinnerd3f08822012-05-29 12:57:52 +020013948void
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013949_PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013950{
13951 Py_CLEAR(writer->buffer);
13952}
13953
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013954#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000013955
/* Docstring attached to the "format" entry of unicode_methods. */
PyDoc_STRVAR(format__doc__,
             "S.format(*args, **kwargs) -> str\n\
\n\
Return a formatted version of S, using substitutions from args and kwargs.\n\
The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000013961
/* Docstring attached to the "format_map" entry of unicode_methods. */
PyDoc_STRVAR(format_map__doc__,
             "S.format_map(mapping) -> str\n\
\n\
Return a formatted version of S, using substitutions from mapping.\n\
The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000013967
INADA Naoki3ae20562017-01-16 20:41:20 +090013968/*[clinic input]
13969str.__format__ as unicode___format__
13970
13971 format_spec: unicode
13972 /
13973
13974Return a formatted version of the string as described by format_spec.
13975[clinic start generated code]*/
13976
Eric Smith4a7d76d2008-05-30 18:10:19 +000013977static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090013978unicode___format___impl(PyObject *self, PyObject *format_spec)
INADA Naoki15f94592017-01-16 21:49:13 +090013979/*[clinic end generated code: output=45fceaca6d2ba4c8 input=5e135645d167a214]*/
Eric Smith4a7d76d2008-05-30 18:10:19 +000013980{
Victor Stinnerd3f08822012-05-29 12:57:52 +020013981 _PyUnicodeWriter writer;
13982 int ret;
Eric Smith4a7d76d2008-05-30 18:10:19 +000013983
Victor Stinnerd3f08822012-05-29 12:57:52 +020013984 if (PyUnicode_READY(self) == -1)
13985 return NULL;
Victor Stinner8f674cc2013-04-17 23:02:17 +020013986 _PyUnicodeWriter_Init(&writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013987 ret = _PyUnicode_FormatAdvancedWriter(&writer,
13988 self, format_spec, 0,
13989 PyUnicode_GET_LENGTH(format_spec));
13990 if (ret == -1) {
13991 _PyUnicodeWriter_Dealloc(&writer);
13992 return NULL;
13993 }
13994 return _PyUnicodeWriter_Finish(&writer);
Eric Smith4a7d76d2008-05-30 18:10:19 +000013995}
13996
INADA Naoki3ae20562017-01-16 20:41:20 +090013997/*[clinic input]
13998str.__sizeof__ as unicode_sizeof
13999
14000Return the size of the string in memory, in bytes.
14001[clinic start generated code]*/
Eric Smith8c663262007-08-25 02:26:07 +000014002
14003static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090014004unicode_sizeof_impl(PyObject *self)
14005/*[clinic end generated code: output=6dbc2f5a408b6d4f input=6dd011c108e33fb0]*/
Georg Brandlc28e1fa2008-06-10 19:20:26 +000014006{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014007 Py_ssize_t size;
14008
14009 /* If it's a compact object, account for base structure +
14010 character data. */
INADA Naoki3ae20562017-01-16 20:41:20 +090014011 if (PyUnicode_IS_COMPACT_ASCII(self))
14012 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(self) + 1;
14013 else if (PyUnicode_IS_COMPACT(self))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014014 size = sizeof(PyCompactUnicodeObject) +
INADA Naoki3ae20562017-01-16 20:41:20 +090014015 (PyUnicode_GET_LENGTH(self) + 1) * PyUnicode_KIND(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014016 else {
14017 /* If it is a two-block object, account for base object, and
14018 for character block if present. */
14019 size = sizeof(PyUnicodeObject);
INADA Naoki3ae20562017-01-16 20:41:20 +090014020 if (_PyUnicode_DATA_ANY(self))
14021 size += (PyUnicode_GET_LENGTH(self) + 1) *
14022 PyUnicode_KIND(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014023 }
14024 /* If the wstr pointer is present, account for it unless it is shared
Victor Stinnera3be6132011-10-03 02:16:37 +020014025 with the data pointer. Check if the data is not shared. */
INADA Naoki3ae20562017-01-16 20:41:20 +090014026 if (_PyUnicode_HAS_WSTR_MEMORY(self))
14027 size += (PyUnicode_WSTR_LENGTH(self) + 1) * sizeof(wchar_t);
14028 if (_PyUnicode_HAS_UTF8_MEMORY(self))
14029 size += PyUnicode_UTF8_LENGTH(self) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014030
14031 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000014032}
14033
Georg Brandlc28e1fa2008-06-10 19:20:26 +000014034static PyObject *
Siddhesh Poyarekar55edd0c2018-04-30 00:29:33 +053014035unicode_getnewargs(PyObject *v, PyObject *Py_UNUSED(ignored))
Guido van Rossum5d9113d2003-01-29 17:58:45 +000014036{
Victor Stinnerbf6e5602011-12-12 01:53:47 +010014037 PyObject *copy = _PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014038 if (!copy)
14039 return NULL;
14040 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000014041}
14042
/* Method table for the string type.

   Entries written as UNICODE_*_METHODDEF are macros defined outside this
   part of the file (presumably generated by Argument Clinic, like the
   str.__format__ and str.__sizeof__ blocks above -- TODO confirm).  The
   remaining entries are spelled out by hand: name, C function, calling
   convention flags and, where given, the docstring.  The table ends with a
   {NULL, NULL} sentinel. */
static PyMethodDef unicode_methods[] = {
    UNICODE_ENCODE_METHODDEF
    UNICODE_REPLACE_METHODDEF
    UNICODE_SPLIT_METHODDEF
    UNICODE_RSPLIT_METHODDEF
    UNICODE_JOIN_METHODDEF
    UNICODE_CAPITALIZE_METHODDEF
    UNICODE_CASEFOLD_METHODDEF
    UNICODE_TITLE_METHODDEF
    UNICODE_CENTER_METHODDEF
    {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
    UNICODE_EXPANDTABS_METHODDEF
    {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
    UNICODE_PARTITION_METHODDEF
    {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
    UNICODE_LJUST_METHODDEF
    UNICODE_LOWER_METHODDEF
    UNICODE_LSTRIP_METHODDEF
    {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
    {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
    UNICODE_RJUST_METHODDEF
    UNICODE_RSTRIP_METHODDEF
    UNICODE_RPARTITION_METHODDEF
    UNICODE_SPLITLINES_METHODDEF
    UNICODE_STRIP_METHODDEF
    UNICODE_SWAPCASE_METHODDEF
    UNICODE_TRANSLATE_METHODDEF
    UNICODE_UPPER_METHODDEF
    {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
    {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
    UNICODE_ISASCII_METHODDEF
    UNICODE_ISLOWER_METHODDEF
    UNICODE_ISUPPER_METHODDEF
    UNICODE_ISTITLE_METHODDEF
    UNICODE_ISSPACE_METHODDEF
    UNICODE_ISDECIMAL_METHODDEF
    UNICODE_ISDIGIT_METHODDEF
    UNICODE_ISNUMERIC_METHODDEF
    UNICODE_ISALPHA_METHODDEF
    UNICODE_ISALNUM_METHODDEF
    UNICODE_ISIDENTIFIER_METHODDEF
    UNICODE_ISPRINTABLE_METHODDEF
    UNICODE_ZFILL_METHODDEF
    /* do_string_format takes keyword arguments, hence the cast through
       void(*)(void) before PyCFunction. */
    {"format", (PyCFunction)(void(*)(void)) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
    {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
    UNICODE___FORMAT___METHODDEF
    UNICODE_MAKETRANS_METHODDEF
    UNICODE_SIZEOF_METHODDEF
#if 0
    /* These methods are just used for debugging the implementation. */
    {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
#endif

    /* No docstring is supplied for __getnewargs__. */
    {"__getnewargs__", unicode_getnewargs, METH_NOARGS},
    {NULL, NULL}
};
14099
Neil Schemenauerce30bc92002-11-18 16:10:18 +000014100static PyObject *
14101unicode_mod(PyObject *v, PyObject *w)
14102{
Brian Curtindfc80e32011-08-10 20:28:54 -050014103 if (!PyUnicode_Check(v))
14104 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000014105 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000014106}
14107
14108static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014109 0, /*nb_add*/
14110 0, /*nb_subtract*/
14111 0, /*nb_multiply*/
14112 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +000014113};
14114
Guido van Rossumd57fd912000-03-10 22:53:23 +000014115static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014116 (lenfunc) unicode_length, /* sq_length */
14117 PyUnicode_Concat, /* sq_concat */
14118 (ssizeargfunc) unicode_repeat, /* sq_repeat */
14119 (ssizeargfunc) unicode_getitem, /* sq_item */
14120 0, /* sq_slice */
14121 0, /* sq_ass_item */
14122 0, /* sq_ass_slice */
14123 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +000014124};
14125
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014126static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014127unicode_subscript(PyObject* self, PyObject* item)
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014128{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014129 if (PyUnicode_READY(self) == -1)
14130 return NULL;
14131
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000014132 if (PyIndex_Check(item)) {
14133 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014134 if (i == -1 && PyErr_Occurred())
14135 return NULL;
14136 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014137 i += PyUnicode_GET_LENGTH(self);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014138 return unicode_getitem(self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014139 } else if (PySlice_Check(item)) {
Zackery Spytz14514d92019-05-17 01:13:03 -060014140 Py_ssize_t start, stop, step, slicelength, i;
14141 size_t cur;
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014142 PyObject *result;
14143 void *src_data, *dest_data;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014144 int src_kind, dest_kind;
Victor Stinnerc80d6d22011-10-05 14:13:28 +020014145 Py_UCS4 ch, max_char, kind_limit;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014146
Serhiy Storchakab879fe82017-04-08 09:53:51 +030014147 if (PySlice_Unpack(item, &start, &stop, &step) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014148 return NULL;
14149 }
Serhiy Storchakab879fe82017-04-08 09:53:51 +030014150 slicelength = PySlice_AdjustIndices(PyUnicode_GET_LENGTH(self),
14151 &start, &stop, step);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014152
14153 if (slicelength <= 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020014154 _Py_RETURN_UNICODE_EMPTY();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014155 } else if (start == 0 && step == 1 &&
Victor Stinnerc4b49542011-12-11 22:44:26 +010014156 slicelength == PyUnicode_GET_LENGTH(self)) {
14157 return unicode_result_unchanged(self);
Thomas Woutersed03b412007-08-28 21:37:11 +000014158 } else if (step == 1) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014159 return PyUnicode_Substring(self,
Victor Stinner12bab6d2011-10-01 01:53:49 +020014160 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014161 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014162 /* General case */
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014163 src_kind = PyUnicode_KIND(self);
14164 src_data = PyUnicode_DATA(self);
Victor Stinner55c99112011-10-13 01:17:06 +020014165 if (!PyUnicode_IS_ASCII(self)) {
14166 kind_limit = kind_maxchar_limit(src_kind);
14167 max_char = 0;
14168 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
14169 ch = PyUnicode_READ(src_kind, src_data, cur);
14170 if (ch > max_char) {
14171 max_char = ch;
14172 if (max_char >= kind_limit)
14173 break;
14174 }
Victor Stinnerc80d6d22011-10-05 14:13:28 +020014175 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014176 }
Victor Stinner55c99112011-10-13 01:17:06 +020014177 else
14178 max_char = 127;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014179 result = PyUnicode_New(slicelength, max_char);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014180 if (result == NULL)
14181 return NULL;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014182 dest_kind = PyUnicode_KIND(result);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014183 dest_data = PyUnicode_DATA(result);
14184
14185 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014186 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
14187 PyUnicode_WRITE(dest_kind, dest_data, i, ch);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014188 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020014189 assert(_PyUnicode_CheckConsistency(result, 1));
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014190 return result;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014191 } else {
14192 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
14193 return NULL;
14194 }
14195}
14196
14197static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014198 (lenfunc)unicode_length, /* mp_length */
14199 (binaryfunc)unicode_subscript, /* mp_subscript */
14200 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014201};
14202
Guido van Rossumd57fd912000-03-10 22:53:23 +000014203
Guido van Rossumd57fd912000-03-10 22:53:23 +000014204/* Helpers for PyUnicode_Format() */
14205
/* State shared by the PyUnicode_Format() helpers while a format string is
   processed. */
struct unicode_formatter_t {
    /* Arguments to format.  unicode_format_getnextarg() indexes this as a
       tuple, except when arglen is negative, in which case it returns the
       object itself. */
    PyObject *args;
    int args_owned;     /* presumably: non-zero if args must be released -- TODO confirm */
    /* arglen: number of available arguments (negative: see args above);
       argidx: index of the next argument to hand out. */
    Py_ssize_t arglen, argidx;
    PyObject *dict;     /* presumably the mapping used for %(name)s lookups -- TODO confirm */

    /* Format string being parsed -- NOTE(review): exact meaning of
       fmtcnt/fmtpos is not visible here; verify against the parser. */
    enum PyUnicode_Kind fmtkind;
    Py_ssize_t fmtcnt, fmtpos;
    void *fmtdata;
    PyObject *fmtstr;

    /* Writer that accumulates the formatted output. */
    _PyUnicodeWriter writer;
};
14219
/* Description of a single conversion specifier of a format string. */
struct unicode_format_arg_t {
    Py_UCS4 ch;         /* conversion character; formatfloat() passes it on as the format code */
    int flags;          /* bitmask of F_* flags; formatfloat() tests F_ALT */
    Py_ssize_t width;   /* presumably the minimum field width -- TODO confirm */
    int prec;           /* precision; formatfloat() uses 6 when it is negative */
    int sign;           /* NOTE(review): meaning not visible here -- confirm */
};
14227
Guido van Rossumd57fd912000-03-10 22:53:23 +000014228static PyObject *
Victor Stinnera47082312012-10-04 02:19:54 +020014229unicode_format_getnextarg(struct unicode_formatter_t *ctx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014230{
Victor Stinnera47082312012-10-04 02:19:54 +020014231 Py_ssize_t argidx = ctx->argidx;
14232
14233 if (argidx < ctx->arglen) {
14234 ctx->argidx++;
14235 if (ctx->arglen < 0)
14236 return ctx->args;
Benjamin Peterson29060642009-01-31 22:14:21 +000014237 else
Victor Stinnera47082312012-10-04 02:19:54 +020014238 return PyTuple_GetItem(ctx->args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014239 }
14240 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000014241 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000014242 return NULL;
14243}
14244
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014245/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000014246
Victor Stinnera47082312012-10-04 02:19:54 +020014247/* Format a float into the writer if the writer is not NULL, or into *p_output
14248 otherwise.
14249
14250 Return 0 on success, raise an exception and return -1 on error. */
Victor Stinnerd3f08822012-05-29 12:57:52 +020014251static int
Victor Stinnera47082312012-10-04 02:19:54 +020014252formatfloat(PyObject *v, struct unicode_format_arg_t *arg,
14253 PyObject **p_output,
14254 _PyUnicodeWriter *writer)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014255{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014256 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014257 double x;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014258 Py_ssize_t len;
Victor Stinnera47082312012-10-04 02:19:54 +020014259 int prec;
14260 int dtoa_flags;
Tim Petersced69f82003-09-16 20:30:58 +000014261
Guido van Rossumd57fd912000-03-10 22:53:23 +000014262 x = PyFloat_AsDouble(v);
14263 if (x == -1.0 && PyErr_Occurred())
Victor Stinnerd3f08822012-05-29 12:57:52 +020014264 return -1;
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014265
Victor Stinnera47082312012-10-04 02:19:54 +020014266 prec = arg->prec;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014267 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000014268 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000014269
Victor Stinnera47082312012-10-04 02:19:54 +020014270 if (arg->flags & F_ALT)
14271 dtoa_flags = Py_DTSF_ALT;
14272 else
14273 dtoa_flags = 0;
14274 p = PyOS_double_to_string(x, arg->ch, prec, dtoa_flags, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014275 if (p == NULL)
Victor Stinnerd3f08822012-05-29 12:57:52 +020014276 return -1;
14277 len = strlen(p);
14278 if (writer) {
Victor Stinner4a587072013-11-19 12:54:53 +010014279 if (_PyUnicodeWriter_WriteASCIIString(writer, p, len) < 0) {
Christian Heimesf4f99392012-09-10 11:48:41 +020014280 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020014281 return -1;
Christian Heimesf4f99392012-09-10 11:48:41 +020014282 }
Victor Stinnerd3f08822012-05-29 12:57:52 +020014283 }
14284 else
14285 *p_output = _PyUnicode_FromASCII(p, len);
Eric Smith0923d1d2009-04-16 20:16:10 +000014286 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020014287 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014288}
14289
Victor Stinnerd0880d52012-04-27 23:40:13 +020014290/* formatlong() emulates the format codes d, u, o, x and X, and
14291 * the F_ALT flag, for Python's long (unbounded) ints. It's not used for
14292 * Python's regular ints.
14293 * Return value: a new PyUnicodeObject*, or NULL if error.
14294 * The output string is of the form
14295 * "-"? ("0x" | "0X")? digit+
14296 * "0x"/"0X" are present only for x and X conversions, with F_ALT
 *      set in flags.  The case of hex digits will be correct.
14298 * There will be at least prec digits, zero-filled on the left if
14299 * necessary to get that many.
14300 * val object to be converted
14301 * flags bitmask of format flags; only F_ALT is looked at
14302 * prec minimum number of digits; 0-fill on left if needed
14303 * type a character in [duoxX]; u acts the same as d
14304 *
14305 * CAUTION: o, x and X conversions on regular ints can never
14306 * produce a '-' sign, but can for Python's unbounded ints.
14307 */
Ethan Furmanb95b5612015-01-23 20:05:18 -080014308PyObject *
14309_PyUnicode_FormatLong(PyObject *val, int alt, int prec, int type)
Tim Peters38fd5b62000-09-21 05:43:11 +000014310{
Victor Stinnerd0880d52012-04-27 23:40:13 +020014311 PyObject *result = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014312 char *buf;
Victor Stinnerd0880d52012-04-27 23:40:13 +020014313 Py_ssize_t i;
14314 int sign; /* 1 if '-', else 0 */
14315 int len; /* number of characters */
14316 Py_ssize_t llen;
14317 int numdigits; /* len == numnondigits + numdigits */
14318 int numnondigits = 0;
Tim Peters38fd5b62000-09-21 05:43:11 +000014319
Victor Stinnerd0880d52012-04-27 23:40:13 +020014320 /* Avoid exceeding SSIZE_T_MAX */
14321 if (prec > INT_MAX-3) {
14322 PyErr_SetString(PyExc_OverflowError,
14323 "precision too large");
Benjamin Peterson14339b62009-01-31 16:36:08 +000014324 return NULL;
Victor Stinnerd0880d52012-04-27 23:40:13 +020014325 }
14326
14327 assert(PyLong_Check(val));
14328
14329 switch (type) {
Victor Stinner621ef3d2012-10-02 00:33:47 +020014330 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070014331 Py_UNREACHABLE();
Victor Stinnerd0880d52012-04-27 23:40:13 +020014332 case 'd':
Victor Stinner621ef3d2012-10-02 00:33:47 +020014333 case 'i':
Victor Stinnerd0880d52012-04-27 23:40:13 +020014334 case 'u':
Ethan Furmanfb137212013-08-31 10:18:55 -070014335 /* int and int subclasses should print numerically when a numeric */
14336 /* format code is used (see issue18780) */
14337 result = PyNumber_ToBase(val, 10);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014338 break;
14339 case 'o':
14340 numnondigits = 2;
14341 result = PyNumber_ToBase(val, 8);
14342 break;
14343 case 'x':
14344 case 'X':
14345 numnondigits = 2;
14346 result = PyNumber_ToBase(val, 16);
14347 break;
Victor Stinnerd0880d52012-04-27 23:40:13 +020014348 }
14349 if (!result)
14350 return NULL;
14351
14352 assert(unicode_modifiable(result));
14353 assert(PyUnicode_IS_READY(result));
14354 assert(PyUnicode_IS_ASCII(result));
14355
14356 /* To modify the string in-place, there can only be one reference. */
14357 if (Py_REFCNT(result) != 1) {
Christian Heimesd47802e2013-06-29 21:33:36 +020014358 Py_DECREF(result);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014359 PyErr_BadInternalCall();
14360 return NULL;
14361 }
14362 buf = PyUnicode_DATA(result);
14363 llen = PyUnicode_GET_LENGTH(result);
14364 if (llen > INT_MAX) {
Christian Heimesd47802e2013-06-29 21:33:36 +020014365 Py_DECREF(result);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014366 PyErr_SetString(PyExc_ValueError,
Ethan Furmanb95b5612015-01-23 20:05:18 -080014367 "string too large in _PyUnicode_FormatLong");
Victor Stinnerd0880d52012-04-27 23:40:13 +020014368 return NULL;
14369 }
14370 len = (int)llen;
14371 sign = buf[0] == '-';
14372 numnondigits += sign;
14373 numdigits = len - numnondigits;
14374 assert(numdigits > 0);
14375
14376 /* Get rid of base marker unless F_ALT */
Ethan Furmanb95b5612015-01-23 20:05:18 -080014377 if (((alt) == 0 &&
Victor Stinnerd0880d52012-04-27 23:40:13 +020014378 (type == 'o' || type == 'x' || type == 'X'))) {
14379 assert(buf[sign] == '0');
14380 assert(buf[sign+1] == 'x' || buf[sign+1] == 'X' ||
14381 buf[sign+1] == 'o');
14382 numnondigits -= 2;
14383 buf += 2;
14384 len -= 2;
14385 if (sign)
14386 buf[0] = '-';
14387 assert(len == numnondigits + numdigits);
14388 assert(numdigits > 0);
14389 }
14390
14391 /* Fill with leading zeroes to meet minimum width. */
14392 if (prec > numdigits) {
14393 PyObject *r1 = PyBytes_FromStringAndSize(NULL,
14394 numnondigits + prec);
14395 char *b1;
14396 if (!r1) {
14397 Py_DECREF(result);
14398 return NULL;
14399 }
14400 b1 = PyBytes_AS_STRING(r1);
14401 for (i = 0; i < numnondigits; ++i)
14402 *b1++ = *buf++;
14403 for (i = 0; i < prec - numdigits; i++)
14404 *b1++ = '0';
14405 for (i = 0; i < numdigits; i++)
14406 *b1++ = *buf++;
14407 *b1 = '\0';
14408 Py_DECREF(result);
14409 result = r1;
14410 buf = PyBytes_AS_STRING(result);
14411 len = numnondigits + prec;
14412 }
14413
14414 /* Fix up case for hex conversions. */
14415 if (type == 'X') {
14416 /* Need to convert all lower case letters to upper case.
14417 and need to convert 0x to 0X (and -0x to -0X). */
14418 for (i = 0; i < len; i++)
14419 if (buf[i] >= 'a' && buf[i] <= 'x')
14420 buf[i] -= 'a'-'A';
14421 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014422 if (!PyUnicode_Check(result)
14423 || buf != PyUnicode_DATA(result)) {
Victor Stinnerd0880d52012-04-27 23:40:13 +020014424 PyObject *unicode;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014425 unicode = _PyUnicode_FromASCII(buf, len);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014426 Py_DECREF(result);
14427 result = unicode;
14428 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014429 else if (len != PyUnicode_GET_LENGTH(result)) {
14430 if (PyUnicode_Resize(&result, len) < 0)
14431 Py_CLEAR(result);
14432 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000014433 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000014434}
14435
Ethan Furmandf3ed242014-01-05 06:50:30 -080014436/* Format an integer or a float as an integer.
Victor Stinner621ef3d2012-10-02 00:33:47 +020014437 * Return 1 if the number has been formatted into the writer,
Victor Stinnera47082312012-10-04 02:19:54 +020014438 * 0 if the number has been formatted into *p_output
Victor Stinner621ef3d2012-10-02 00:33:47 +020014439 * -1 and raise an exception on error */
14440static int
Victor Stinnera47082312012-10-04 02:19:54 +020014441mainformatlong(PyObject *v,
14442 struct unicode_format_arg_t *arg,
14443 PyObject **p_output,
14444 _PyUnicodeWriter *writer)
Victor Stinner621ef3d2012-10-02 00:33:47 +020014445{
14446 PyObject *iobj, *res;
Victor Stinnera47082312012-10-04 02:19:54 +020014447 char type = (char)arg->ch;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014448
14449 if (!PyNumber_Check(v))
14450 goto wrongtype;
14451
Ethan Furman9ab74802014-03-21 06:38:46 -070014452 /* make sure number is a type of integer for o, x, and X */
Victor Stinner621ef3d2012-10-02 00:33:47 +020014453 if (!PyLong_Check(v)) {
Ethan Furmandf3ed242014-01-05 06:50:30 -080014454 if (type == 'o' || type == 'x' || type == 'X') {
14455 iobj = PyNumber_Index(v);
14456 if (iobj == NULL) {
Ethan Furman9ab74802014-03-21 06:38:46 -070014457 if (PyErr_ExceptionMatches(PyExc_TypeError))
14458 goto wrongtype;
Ethan Furman38d872e2014-03-19 08:38:52 -070014459 return -1;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014460 }
14461 }
14462 else {
14463 iobj = PyNumber_Long(v);
14464 if (iobj == NULL ) {
14465 if (PyErr_ExceptionMatches(PyExc_TypeError))
14466 goto wrongtype;
14467 return -1;
14468 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014469 }
14470 assert(PyLong_Check(iobj));
14471 }
14472 else {
14473 iobj = v;
14474 Py_INCREF(iobj);
14475 }
14476
14477 if (PyLong_CheckExact(v)
Victor Stinnera47082312012-10-04 02:19:54 +020014478 && arg->width == -1 && arg->prec == -1
14479 && !(arg->flags & (F_SIGN | F_BLANK))
14480 && type != 'X')
Victor Stinner621ef3d2012-10-02 00:33:47 +020014481 {
14482 /* Fast path */
Victor Stinnera47082312012-10-04 02:19:54 +020014483 int alternate = arg->flags & F_ALT;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014484 int base;
14485
Victor Stinnera47082312012-10-04 02:19:54 +020014486 switch(type)
Victor Stinner621ef3d2012-10-02 00:33:47 +020014487 {
14488 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070014489 Py_UNREACHABLE();
Victor Stinner621ef3d2012-10-02 00:33:47 +020014490 case 'd':
14491 case 'i':
14492 case 'u':
14493 base = 10;
14494 break;
14495 case 'o':
14496 base = 8;
14497 break;
14498 case 'x':
14499 case 'X':
14500 base = 16;
14501 break;
14502 }
14503
Victor Stinnerc89d28f2012-10-02 12:54:07 +020014504 if (_PyLong_FormatWriter(writer, v, base, alternate) == -1) {
14505 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014506 return -1;
Victor Stinnerc89d28f2012-10-02 12:54:07 +020014507 }
14508 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014509 return 1;
14510 }
14511
Ethan Furmanb95b5612015-01-23 20:05:18 -080014512 res = _PyUnicode_FormatLong(iobj, arg->flags & F_ALT, arg->prec, type);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014513 Py_DECREF(iobj);
14514 if (res == NULL)
14515 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014516 *p_output = res;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014517 return 0;
14518
14519wrongtype:
Ethan Furman9ab74802014-03-21 06:38:46 -070014520 switch(type)
14521 {
14522 case 'o':
14523 case 'x':
14524 case 'X':
14525 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020014526 "%%%c format: an integer is required, "
14527 "not %.200s",
14528 type, Py_TYPE(v)->tp_name);
Ethan Furman9ab74802014-03-21 06:38:46 -070014529 break;
14530 default:
14531 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020014532 "%%%c format: a number is required, "
14533 "not %.200s",
14534 type, Py_TYPE(v)->tp_name);
Ethan Furman9ab74802014-03-21 06:38:46 -070014535 break;
14536 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014537 return -1;
14538}
14539
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014540static Py_UCS4
14541formatchar(PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014542{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000014543 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000014544 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014545 if (PyUnicode_GET_LENGTH(v) == 1) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014546 return PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000014547 }
Benjamin Peterson29060642009-01-31 22:14:21 +000014548 goto onError;
14549 }
14550 else {
Ethan Furmandf3ed242014-01-05 06:50:30 -080014551 PyObject *iobj;
Benjamin Peterson29060642009-01-31 22:14:21 +000014552 long x;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014553 /* make sure number is a type of integer */
14554 if (!PyLong_Check(v)) {
14555 iobj = PyNumber_Index(v);
14556 if (iobj == NULL) {
Ethan Furman38d872e2014-03-19 08:38:52 -070014557 goto onError;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014558 }
Xiang Zhangea1cf872016-12-22 15:30:47 +080014559 x = PyLong_AsLong(iobj);
Ethan Furmandf3ed242014-01-05 06:50:30 -080014560 Py_DECREF(iobj);
14561 }
Xiang Zhangea1cf872016-12-22 15:30:47 +080014562 else {
14563 x = PyLong_AsLong(v);
14564 }
Benjamin Peterson29060642009-01-31 22:14:21 +000014565 if (x == -1 && PyErr_Occurred())
14566 goto onError;
14567
Victor Stinner8faf8212011-12-08 22:14:11 +010014568 if (x < 0 || x > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014569 PyErr_SetString(PyExc_OverflowError,
14570 "%c arg not in range(0x110000)");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014571 return (Py_UCS4) -1;
Benjamin Peterson29060642009-01-31 22:14:21 +000014572 }
14573
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014574 return (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014575 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000014576
Benjamin Peterson29060642009-01-31 22:14:21 +000014577 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000014578 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000014579 "%c requires int or char");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014580 return (Py_UCS4) -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014581}
14582
Victor Stinnera47082312012-10-04 02:19:54 +020014583/* Parse options of an argument: flags, width, precision.
14584 Handle also "%(name)" syntax.
14585
14586 Return 0 if the argument has been formatted into arg->str.
14587 Return 1 if the argument has been written into ctx->writer,
14588 Raise an exception and return -1 on error. */
14589static int
14590unicode_format_arg_parse(struct unicode_formatter_t *ctx,
14591 struct unicode_format_arg_t *arg)
14592{
14593#define FORMAT_READ(ctx) \
14594 PyUnicode_READ((ctx)->fmtkind, (ctx)->fmtdata, (ctx)->fmtpos)
14595
14596 PyObject *v;
14597
Victor Stinnera47082312012-10-04 02:19:54 +020014598 if (arg->ch == '(') {
14599 /* Get argument value from a dictionary. Example: "%(name)s". */
14600 Py_ssize_t keystart;
14601 Py_ssize_t keylen;
14602 PyObject *key;
14603 int pcount = 1;
14604
14605 if (ctx->dict == NULL) {
14606 PyErr_SetString(PyExc_TypeError,
14607 "format requires a mapping");
14608 return -1;
14609 }
14610 ++ctx->fmtpos;
14611 --ctx->fmtcnt;
14612 keystart = ctx->fmtpos;
14613 /* Skip over balanced parentheses */
14614 while (pcount > 0 && --ctx->fmtcnt >= 0) {
14615 arg->ch = FORMAT_READ(ctx);
14616 if (arg->ch == ')')
14617 --pcount;
14618 else if (arg->ch == '(')
14619 ++pcount;
14620 ctx->fmtpos++;
14621 }
14622 keylen = ctx->fmtpos - keystart - 1;
14623 if (ctx->fmtcnt < 0 || pcount > 0) {
14624 PyErr_SetString(PyExc_ValueError,
14625 "incomplete format key");
14626 return -1;
14627 }
14628 key = PyUnicode_Substring(ctx->fmtstr,
14629 keystart, keystart + keylen);
14630 if (key == NULL)
14631 return -1;
14632 if (ctx->args_owned) {
Victor Stinnera47082312012-10-04 02:19:54 +020014633 ctx->args_owned = 0;
Serhiy Storchaka191321d2015-12-27 15:41:34 +020014634 Py_DECREF(ctx->args);
Victor Stinnera47082312012-10-04 02:19:54 +020014635 }
14636 ctx->args = PyObject_GetItem(ctx->dict, key);
14637 Py_DECREF(key);
14638 if (ctx->args == NULL)
14639 return -1;
14640 ctx->args_owned = 1;
14641 ctx->arglen = -1;
14642 ctx->argidx = -2;
14643 }
14644
14645 /* Parse flags. Example: "%+i" => flags=F_SIGN. */
Victor Stinnera47082312012-10-04 02:19:54 +020014646 while (--ctx->fmtcnt >= 0) {
14647 arg->ch = FORMAT_READ(ctx);
14648 ctx->fmtpos++;
14649 switch (arg->ch) {
14650 case '-': arg->flags |= F_LJUST; continue;
14651 case '+': arg->flags |= F_SIGN; continue;
14652 case ' ': arg->flags |= F_BLANK; continue;
14653 case '#': arg->flags |= F_ALT; continue;
14654 case '0': arg->flags |= F_ZERO; continue;
14655 }
14656 break;
14657 }
14658
14659 /* Parse width. Example: "%10s" => width=10 */
Victor Stinnera47082312012-10-04 02:19:54 +020014660 if (arg->ch == '*') {
14661 v = unicode_format_getnextarg(ctx);
14662 if (v == NULL)
14663 return -1;
14664 if (!PyLong_Check(v)) {
14665 PyErr_SetString(PyExc_TypeError,
14666 "* wants int");
14667 return -1;
14668 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020014669 arg->width = PyLong_AsSsize_t(v);
Victor Stinnera47082312012-10-04 02:19:54 +020014670 if (arg->width == -1 && PyErr_Occurred())
14671 return -1;
14672 if (arg->width < 0) {
14673 arg->flags |= F_LJUST;
14674 arg->width = -arg->width;
14675 }
14676 if (--ctx->fmtcnt >= 0) {
14677 arg->ch = FORMAT_READ(ctx);
14678 ctx->fmtpos++;
14679 }
14680 }
14681 else if (arg->ch >= '0' && arg->ch <= '9') {
14682 arg->width = arg->ch - '0';
14683 while (--ctx->fmtcnt >= 0) {
14684 arg->ch = FORMAT_READ(ctx);
14685 ctx->fmtpos++;
14686 if (arg->ch < '0' || arg->ch > '9')
14687 break;
14688 /* Since arg->ch is unsigned, the RHS would end up as unsigned,
14689 mixing signed and unsigned comparison. Since arg->ch is between
14690 '0' and '9', casting to int is safe. */
14691 if (arg->width > (PY_SSIZE_T_MAX - ((int)arg->ch - '0')) / 10) {
14692 PyErr_SetString(PyExc_ValueError,
14693 "width too big");
14694 return -1;
14695 }
14696 arg->width = arg->width*10 + (arg->ch - '0');
14697 }
14698 }
14699
14700 /* Parse precision. Example: "%.3f" => prec=3 */
Victor Stinnera47082312012-10-04 02:19:54 +020014701 if (arg->ch == '.') {
14702 arg->prec = 0;
14703 if (--ctx->fmtcnt >= 0) {
14704 arg->ch = FORMAT_READ(ctx);
14705 ctx->fmtpos++;
14706 }
14707 if (arg->ch == '*') {
14708 v = unicode_format_getnextarg(ctx);
14709 if (v == NULL)
14710 return -1;
14711 if (!PyLong_Check(v)) {
14712 PyErr_SetString(PyExc_TypeError,
14713 "* wants int");
14714 return -1;
14715 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020014716 arg->prec = _PyLong_AsInt(v);
Victor Stinnera47082312012-10-04 02:19:54 +020014717 if (arg->prec == -1 && PyErr_Occurred())
14718 return -1;
14719 if (arg->prec < 0)
14720 arg->prec = 0;
14721 if (--ctx->fmtcnt >= 0) {
14722 arg->ch = FORMAT_READ(ctx);
14723 ctx->fmtpos++;
14724 }
14725 }
14726 else if (arg->ch >= '0' && arg->ch <= '9') {
14727 arg->prec = arg->ch - '0';
14728 while (--ctx->fmtcnt >= 0) {
14729 arg->ch = FORMAT_READ(ctx);
14730 ctx->fmtpos++;
14731 if (arg->ch < '0' || arg->ch > '9')
14732 break;
14733 if (arg->prec > (INT_MAX - ((int)arg->ch - '0')) / 10) {
14734 PyErr_SetString(PyExc_ValueError,
Victor Stinner3921e902012-10-06 23:05:00 +020014735 "precision too big");
Victor Stinnera47082312012-10-04 02:19:54 +020014736 return -1;
14737 }
14738 arg->prec = arg->prec*10 + (arg->ch - '0');
14739 }
14740 }
14741 }
14742
14743 /* Ignore "h", "l" and "L" format prefix (ex: "%hi" or "%ls") */
14744 if (ctx->fmtcnt >= 0) {
14745 if (arg->ch == 'h' || arg->ch == 'l' || arg->ch == 'L') {
14746 if (--ctx->fmtcnt >= 0) {
14747 arg->ch = FORMAT_READ(ctx);
14748 ctx->fmtpos++;
14749 }
14750 }
14751 }
14752 if (ctx->fmtcnt < 0) {
14753 PyErr_SetString(PyExc_ValueError,
14754 "incomplete format");
14755 return -1;
14756 }
14757 return 0;
14758
14759#undef FORMAT_READ
14760}
14761
14762/* Format one argument. Supported conversion specifiers:
14763
14764 - "s", "r", "a": any type
Ethan Furmandf3ed242014-01-05 06:50:30 -080014765 - "i", "d", "u": int or float
14766 - "o", "x", "X": int
Victor Stinnera47082312012-10-04 02:19:54 +020014767 - "e", "E", "f", "F", "g", "G": float
14768 - "c": int or str (1 character)
14769
Victor Stinner8dbd4212012-12-04 09:30:24 +010014770 When possible, the output is written directly into the Unicode writer
14771 (ctx->writer). A string is created when padding is required.
14772
Victor Stinnera47082312012-10-04 02:19:54 +020014773 Return 0 if the argument has been formatted into *p_str,
14774 1 if the argument has been written into ctx->writer,
Victor Stinner8dbd4212012-12-04 09:30:24 +010014775 -1 on error. */
Victor Stinnera47082312012-10-04 02:19:54 +020014776static int
14777unicode_format_arg_format(struct unicode_formatter_t *ctx,
14778 struct unicode_format_arg_t *arg,
14779 PyObject **p_str)
14780{
14781 PyObject *v;
14782 _PyUnicodeWriter *writer = &ctx->writer;
14783
14784 if (ctx->fmtcnt == 0)
14785 ctx->writer.overallocate = 0;
14786
Victor Stinnera47082312012-10-04 02:19:54 +020014787 v = unicode_format_getnextarg(ctx);
14788 if (v == NULL)
14789 return -1;
14790
Victor Stinnera47082312012-10-04 02:19:54 +020014791
14792 switch (arg->ch) {
Victor Stinnera47082312012-10-04 02:19:54 +020014793 case 's':
14794 case 'r':
14795 case 'a':
14796 if (PyLong_CheckExact(v) && arg->width == -1 && arg->prec == -1) {
14797 /* Fast path */
14798 if (_PyLong_FormatWriter(writer, v, 10, arg->flags & F_ALT) == -1)
14799 return -1;
14800 return 1;
14801 }
14802
14803 if (PyUnicode_CheckExact(v) && arg->ch == 's') {
14804 *p_str = v;
14805 Py_INCREF(*p_str);
14806 }
14807 else {
14808 if (arg->ch == 's')
14809 *p_str = PyObject_Str(v);
14810 else if (arg->ch == 'r')
14811 *p_str = PyObject_Repr(v);
14812 else
14813 *p_str = PyObject_ASCII(v);
14814 }
14815 break;
14816
14817 case 'i':
14818 case 'd':
14819 case 'u':
14820 case 'o':
14821 case 'x':
14822 case 'X':
14823 {
14824 int ret = mainformatlong(v, arg, p_str, writer);
14825 if (ret != 0)
14826 return ret;
14827 arg->sign = 1;
14828 break;
14829 }
14830
14831 case 'e':
14832 case 'E':
14833 case 'f':
14834 case 'F':
14835 case 'g':
14836 case 'G':
14837 if (arg->width == -1 && arg->prec == -1
14838 && !(arg->flags & (F_SIGN | F_BLANK)))
14839 {
14840 /* Fast path */
14841 if (formatfloat(v, arg, NULL, writer) == -1)
14842 return -1;
14843 return 1;
14844 }
14845
14846 arg->sign = 1;
14847 if (formatfloat(v, arg, p_str, NULL) == -1)
14848 return -1;
14849 break;
14850
14851 case 'c':
14852 {
14853 Py_UCS4 ch = formatchar(v);
14854 if (ch == (Py_UCS4) -1)
14855 return -1;
14856 if (arg->width == -1 && arg->prec == -1) {
14857 /* Fast path */
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020014858 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0)
Victor Stinnera47082312012-10-04 02:19:54 +020014859 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014860 return 1;
14861 }
14862 *p_str = PyUnicode_FromOrdinal(ch);
14863 break;
14864 }
14865
14866 default:
14867 PyErr_Format(PyExc_ValueError,
14868 "unsupported format character '%c' (0x%x) "
Victor Stinnera33bce02014-07-04 22:47:46 +020014869 "at index %zd",
Victor Stinnera47082312012-10-04 02:19:54 +020014870 (31<=arg->ch && arg->ch<=126) ? (char)arg->ch : '?',
14871 (int)arg->ch,
14872 ctx->fmtpos - 1);
14873 return -1;
14874 }
14875 if (*p_str == NULL)
14876 return -1;
14877 assert (PyUnicode_Check(*p_str));
14878 return 0;
14879}
14880
14881static int
14882unicode_format_arg_output(struct unicode_formatter_t *ctx,
14883 struct unicode_format_arg_t *arg,
14884 PyObject *str)
14885{
14886 Py_ssize_t len;
14887 enum PyUnicode_Kind kind;
14888 void *pbuf;
14889 Py_ssize_t pindex;
14890 Py_UCS4 signchar;
14891 Py_ssize_t buflen;
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014892 Py_UCS4 maxchar;
Victor Stinnera47082312012-10-04 02:19:54 +020014893 Py_ssize_t sublen;
14894 _PyUnicodeWriter *writer = &ctx->writer;
14895 Py_UCS4 fill;
14896
14897 fill = ' ';
14898 if (arg->sign && arg->flags & F_ZERO)
14899 fill = '0';
14900
14901 if (PyUnicode_READY(str) == -1)
14902 return -1;
14903
14904 len = PyUnicode_GET_LENGTH(str);
14905 if ((arg->width == -1 || arg->width <= len)
14906 && (arg->prec == -1 || arg->prec >= len)
14907 && !(arg->flags & (F_SIGN | F_BLANK)))
14908 {
14909 /* Fast path */
14910 if (_PyUnicodeWriter_WriteStr(writer, str) == -1)
14911 return -1;
14912 return 0;
14913 }
14914
14915 /* Truncate the string for "s", "r" and "a" formats
14916 if the precision is set */
14917 if (arg->ch == 's' || arg->ch == 'r' || arg->ch == 'a') {
14918 if (arg->prec >= 0 && len > arg->prec)
14919 len = arg->prec;
14920 }
14921
14922 /* Adjust sign and width */
14923 kind = PyUnicode_KIND(str);
14924 pbuf = PyUnicode_DATA(str);
14925 pindex = 0;
14926 signchar = '\0';
14927 if (arg->sign) {
14928 Py_UCS4 ch = PyUnicode_READ(kind, pbuf, pindex);
14929 if (ch == '-' || ch == '+') {
14930 signchar = ch;
14931 len--;
14932 pindex++;
14933 }
14934 else if (arg->flags & F_SIGN)
14935 signchar = '+';
14936 else if (arg->flags & F_BLANK)
14937 signchar = ' ';
14938 else
14939 arg->sign = 0;
14940 }
14941 if (arg->width < len)
14942 arg->width = len;
14943
14944 /* Prepare the writer */
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014945 maxchar = writer->maxchar;
Victor Stinnera47082312012-10-04 02:19:54 +020014946 if (!(arg->flags & F_LJUST)) {
14947 if (arg->sign) {
14948 if ((arg->width-1) > len)
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014949 maxchar = Py_MAX(maxchar, fill);
Victor Stinnera47082312012-10-04 02:19:54 +020014950 }
14951 else {
14952 if (arg->width > len)
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014953 maxchar = Py_MAX(maxchar, fill);
Victor Stinnera47082312012-10-04 02:19:54 +020014954 }
14955 }
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014956 if (PyUnicode_MAX_CHAR_VALUE(str) > maxchar) {
14957 Py_UCS4 strmaxchar = _PyUnicode_FindMaxChar(str, 0, pindex+len);
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014958 maxchar = Py_MAX(maxchar, strmaxchar);
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014959 }
14960
Victor Stinnera47082312012-10-04 02:19:54 +020014961 buflen = arg->width;
14962 if (arg->sign && len == arg->width)
14963 buflen++;
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014964 if (_PyUnicodeWriter_Prepare(writer, buflen, maxchar) == -1)
Victor Stinnera47082312012-10-04 02:19:54 +020014965 return -1;
14966
14967 /* Write the sign if needed */
14968 if (arg->sign) {
14969 if (fill != ' ') {
14970 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
14971 writer->pos += 1;
14972 }
14973 if (arg->width > len)
14974 arg->width--;
14975 }
14976
14977 /* Write the numeric prefix for "x", "X" and "o" formats
14978 if the alternate form is used.
14979 For example, write "0x" for the "%#x" format. */
14980 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
14981 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
14982 assert(PyUnicode_READ(kind, pbuf, pindex + 1) == arg->ch);
14983 if (fill != ' ') {
14984 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
14985 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
14986 writer->pos += 2;
14987 pindex += 2;
14988 }
14989 arg->width -= 2;
14990 if (arg->width < 0)
14991 arg->width = 0;
14992 len -= 2;
14993 }
14994
14995 /* Pad left with the fill character if needed */
14996 if (arg->width > len && !(arg->flags & F_LJUST)) {
14997 sublen = arg->width - len;
Victor Stinner59423e32018-11-26 13:40:01 +010014998 unicode_fill(writer->kind, writer->data, fill, writer->pos, sublen);
Victor Stinnera47082312012-10-04 02:19:54 +020014999 writer->pos += sublen;
15000 arg->width = len;
15001 }
15002
15003 /* If padding with spaces: write sign if needed and/or numeric prefix if
15004 the alternate form is used */
15005 if (fill == ' ') {
15006 if (arg->sign) {
15007 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
15008 writer->pos += 1;
15009 }
15010 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
15011 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
15012 assert(PyUnicode_READ(kind, pbuf, pindex+1) == arg->ch);
15013 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
15014 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
15015 writer->pos += 2;
15016 pindex += 2;
15017 }
15018 }
15019
15020 /* Write characters */
15021 if (len) {
15022 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
15023 str, pindex, len);
15024 writer->pos += len;
15025 }
15026
15027 /* Pad right with the fill character if needed */
15028 if (arg->width > len) {
15029 sublen = arg->width - len;
Victor Stinner59423e32018-11-26 13:40:01 +010015030 unicode_fill(writer->kind, writer->data, ' ', writer->pos, sublen);
Victor Stinnera47082312012-10-04 02:19:54 +020015031 writer->pos += sublen;
15032 }
15033 return 0;
15034}
15035
15036/* Helper of PyUnicode_Format(): format one arg.
15037 Return 0 on success, raise an exception and return -1 on error. */
15038static int
15039unicode_format_arg(struct unicode_formatter_t *ctx)
15040{
15041 struct unicode_format_arg_t arg;
15042 PyObject *str;
15043 int ret;
15044
Victor Stinner8dbd4212012-12-04 09:30:24 +010015045 arg.ch = PyUnicode_READ(ctx->fmtkind, ctx->fmtdata, ctx->fmtpos);
Serhiy Storchaka9f8ad3f2017-03-08 05:51:19 +020015046 if (arg.ch == '%') {
15047 ctx->fmtpos++;
15048 ctx->fmtcnt--;
15049 if (_PyUnicodeWriter_WriteCharInline(&ctx->writer, '%') < 0)
15050 return -1;
15051 return 0;
15052 }
Victor Stinner8dbd4212012-12-04 09:30:24 +010015053 arg.flags = 0;
15054 arg.width = -1;
15055 arg.prec = -1;
15056 arg.sign = 0;
15057 str = NULL;
15058
Victor Stinnera47082312012-10-04 02:19:54 +020015059 ret = unicode_format_arg_parse(ctx, &arg);
15060 if (ret == -1)
15061 return -1;
15062
15063 ret = unicode_format_arg_format(ctx, &arg, &str);
15064 if (ret == -1)
15065 return -1;
15066
15067 if (ret != 1) {
15068 ret = unicode_format_arg_output(ctx, &arg, str);
15069 Py_DECREF(str);
15070 if (ret == -1)
15071 return -1;
15072 }
15073
Serhiy Storchaka9f8ad3f2017-03-08 05:51:19 +020015074 if (ctx->dict && (ctx->argidx < ctx->arglen)) {
Victor Stinnera47082312012-10-04 02:19:54 +020015075 PyErr_SetString(PyExc_TypeError,
15076 "not all arguments converted during string formatting");
15077 return -1;
15078 }
15079 return 0;
15080}
15081
Alexander Belopolsky40018472011-02-26 01:02:56 +000015082PyObject *
15083PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000015084{
Victor Stinnera47082312012-10-04 02:19:54 +020015085 struct unicode_formatter_t ctx;
Tim Petersced69f82003-09-16 20:30:58 +000015086
Guido van Rossumd57fd912000-03-10 22:53:23 +000015087 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000015088 PyErr_BadInternalCall();
15089 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015090 }
Victor Stinnera47082312012-10-04 02:19:54 +020015091
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030015092 if (ensure_unicode(format) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000015093 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030015094
15095 ctx.fmtstr = format;
Victor Stinnera47082312012-10-04 02:19:54 +020015096 ctx.fmtdata = PyUnicode_DATA(ctx.fmtstr);
15097 ctx.fmtkind = PyUnicode_KIND(ctx.fmtstr);
15098 ctx.fmtcnt = PyUnicode_GET_LENGTH(ctx.fmtstr);
15099 ctx.fmtpos = 0;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020015100
Victor Stinner8f674cc2013-04-17 23:02:17 +020015101 _PyUnicodeWriter_Init(&ctx.writer);
15102 ctx.writer.min_length = ctx.fmtcnt + 100;
15103 ctx.writer.overallocate = 1;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020015104
Guido van Rossumd57fd912000-03-10 22:53:23 +000015105 if (PyTuple_Check(args)) {
Victor Stinnera47082312012-10-04 02:19:54 +020015106 ctx.arglen = PyTuple_Size(args);
15107 ctx.argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015108 }
15109 else {
Victor Stinnera47082312012-10-04 02:19:54 +020015110 ctx.arglen = -1;
15111 ctx.argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015112 }
Victor Stinnera47082312012-10-04 02:19:54 +020015113 ctx.args_owned = 0;
Benjamin Peterson28a6cfa2012-08-28 17:55:35 -040015114 if (PyMapping_Check(args) && !PyTuple_Check(args) && !PyUnicode_Check(args))
Victor Stinnera47082312012-10-04 02:19:54 +020015115 ctx.dict = args;
15116 else
15117 ctx.dict = NULL;
15118 ctx.args = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015119
Victor Stinnera47082312012-10-04 02:19:54 +020015120 while (--ctx.fmtcnt >= 0) {
15121 if (PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
Victor Stinnercfc4c132013-04-03 01:48:39 +020015122 Py_ssize_t nonfmtpos;
Victor Stinnera47082312012-10-04 02:19:54 +020015123
15124 nonfmtpos = ctx.fmtpos++;
15125 while (ctx.fmtcnt >= 0 &&
15126 PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
15127 ctx.fmtpos++;
15128 ctx.fmtcnt--;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015129 }
Victor Stinnera47082312012-10-04 02:19:54 +020015130 if (ctx.fmtcnt < 0) {
15131 ctx.fmtpos--;
15132 ctx.writer.overallocate = 0;
Victor Stinnera0494432012-10-03 23:03:46 +020015133 }
Victor Stinneree4544c2012-05-09 22:24:08 +020015134
Victor Stinnercfc4c132013-04-03 01:48:39 +020015135 if (_PyUnicodeWriter_WriteSubstring(&ctx.writer, ctx.fmtstr,
15136 nonfmtpos, ctx.fmtpos) < 0)
15137 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015138 }
15139 else {
Victor Stinnera47082312012-10-04 02:19:54 +020015140 ctx.fmtpos++;
15141 if (unicode_format_arg(&ctx) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000015142 goto onError;
Victor Stinnera47082312012-10-04 02:19:54 +020015143 }
15144 }
Victor Stinneraff3cc62012-04-30 05:19:21 +020015145
Victor Stinnera47082312012-10-04 02:19:54 +020015146 if (ctx.argidx < ctx.arglen && !ctx.dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000015147 PyErr_SetString(PyExc_TypeError,
15148 "not all arguments converted during string formatting");
15149 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015150 }
15151
Victor Stinnera47082312012-10-04 02:19:54 +020015152 if (ctx.args_owned) {
15153 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000015154 }
Victor Stinnera47082312012-10-04 02:19:54 +020015155 return _PyUnicodeWriter_Finish(&ctx.writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +000015156
Benjamin Peterson29060642009-01-31 22:14:21 +000015157 onError:
Victor Stinnera47082312012-10-04 02:19:54 +020015158 _PyUnicodeWriter_Dealloc(&ctx.writer);
15159 if (ctx.args_owned) {
15160 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000015161 }
15162 return NULL;
15163}
15164
Jeremy Hylton938ace62002-07-17 16:30:39 +000015165static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000015166unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
15167
Tim Peters6d6c1a32001-08-02 04:15:00 +000015168static PyObject *
15169unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
15170{
Benjamin Peterson29060642009-01-31 22:14:21 +000015171 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015172 static char *kwlist[] = {"object", "encoding", "errors", 0};
15173 char *encoding = NULL;
15174 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000015175
Benjamin Peterson14339b62009-01-31 16:36:08 +000015176 if (type != &PyUnicode_Type)
15177 return unicode_subtype_new(type, args, kwds);
15178 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000015179 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000015180 return NULL;
15181 if (x == NULL)
Serhiy Storchaka678db842013-01-26 12:16:36 +020015182 _Py_RETURN_UNICODE_EMPTY();
Benjamin Peterson14339b62009-01-31 16:36:08 +000015183 if (encoding == NULL && errors == NULL)
15184 return PyObject_Str(x);
15185 else
Benjamin Peterson29060642009-01-31 22:14:21 +000015186 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000015187}
15188
Guido van Rossume023fe02001-08-30 03:12:59 +000015189static PyObject *
15190unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
15191{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015192 PyObject *unicode, *self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015193 Py_ssize_t length, char_size;
15194 int share_wstr, share_utf8;
15195 unsigned int kind;
15196 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000015197
Benjamin Peterson14339b62009-01-31 16:36:08 +000015198 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015199
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015200 unicode = unicode_new(&PyUnicode_Type, args, kwds);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015201 if (unicode == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000015202 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020015203 assert(_PyUnicode_CHECK(unicode));
Benjamin Petersonbac79492012-01-14 13:34:47 -050015204 if (PyUnicode_READY(unicode) == -1) {
Benjamin Peterson22a29702012-01-02 09:00:30 -060015205 Py_DECREF(unicode);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015206 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060015207 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015208
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015209 self = type->tp_alloc(type, 0);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015210 if (self == NULL) {
15211 Py_DECREF(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015212 return NULL;
15213 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015214 kind = PyUnicode_KIND(unicode);
15215 length = PyUnicode_GET_LENGTH(unicode);
15216
15217 _PyUnicode_LENGTH(self) = length;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020015218#ifdef Py_DEBUG
15219 _PyUnicode_HASH(self) = -1;
15220#else
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015221 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020015222#endif
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015223 _PyUnicode_STATE(self).interned = 0;
15224 _PyUnicode_STATE(self).kind = kind;
15225 _PyUnicode_STATE(self).compact = 0;
Victor Stinner3cf46372011-10-03 14:42:15 +020015226 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015227 _PyUnicode_STATE(self).ready = 1;
15228 _PyUnicode_WSTR(self) = NULL;
15229 _PyUnicode_UTF8_LENGTH(self) = 0;
15230 _PyUnicode_UTF8(self) = NULL;
15231 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020015232 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015233
15234 share_utf8 = 0;
15235 share_wstr = 0;
15236 if (kind == PyUnicode_1BYTE_KIND) {
15237 char_size = 1;
15238 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
15239 share_utf8 = 1;
15240 }
15241 else if (kind == PyUnicode_2BYTE_KIND) {
15242 char_size = 2;
15243 if (sizeof(wchar_t) == 2)
15244 share_wstr = 1;
15245 }
15246 else {
15247 assert(kind == PyUnicode_4BYTE_KIND);
15248 char_size = 4;
15249 if (sizeof(wchar_t) == 4)
15250 share_wstr = 1;
15251 }
15252
15253 /* Ensure we won't overflow the length. */
15254 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
15255 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015256 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015257 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015258 data = PyObject_MALLOC((length + 1) * char_size);
15259 if (data == NULL) {
15260 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015261 goto onError;
15262 }
15263
Victor Stinnerc3c74152011-10-02 20:39:55 +020015264 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015265 if (share_utf8) {
15266 _PyUnicode_UTF8_LENGTH(self) = length;
15267 _PyUnicode_UTF8(self) = data;
15268 }
15269 if (share_wstr) {
15270 _PyUnicode_WSTR_LENGTH(self) = length;
15271 _PyUnicode_WSTR(self) = (wchar_t *)data;
15272 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015273
Christian Heimesf051e432016-09-13 20:22:02 +020015274 memcpy(data, PyUnicode_DATA(unicode),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020015275 kind * (length + 1));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020015276 assert(_PyUnicode_CheckConsistency(self, 1));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020015277#ifdef Py_DEBUG
15278 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
15279#endif
Victor Stinnerdd18d3a2011-10-22 11:08:10 +020015280 Py_DECREF(unicode);
Victor Stinner7931d9a2011-11-04 00:22:48 +010015281 return self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015282
15283onError:
15284 Py_DECREF(unicode);
15285 Py_DECREF(self);
15286 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000015287}
15288
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000015289PyDoc_STRVAR(unicode_doc,
Chris Jerdonek83fe2e12012-10-07 14:48:36 -070015290"str(object='') -> str\n\
15291str(bytes_or_buffer[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +000015292\n\
Nick Coghlan573b1fd2012-08-16 14:13:07 +100015293Create a new string object from the given object. If encoding or\n\
15294errors is specified, then the object must expose a data buffer\n\
15295that will be decoded using the given encoding and error handler.\n\
15296Otherwise, returns the result of object.__str__() (if defined)\n\
15297or repr(object).\n\
15298encoding defaults to sys.getdefaultencoding().\n\
15299errors defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000015300
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015301static PyObject *unicode_iter(PyObject *seq);
15302
Guido van Rossumd57fd912000-03-10 22:53:23 +000015303PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +000015304 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Bupfc93bd42018-06-19 03:59:55 -050015305 "str", /* tp_name */
15306 sizeof(PyUnicodeObject), /* tp_basicsize */
15307 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +000015308 /* Slots */
Bupfc93bd42018-06-19 03:59:55 -050015309 (destructor)unicode_dealloc, /* tp_dealloc */
Jeroen Demeyer530f5062019-05-31 04:13:39 +020015310 0, /* tp_vectorcall_offset */
Bupfc93bd42018-06-19 03:59:55 -050015311 0, /* tp_getattr */
15312 0, /* tp_setattr */
Jeroen Demeyer530f5062019-05-31 04:13:39 +020015313 0, /* tp_as_async */
Bupfc93bd42018-06-19 03:59:55 -050015314 unicode_repr, /* tp_repr */
15315 &unicode_as_number, /* tp_as_number */
15316 &unicode_as_sequence, /* tp_as_sequence */
15317 &unicode_as_mapping, /* tp_as_mapping */
15318 (hashfunc) unicode_hash, /* tp_hash*/
15319 0, /* tp_call*/
15320 (reprfunc) unicode_str, /* tp_str */
15321 PyObject_GenericGetAttr, /* tp_getattro */
15322 0, /* tp_setattro */
15323 0, /* tp_as_buffer */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015324 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Bupfc93bd42018-06-19 03:59:55 -050015325 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
15326 unicode_doc, /* tp_doc */
15327 0, /* tp_traverse */
15328 0, /* tp_clear */
15329 PyUnicode_RichCompare, /* tp_richcompare */
15330 0, /* tp_weaklistoffset */
15331 unicode_iter, /* tp_iter */
15332 0, /* tp_iternext */
15333 unicode_methods, /* tp_methods */
15334 0, /* tp_members */
15335 0, /* tp_getset */
15336 &PyBaseObject_Type, /* tp_base */
15337 0, /* tp_dict */
15338 0, /* tp_descr_get */
15339 0, /* tp_descr_set */
15340 0, /* tp_dictoffset */
15341 0, /* tp_init */
15342 0, /* tp_alloc */
15343 unicode_new, /* tp_new */
15344 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +000015345};
15346
15347/* Initialize the Unicode implementation */
15348
Victor Stinner331a6a52019-05-27 16:39:22 +020015349PyStatus
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015350_PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000015351{
Thomas Wouters477c8d52006-05-27 19:21:47 +000015352 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015353 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000015354 0x000A, /* LINE FEED */
15355 0x000D, /* CARRIAGE RETURN */
15356 0x001C, /* FILE SEPARATOR */
15357 0x001D, /* GROUP SEPARATOR */
15358 0x001E, /* RECORD SEPARATOR */
15359 0x0085, /* NEXT LINE */
15360 0x2028, /* LINE SEPARATOR */
15361 0x2029, /* PARAGRAPH SEPARATOR */
15362 };
15363
Fred Drakee4315f52000-05-09 19:53:39 +000015364 /* Init the implementation */
Serhiy Storchaka678db842013-01-26 12:16:36 +020015365 _Py_INCREF_UNICODE_EMPTY();
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015366 if (!unicode_empty) {
Victor Stinner331a6a52019-05-27 16:39:22 +020015367 return _PyStatus_ERR("Can't create empty string");
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015368 }
Serhiy Storchaka678db842013-01-26 12:16:36 +020015369 Py_DECREF(unicode_empty);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000015370
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015371 if (PyType_Ready(&PyUnicode_Type) < 0) {
Victor Stinner331a6a52019-05-27 16:39:22 +020015372 return _PyStatus_ERR("Can't initialize unicode type");
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015373 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000015374
15375 /* initialize the linebreak bloom filter */
15376 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015377 PyUnicode_2BYTE_KIND, linebreak,
Victor Stinner63941882011-09-29 00:42:28 +020015378 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000015379
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015380 if (PyType_Ready(&EncodingMapType) < 0) {
Victor Stinner331a6a52019-05-27 16:39:22 +020015381 return _PyStatus_ERR("Can't initialize encoding map type");
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015382 }
15383 if (PyType_Ready(&PyFieldNameIter_Type) < 0) {
Victor Stinner331a6a52019-05-27 16:39:22 +020015384 return _PyStatus_ERR("Can't initialize field name iterator type");
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015385 }
15386 if (PyType_Ready(&PyFormatterIter_Type) < 0) {
Victor Stinner331a6a52019-05-27 16:39:22 +020015387 return _PyStatus_ERR("Can't initialize formatter iter type");
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015388 }
Victor Stinner331a6a52019-05-27 16:39:22 +020015389 return _PyStatus_OK();
Guido van Rossumd57fd912000-03-10 22:53:23 +000015390}
15391
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000015392
Walter Dörwald16807132007-05-25 13:52:07 +000015393void
15394PyUnicode_InternInPlace(PyObject **p)
15395{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020015396 PyObject *s = *p;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015397 PyObject *t;
Victor Stinner4fae54c2011-10-03 02:01:52 +020015398#ifdef Py_DEBUG
15399 assert(s != NULL);
15400 assert(_PyUnicode_CHECK(s));
15401#else
Benjamin Peterson14339b62009-01-31 16:36:08 +000015402 if (s == NULL || !PyUnicode_Check(s))
Victor Stinner4fae54c2011-10-03 02:01:52 +020015403 return;
15404#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +000015405 /* If it's a subclass, we don't really know what putting
15406 it in the interned dict might do. */
15407 if (!PyUnicode_CheckExact(s))
15408 return;
15409 if (PyUnicode_CHECK_INTERNED(s))
15410 return;
15411 if (interned == NULL) {
15412 interned = PyDict_New();
15413 if (interned == NULL) {
15414 PyErr_Clear(); /* Don't leave an exception */
15415 return;
15416 }
15417 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000015418 Py_ALLOW_RECURSION
Berker Peksagced8d4c2016-07-25 04:40:39 +030015419 t = PyDict_SetDefault(interned, s, s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015420 Py_END_ALLOW_RECURSION
Berker Peksagced8d4c2016-07-25 04:40:39 +030015421 if (t == NULL) {
15422 PyErr_Clear();
15423 return;
15424 }
15425 if (t != s) {
Victor Stinnerf0335102013-04-14 19:13:03 +020015426 Py_INCREF(t);
Serhiy Storchaka57a01d32016-04-10 18:05:40 +030015427 Py_SETREF(*p, t);
Victor Stinnerf0335102013-04-14 19:13:03 +020015428 return;
15429 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000015430 /* The two references in interned are not counted by refcnt.
15431 The deallocator will take care of this */
Victor Stinnerc86a1122020-02-07 01:24:29 +010015432 Py_SET_REFCNT(s, Py_REFCNT(s) - 2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015433 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000015434}
15435
15436void
15437PyUnicode_InternImmortal(PyObject **p)
15438{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015439 PyUnicode_InternInPlace(p);
15440 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Victor Stinneraf9e4b82011-10-23 20:07:00 +020015441 _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015442 Py_INCREF(*p);
15443 }
Walter Dörwald16807132007-05-25 13:52:07 +000015444}
15445
15446PyObject *
15447PyUnicode_InternFromString(const char *cp)
15448{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015449 PyObject *s = PyUnicode_FromString(cp);
15450 if (s == NULL)
15451 return NULL;
15452 PyUnicode_InternInPlace(&s);
15453 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000015454}
15455
Victor Stinnerfecc4f22019-03-19 14:20:29 +010015456
#if defined(WITH_VALGRIND) || defined(__INSURE__)
/* Release the dict of interned strings (only built for leak detectors).

   Every interned string gets back the references that interning did not
   count (one for immortal strings, two for mortal ones) and is flagged as
   not interned; the dict is then cleared and dropped.  With INTERNED_STATS
   defined, the number and total length of the strings are printed to
   stderr. */
static void
unicode_release_interned(void)
{
    if (interned == NULL || !PyDict_Check(interned)) {
        return;
    }
    PyObject *keys = PyDict_Keys(interned);
    if (keys == NULL || !PyList_Check(keys)) {
        PyErr_Clear();
        /* Do not leak the object if it unexpectedly is not a list. */
        Py_XDECREF(keys);
        return;
    }

    /* Since unicode_release_interned() is intended to help a leak
       detector, interned unicode strings are not forcibly deallocated;
       rather, we give them their stolen references back, and then clear
       and DECREF the interned dict. */

    Py_ssize_t n = PyList_GET_SIZE(keys);
#ifdef INTERNED_STATS
    fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
            n);

    Py_ssize_t immortal_size = 0, mortal_size = 0;
#endif
    for (Py_ssize_t i = 0; i < n; i++) {
        PyObject *s = PyList_GET_ITEM(keys, i);
        if (PyUnicode_READY(s) == -1) {
            Py_UNREACHABLE();
        }
        switch (PyUnicode_CHECK_INTERNED(s)) {
        case SSTATE_INTERNED_IMMORTAL:
            /* Use the setter, as PyUnicode_InternInPlace() does, instead
               of assigning through Py_REFCNT(). */
            Py_SET_REFCNT(s, Py_REFCNT(s) + 1);
#ifdef INTERNED_STATS
            immortal_size += PyUnicode_GET_LENGTH(s);
#endif
            break;
        case SSTATE_INTERNED_MORTAL:
            Py_SET_REFCNT(s, Py_REFCNT(s) + 2);
#ifdef INTERNED_STATS
            mortal_size += PyUnicode_GET_LENGTH(s);
#endif
            break;
        case SSTATE_NOT_INTERNED:
            /* fall through */
        default:
            Py_UNREACHABLE();
        }
        _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
    }
#ifdef INTERNED_STATS
    fprintf(stderr, "total size of all interned strings: "
            "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
            "mortal/immortal\n", mortal_size, immortal_size);
#endif
    Py_DECREF(keys);
    PyDict_Clear(interned);
    Py_CLEAR(interned);
}
#endif
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015517
15518
15519/********************* Unicode Iterator **************************/
15520
15521typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015522 PyObject_HEAD
15523 Py_ssize_t it_index;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015524 PyObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015525} unicodeiterobject;
15526
15527static void
15528unicodeiter_dealloc(unicodeiterobject *it)
15529{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015530 _PyObject_GC_UNTRACK(it);
15531 Py_XDECREF(it->it_seq);
15532 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015533}
15534
15535static int
15536unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
15537{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015538 Py_VISIT(it->it_seq);
15539 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015540}
15541
15542static PyObject *
15543unicodeiter_next(unicodeiterobject *it)
15544{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015545 PyObject *seq, *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015546
Benjamin Peterson14339b62009-01-31 16:36:08 +000015547 assert(it != NULL);
15548 seq = it->it_seq;
15549 if (seq == NULL)
15550 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020015551 assert(_PyUnicode_CHECK(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015552
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015553 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
15554 int kind = PyUnicode_KIND(seq);
15555 void *data = PyUnicode_DATA(seq);
15556 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
15557 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015558 if (item != NULL)
15559 ++it->it_index;
15560 return item;
15561 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015562
Benjamin Peterson14339b62009-01-31 16:36:08 +000015563 it->it_seq = NULL;
Serhiy Storchakafbb1c5e2016-03-30 20:40:02 +030015564 Py_DECREF(seq);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015565 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015566}
15567
15568static PyObject *
Siddhesh Poyarekar55edd0c2018-04-30 00:29:33 +053015569unicodeiter_len(unicodeiterobject *it, PyObject *Py_UNUSED(ignored))
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015570{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015571 Py_ssize_t len = 0;
15572 if (it->it_seq)
Victor Stinnerc4f281e2011-10-11 22:11:42 +020015573 len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015574 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015575}
15576
15577PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
15578
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015579static PyObject *
Siddhesh Poyarekar55edd0c2018-04-30 00:29:33 +053015580unicodeiter_reduce(unicodeiterobject *it, PyObject *Py_UNUSED(ignored))
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015581{
Serhiy Storchakabb86bf42018-12-11 08:28:18 +020015582 _Py_IDENTIFIER(iter);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015583 if (it->it_seq != NULL) {
Serhiy Storchakabb86bf42018-12-11 08:28:18 +020015584 return Py_BuildValue("N(O)n", _PyEval_GetBuiltinId(&PyId_iter),
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015585 it->it_seq, it->it_index);
15586 } else {
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +020015587 PyObject *u = (PyObject *)_PyUnicode_New(0);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015588 if (u == NULL)
15589 return NULL;
Serhiy Storchakabb86bf42018-12-11 08:28:18 +020015590 return Py_BuildValue("N(N)", _PyEval_GetBuiltinId(&PyId_iter), u);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015591 }
15592}
15593
15594PyDoc_STRVAR(reduce_doc, "Return state information for pickling.");
15595
15596static PyObject *
15597unicodeiter_setstate(unicodeiterobject *it, PyObject *state)
15598{
15599 Py_ssize_t index = PyLong_AsSsize_t(state);
15600 if (index == -1 && PyErr_Occurred())
15601 return NULL;
Kristján Valur Jónsson25dded02014-03-05 13:47:57 +000015602 if (it->it_seq != NULL) {
15603 if (index < 0)
15604 index = 0;
15605 else if (index > PyUnicode_GET_LENGTH(it->it_seq))
15606 index = PyUnicode_GET_LENGTH(it->it_seq); /* iterator truncated */
15607 it->it_index = index;
15608 }
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015609 Py_RETURN_NONE;
15610}
15611
15612PyDoc_STRVAR(setstate_doc, "Set state information for unpickling.");
15613
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015614static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015615 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000015616 length_hint_doc},
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015617 {"__reduce__", (PyCFunction)unicodeiter_reduce, METH_NOARGS,
15618 reduce_doc},
15619 {"__setstate__", (PyCFunction)unicodeiter_setstate, METH_O,
15620 setstate_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000015621 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015622};
15623
15624PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015625 PyVarObject_HEAD_INIT(&PyType_Type, 0)
15626 "str_iterator", /* tp_name */
15627 sizeof(unicodeiterobject), /* tp_basicsize */
15628 0, /* tp_itemsize */
15629 /* methods */
15630 (destructor)unicodeiter_dealloc, /* tp_dealloc */
Jeroen Demeyer530f5062019-05-31 04:13:39 +020015631 0, /* tp_vectorcall_offset */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015632 0, /* tp_getattr */
15633 0, /* tp_setattr */
Jeroen Demeyer530f5062019-05-31 04:13:39 +020015634 0, /* tp_as_async */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015635 0, /* tp_repr */
15636 0, /* tp_as_number */
15637 0, /* tp_as_sequence */
15638 0, /* tp_as_mapping */
15639 0, /* tp_hash */
15640 0, /* tp_call */
15641 0, /* tp_str */
15642 PyObject_GenericGetAttr, /* tp_getattro */
15643 0, /* tp_setattro */
15644 0, /* tp_as_buffer */
15645 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
15646 0, /* tp_doc */
15647 (traverseproc)unicodeiter_traverse, /* tp_traverse */
15648 0, /* tp_clear */
15649 0, /* tp_richcompare */
15650 0, /* tp_weaklistoffset */
15651 PyObject_SelfIter, /* tp_iter */
15652 (iternextfunc)unicodeiter_next, /* tp_iternext */
15653 unicodeiter_methods, /* tp_methods */
15654 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015655};
15656
15657static PyObject *
15658unicode_iter(PyObject *seq)
15659{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015660 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015661
Benjamin Peterson14339b62009-01-31 16:36:08 +000015662 if (!PyUnicode_Check(seq)) {
15663 PyErr_BadInternalCall();
15664 return NULL;
15665 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015666 if (PyUnicode_READY(seq) == -1)
15667 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015668 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
15669 if (it == NULL)
15670 return NULL;
15671 it->it_index = 0;
15672 Py_INCREF(seq);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015673 it->it_seq = seq;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015674 _PyObject_GC_TRACK(it);
15675 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015676}
15677
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015678
15679size_t
15680Py_UNICODE_strlen(const Py_UNICODE *u)
15681{
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +020015682 return wcslen(u);
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015683}
15684
15685Py_UNICODE*
15686Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
15687{
15688 Py_UNICODE *u = s1;
15689 while ((*u++ = *s2++));
15690 return s1;
15691}
15692
15693Py_UNICODE*
15694Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
15695{
15696 Py_UNICODE *u = s1;
15697 while ((*u++ = *s2++))
15698 if (n-- == 0)
15699 break;
15700 return s1;
15701}
15702
15703Py_UNICODE*
15704Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
15705{
15706 Py_UNICODE *u1 = s1;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +020015707 u1 += wcslen(u1);
15708 while ((*u1++ = *s2++));
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015709 return s1;
15710}
15711
15712int
15713Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
15714{
15715 while (*s1 && *s2 && *s1 == *s2)
15716 s1++, s2++;
15717 if (*s1 && *s2)
15718 return (*s1 < *s2) ? -1 : +1;
15719 if (*s1)
15720 return 1;
15721 if (*s2)
15722 return -1;
15723 return 0;
15724}
15725
15726int
15727Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
15728{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020015729 Py_UNICODE u1, u2;
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015730 for (; n != 0; n--) {
15731 u1 = *s1;
15732 u2 = *s2;
15733 if (u1 != u2)
15734 return (u1 < u2) ? -1 : +1;
15735 if (u1 == '\0')
15736 return 0;
15737 s1++;
15738 s2++;
15739 }
15740 return 0;
15741}
15742
15743Py_UNICODE*
15744Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
15745{
15746 const Py_UNICODE *p;
15747 for (p = s; *p; p++)
15748 if (*p == c)
15749 return (Py_UNICODE*)p;
15750 return NULL;
15751}
15752
15753Py_UNICODE*
15754Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
15755{
15756 const Py_UNICODE *p;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +020015757 p = s + wcslen(s);
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015758 while (p != s) {
15759 p--;
15760 if (*p == c)
15761 return (Py_UNICODE*)p;
15762 }
15763 return NULL;
15764}
Victor Stinner331ea922010-08-10 16:37:20 +000015765
Victor Stinner71133ff2010-09-01 23:43:53 +000015766Py_UNICODE*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015767PyUnicode_AsUnicodeCopy(PyObject *unicode)
Victor Stinner71133ff2010-09-01 23:43:53 +000015768{
Victor Stinner577db2c2011-10-11 22:12:48 +020015769 Py_UNICODE *u, *copy;
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015770 Py_ssize_t len, size;
Victor Stinner71133ff2010-09-01 23:43:53 +000015771
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015772 if (!PyUnicode_Check(unicode)) {
15773 PyErr_BadArgument();
15774 return NULL;
15775 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015776 u = PyUnicode_AsUnicodeAndSize(unicode, &len);
Victor Stinner577db2c2011-10-11 22:12:48 +020015777 if (u == NULL)
15778 return NULL;
Victor Stinner71133ff2010-09-01 23:43:53 +000015779 /* Ensure we won't overflow the size. */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -070015780 if (len > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
Victor Stinner71133ff2010-09-01 23:43:53 +000015781 PyErr_NoMemory();
15782 return NULL;
15783 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015784 size = len + 1; /* copy the null character */
Victor Stinner71133ff2010-09-01 23:43:53 +000015785 size *= sizeof(Py_UNICODE);
15786 copy = PyMem_Malloc(size);
15787 if (copy == NULL) {
15788 PyErr_NoMemory();
15789 return NULL;
15790 }
Victor Stinner577db2c2011-10-11 22:12:48 +020015791 memcpy(copy, u, size);
Victor Stinner71133ff2010-09-01 23:43:53 +000015792 return copy;
15793}
Martin v. Löwis5b222132007-06-10 09:51:05 +000015794
Victor Stinnerfecc4f22019-03-19 14:20:29 +010015795
Victor Stinner709d23d2019-05-02 14:56:30 -040015796static int
15797encode_wstr_utf8(wchar_t *wstr, char **str, const char *name)
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015798{
Victor Stinner709d23d2019-05-02 14:56:30 -040015799 int res;
15800 res = _Py_EncodeUTF8Ex(wstr, str, NULL, NULL, 1, _Py_ERROR_STRICT);
15801 if (res == -2) {
15802 PyErr_Format(PyExc_RuntimeWarning, "cannot decode %s", name);
15803 return -1;
15804 }
15805 if (res < 0) {
15806 PyErr_NoMemory();
15807 return -1;
15808 }
15809 return 0;
15810}
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015811
Victor Stinner709d23d2019-05-02 14:56:30 -040015812
15813static int
15814config_get_codec_name(wchar_t **config_encoding)
15815{
15816 char *encoding;
15817 if (encode_wstr_utf8(*config_encoding, &encoding, "stdio_encoding") < 0) {
15818 return -1;
15819 }
15820
15821 PyObject *name_obj = NULL;
15822 PyObject *codec = _PyCodec_Lookup(encoding);
15823 PyMem_RawFree(encoding);
15824
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015825 if (!codec)
15826 goto error;
15827
15828 name_obj = PyObject_GetAttrString(codec, "name");
15829 Py_CLEAR(codec);
15830 if (!name_obj) {
15831 goto error;
15832 }
15833
Victor Stinner709d23d2019-05-02 14:56:30 -040015834 wchar_t *wname = PyUnicode_AsWideCharString(name_obj, NULL);
15835 Py_DECREF(name_obj);
15836 if (wname == NULL) {
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015837 goto error;
15838 }
15839
Victor Stinner709d23d2019-05-02 14:56:30 -040015840 wchar_t *raw_wname = _PyMem_RawWcsdup(wname);
15841 if (raw_wname == NULL) {
15842 PyMem_Free(wname);
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015843 PyErr_NoMemory();
Victor Stinner709d23d2019-05-02 14:56:30 -040015844 goto error;
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015845 }
Victor Stinner709d23d2019-05-02 14:56:30 -040015846
15847 PyMem_RawFree(*config_encoding);
15848 *config_encoding = raw_wname;
15849
15850 PyMem_Free(wname);
15851 return 0;
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015852
15853error:
15854 Py_XDECREF(codec);
15855 Py_XDECREF(name_obj);
Victor Stinner709d23d2019-05-02 14:56:30 -040015856 return -1;
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015857}
15858
15859
Victor Stinner331a6a52019-05-27 16:39:22 +020015860static PyStatus
Victor Stinnerfcdb0272019-09-23 14:45:47 +020015861init_stdio_encoding(PyThreadState *tstate)
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015862{
Victor Stinner709d23d2019-05-02 14:56:30 -040015863 /* Update the stdio encoding to the normalized Python codec name. */
Victor Stinnerfcdb0272019-09-23 14:45:47 +020015864 PyConfig *config = &tstate->interp->config;
Victor Stinner709d23d2019-05-02 14:56:30 -040015865 if (config_get_codec_name(&config->stdio_encoding) < 0) {
Victor Stinner331a6a52019-05-27 16:39:22 +020015866 return _PyStatus_ERR("failed to get the Python codec name "
Victor Stinnerfcdb0272019-09-23 14:45:47 +020015867 "of the stdio encoding");
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015868 }
Victor Stinner331a6a52019-05-27 16:39:22 +020015869 return _PyStatus_OK();
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015870}
15871
15872
Victor Stinner709d23d2019-05-02 14:56:30 -040015873static int
15874init_fs_codec(PyInterpreterState *interp)
15875{
Victor Stinner331a6a52019-05-27 16:39:22 +020015876 PyConfig *config = &interp->config;
Victor Stinner709d23d2019-05-02 14:56:30 -040015877
15878 _Py_error_handler error_handler;
15879 error_handler = get_error_handler_wide(config->filesystem_errors);
15880 if (error_handler == _Py_ERROR_UNKNOWN) {
15881 PyErr_SetString(PyExc_RuntimeError, "unknow filesystem error handler");
15882 return -1;
15883 }
15884
15885 char *encoding, *errors;
15886 if (encode_wstr_utf8(config->filesystem_encoding,
15887 &encoding,
15888 "filesystem_encoding") < 0) {
15889 return -1;
15890 }
15891
15892 if (encode_wstr_utf8(config->filesystem_errors,
15893 &errors,
15894 "filesystem_errors") < 0) {
15895 PyMem_RawFree(encoding);
15896 return -1;
15897 }
15898
15899 PyMem_RawFree(interp->fs_codec.encoding);
15900 interp->fs_codec.encoding = encoding;
Victor Stinnerbf305cc2020-02-05 17:39:57 +010015901 /* encoding has been normalized by init_fs_encoding() */
15902 interp->fs_codec.utf8 = (strcmp(encoding, "utf-8") == 0);
Victor Stinner709d23d2019-05-02 14:56:30 -040015903 PyMem_RawFree(interp->fs_codec.errors);
15904 interp->fs_codec.errors = errors;
15905 interp->fs_codec.error_handler = error_handler;
15906
Victor Stinnerbf305cc2020-02-05 17:39:57 +010015907#ifdef _Py_FORCE_UTF8_FS_ENCODING
15908 assert(interp->fs_codec.utf8 == 1);
15909#endif
15910
Victor Stinner709d23d2019-05-02 14:56:30 -040015911 /* At this point, PyUnicode_EncodeFSDefault() and
15912 PyUnicode_DecodeFSDefault() can now use the Python codec rather than
15913 the C implementation of the filesystem encoding. */
15914
15915 /* Set Py_FileSystemDefaultEncoding and Py_FileSystemDefaultEncodeErrors
15916 global configuration variables. */
15917 if (_Py_SetFileSystemEncoding(interp->fs_codec.encoding,
15918 interp->fs_codec.errors) < 0) {
15919 PyErr_NoMemory();
15920 return -1;
15921 }
15922 return 0;
15923}
15924
15925
Victor Stinner331a6a52019-05-27 16:39:22 +020015926static PyStatus
Victor Stinnerfcdb0272019-09-23 14:45:47 +020015927init_fs_encoding(PyThreadState *tstate)
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015928{
Victor Stinnerfcdb0272019-09-23 14:45:47 +020015929 PyInterpreterState *interp = tstate->interp;
15930
Victor Stinner709d23d2019-05-02 14:56:30 -040015931 /* Update the filesystem encoding to the normalized Python codec name.
15932 For example, replace "ANSI_X3.4-1968" (locale encoding) with "ascii"
15933 (Python codec name). */
Victor Stinner331a6a52019-05-27 16:39:22 +020015934 PyConfig *config = &interp->config;
Victor Stinner709d23d2019-05-02 14:56:30 -040015935 if (config_get_codec_name(&config->filesystem_encoding) < 0) {
Victor Stinnerfcdb0272019-09-23 14:45:47 +020015936 _Py_DumpPathConfig(tstate);
Victor Stinner331a6a52019-05-27 16:39:22 +020015937 return _PyStatus_ERR("failed to get the Python codec "
Victor Stinnerfcdb0272019-09-23 14:45:47 +020015938 "of the filesystem encoding");
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015939 }
15940
Victor Stinner709d23d2019-05-02 14:56:30 -040015941 if (init_fs_codec(interp) < 0) {
Victor Stinner331a6a52019-05-27 16:39:22 +020015942 return _PyStatus_ERR("cannot initialize filesystem codec");
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015943 }
Victor Stinner331a6a52019-05-27 16:39:22 +020015944 return _PyStatus_OK();
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015945}
15946
15947
Victor Stinner331a6a52019-05-27 16:39:22 +020015948PyStatus
Victor Stinnerb45d2592019-06-20 00:05:23 +020015949_PyUnicode_InitEncodings(PyThreadState *tstate)
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015950{
Victor Stinnerfcdb0272019-09-23 14:45:47 +020015951 PyStatus status = init_fs_encoding(tstate);
Victor Stinner331a6a52019-05-27 16:39:22 +020015952 if (_PyStatus_EXCEPTION(status)) {
15953 return status;
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015954 }
15955
Victor Stinnerfcdb0272019-09-23 14:45:47 +020015956 return init_stdio_encoding(tstate);
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015957}
15958
15959
Victor Stinnerbf305cc2020-02-05 17:39:57 +010015960static void
15961_PyUnicode_FiniEncodings(PyThreadState *tstate)
15962{
15963 PyInterpreterState *interp = tstate->interp;
15964 PyMem_RawFree(interp->fs_codec.encoding);
15965 interp->fs_codec.encoding = NULL;
15966 interp->fs_codec.utf8 = 0;
15967 PyMem_RawFree(interp->fs_codec.errors);
15968 interp->fs_codec.errors = NULL;
15969 interp->fs_codec.error_handler = _Py_ERROR_UNKNOWN;
15970}
15971
15972
#ifdef MS_WINDOWS
/* Switch the filesystem encoding to mbcs/replace (PEP 529) and reinitialize
   the filesystem codec with init_fs_codec().

   Return -1 with MemoryError set if a configuration string cannot be
   duplicated; otherwise return the result of init_fs_codec(). */
int
_PyUnicode_EnableLegacyWindowsFSEncoding(void)
{
    PyInterpreterState *interp = _PyInterpreterState_GET_UNSAFE();
    PyConfig *cfg = &interp->config;

    wchar_t *new_encoding = _PyMem_RawWcsdup(L"mbcs");
    wchar_t *new_errors = _PyMem_RawWcsdup(L"replace");

    if (new_encoding != NULL && new_errors != NULL) {
        /* Both copies succeeded: replace the configured strings. */
        PyMem_RawFree(cfg->filesystem_encoding);
        cfg->filesystem_encoding = new_encoding;
        PyMem_RawFree(cfg->filesystem_errors);
        cfg->filesystem_errors = new_errors;

        return init_fs_codec(interp);
    }

    /* At least one copy failed: drop whichever one succeeded. */
    PyMem_RawFree(new_encoding);
    PyMem_RawFree(new_errors);
    PyErr_NoMemory();
    return -1;
}
#endif
15998
15999
Victor Stinnerfecc4f22019-03-19 14:20:29 +010016000void
Victor Stinner3d483342019-11-22 12:27:50 +010016001_PyUnicode_Fini(PyThreadState *tstate)
Victor Stinnerfecc4f22019-03-19 14:20:29 +010016002{
Victor Stinner3d483342019-11-22 12:27:50 +010016003 if (_Py_IsMainInterpreter(tstate)) {
Victor Stinnerfecc4f22019-03-19 14:20:29 +010016004#if defined(WITH_VALGRIND) || defined(__INSURE__)
Victor Stinner3d483342019-11-22 12:27:50 +010016005 /* Insure++ is a memory analysis tool that aids in discovering
16006 * memory leaks and other memory problems. On Python exit, the
16007 * interned string dictionaries are flagged as being in use at exit
16008 * (which it is). Under normal circumstances, this is fine because
16009 * the memory will be automatically reclaimed by the system. Under
16010 * memory debugging, it's a huge source of useless noise, so we
16011 * trade off slower shutdown for less distraction in the memory
16012 * reports. -baw
16013 */
16014 unicode_release_interned();
Victor Stinnerfecc4f22019-03-19 14:20:29 +010016015#endif /* __INSURE__ */
16016
Victor Stinner3d483342019-11-22 12:27:50 +010016017 Py_CLEAR(unicode_empty);
Victor Stinnerfecc4f22019-03-19 14:20:29 +010016018
Victor Stinner3d483342019-11-22 12:27:50 +010016019 for (Py_ssize_t i = 0; i < 256; i++) {
16020 Py_CLEAR(unicode_latin1[i]);
16021 }
16022 _PyUnicode_ClearStaticStrings();
Victor Stinnerfecc4f22019-03-19 14:20:29 +010016023 }
Victor Stinner709d23d2019-05-02 14:56:30 -040016024
Victor Stinnerbf305cc2020-02-05 17:39:57 +010016025 _PyUnicode_FiniEncodings(tstate);
Victor Stinnerfecc4f22019-03-19 14:20:29 +010016026}
16027
16028
Georg Brandl66c221e2010-10-14 07:04:07 +000016029/* A _string module, to export formatter_parser and formatter_field_name_split
16030 to the string.Formatter class implemented in Python. */
16031
16032static PyMethodDef _string_methods[] = {
16033 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
16034 METH_O, PyDoc_STR("split the argument as a field name")},
16035 {"formatter_parser", (PyCFunction) formatter_parser,
16036 METH_O, PyDoc_STR("parse the argument as a format string")},
16037 {NULL, NULL}
16038};
16039
16040static struct PyModuleDef _string_module = {
16041 PyModuleDef_HEAD_INIT,
16042 "_string",
16043 PyDoc_STR("string helper module"),
16044 0,
16045 _string_methods,
16046 NULL,
16047 NULL,
16048 NULL,
16049 NULL
16050};
16051
16052PyMODINIT_FUNC
16053PyInit__string(void)
16054{
16055 return PyModule_Create(&_string_module);
16056}
16057
16058
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000016059#ifdef __cplusplus
16060}
16061#endif