blob: d4b2c93a8452643ac0055d43e8eec22794e6bcc4 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Benjamin Peterson31616ea2011-10-01 00:11:09 -04004modified by Marc-Andre Lemburg <mal@lemburg.com>.
Guido van Rossumd57fd912000-03-10 22:53:23 +00005
Thomas Wouters477c8d52006-05-27 19:21:47 +00006Major speed upgrades to the method implementations at the Reykjavik
7NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8
Guido van Rossum16b1ad92000-08-03 16:24:25 +00009Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000010
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000011--------------------------------------------------------------------
12The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000013
Benjamin Peterson29060642009-01-31 22:14:21 +000014 Copyright (c) 1999 by Secret Labs AB
15 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000016
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000017By obtaining, using, and/or copying this software and/or its
18associated documentation, you agree that you have read, understood,
19and will comply with the following terms and conditions:
20
21Permission to use, copy, modify, and distribute this software and its
22associated documentation for any purpose and without fee is hereby
23granted, provided that the above copyright notice appears in all
24copies, and that both that copyright notice and this permission notice
25appear in supporting documentation, and that the name of Secret Labs
26AB or the author not be used in advertising or publicity pertaining to
27distribution of the software without specific, written prior
28permission.
29
30SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37--------------------------------------------------------------------
38
39*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000040
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Victor Stinner331a6a52019-05-27 16:39:22 +020043#include "pycore_initconfig.h"
Victor Stinner9fc57a32018-11-07 00:44:03 +010044#include "pycore_fileutils.h"
Victor Stinnerbcda8f12018-11-21 22:27:47 +010045#include "pycore_object.h"
Victor Stinner43fc3bb2019-05-02 11:54:20 -040046#include "pycore_pylifecycle.h"
Victor Stinner621cebe2018-11-12 16:53:38 +010047#include "pycore_pystate.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000048#include "ucnhash.h"
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050049#include "bytes_methods.h"
Raymond Hettingerac2ef652015-07-04 16:04:44 -070050#include "stringlib/eq.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000051
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000052#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000053#include <windows.h>
54#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000055
/* Uncomment to display statistics on interned strings at exit when
   using Valgrind or Insure++. */
58/* #define INTERNED_STATS 1 */
59
60
Larry Hastings61272b72014-01-07 12:41:53 -080061/*[clinic input]
INADA Naoki15f94592017-01-16 21:49:13 +090062class str "PyObject *" "&PyUnicode_Type"
Larry Hastings61272b72014-01-07 12:41:53 -080063[clinic start generated code]*/
INADA Naoki3ae20562017-01-16 20:41:20 +090064/*[clinic end generated code: output=da39a3ee5e6b4b0d input=4884c934de622cf6]*/
65
66/*[python input]
67class Py_UCS4_converter(CConverter):
68 type = 'Py_UCS4'
69 converter = 'convert_uc'
70
71 def converter_init(self):
72 if self.default is not unspecified:
73 self.c_default = ascii(self.default)
74 if len(self.c_default) > 4 or self.c_default[0] != "'":
75 self.c_default = hex(ord(self.default))
76
77[python start generated code]*/
78/*[python end generated code: output=da39a3ee5e6b4b0d input=88f5dd06cd8e7a61]*/
Larry Hastings44e2eaa2013-11-23 15:37:55 -080079
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000080/* --- Globals ------------------------------------------------------------
81
Serhiy Storchaka05997252013-01-26 12:14:02 +020082NOTE: In the interpreter's initialization phase, some globals are currently
83 initialized dynamically as needed. In the process Unicode objects may
84 be created before the Unicode type is ready.
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000085
86*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000087
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000088
89#ifdef __cplusplus
90extern "C" {
91#endif
92
Victor Stinner8faf8212011-12-08 22:14:11 +010093/* Maximum code point of Unicode 6.0: 0x10ffff (1,114,111) */
94#define MAX_UNICODE 0x10ffff
95
Victor Stinner910337b2011-10-03 03:20:16 +020096#ifdef Py_DEBUG
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020097# define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
Victor Stinner910337b2011-10-03 03:20:16 +020098#else
99# define _PyUnicode_CHECK(op) PyUnicode_Check(op)
100#endif
Victor Stinnerfb5f5f22011-09-28 21:39:49 +0200101
/* Unchecked access to the utf8 field of a PyCompactUnicodeObject. */
#define _PyUnicode_UTF8(op)                             \
    (((PyCompactUnicodeObject*)(op))->utf8)

/* Checked UTF-8 pointer of a ready string.  For a compact ASCII string
   this is the address right after the PyASCIIObject header; otherwise
   it is the utf8 field. */
#define PyUnicode_UTF8(op)                              \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((char*)((PyASCIIObject*)(op) + 1)) :          \
         _PyUnicode_UTF8(op))

/* Unchecked access to the utf8_length field. */
#define _PyUnicode_UTF8_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->utf8_length)

/* Checked UTF-8 length of a ready string.  For a compact ASCII string
   this is the length field of the PyASCIIObject; otherwise it is the
   utf8_length field. */
#define PyUnicode_UTF8_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((PyASCIIObject*)(op))->length :               \
         _PyUnicode_UTF8_LENGTH(op))

/* Unchecked field accessors (no assertion on the object type). */
#define _PyUnicode_WSTR(op)                             \
    (((PyASCIIObject*)(op))->wstr)
#define _PyUnicode_WSTR_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->wstr_length)
#define _PyUnicode_LENGTH(op)                           \
    (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op)                            \
    (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op)                             \
    (((PyASCIIObject *)(op))->hash)

/* Field accessors that first assert _PyUnicode_CHECK(op). */
#define _PyUnicode_KIND(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->state.kind)
#define _PyUnicode_GET_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->length)

/* Unchecked access to the data.any pointer of a PyUnicodeObject. */
#define _PyUnicode_DATA_ANY(op)                         \
    (((PyUnicodeObject*)(op))->data.any)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200136
/* File-local replacement for PyUnicode_READY: evaluates to 0 when the
   string is already ready, and to the result of _PyUnicode_Ready(op)
   otherwise. */
#undef PyUnicode_READY
#define PyUnicode_READY(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     (PyUnicode_IS_READY(op)                            \
          ? 0                                           \
          : _PyUnicode_Ready(op)))
Victor Stinner910337b2011-10-03 03:20:16 +0200143
/* True if the UTF-8 pointer of the object is the same pointer as its
   character data.  Must not be used on a compact ASCII string (this is
   asserted). */
#define _PyUnicode_SHARE_UTF8(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)),           \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))

/* True if the wstr pointer of the object is the same pointer as its
   character data.  The macro argument is used for every access: the
   previous expansion read `unicode` from the caller's scope instead of
   `op`, so it only compiled (and was only correct) when the caller's
   variable happened to have that name. */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
151
/* True if the Unicode object has its own UTF-8 memory block: the object
   is not compact ASCII, its utf8 pointer is set, and that pointer is not
   the character data itself. */
#define _PyUnicode_HAS_UTF8_MEMORY(op)                  \
    ((!PyUnicode_IS_COMPACT_ASCII(op)                   \
      && _PyUnicode_UTF8(op) != NULL                    \
      && _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))

/* True if the Unicode object has its own wstr memory block: the wstr
   pointer is set and either the object is not ready or the pointer is
   not the character data itself. */
#define _PyUnicode_HAS_WSTR_MEMORY(op)                  \
    ((_PyUnicode_WSTR(op) != NULL                       \
      && (!PyUnicode_IS_READY(op)                       \
          || _PyUnicode_WSTR(op) != PyUnicode_DATA(op))))
165
/* Generic helper macro that copies a run of characters while converting
   each one from from_type to to_type.

   from_type and to_type must be valid type names.  begin and end delimit
   the source characters and should be of type "from_type *".  to is a
   "to_type *" pointing at the buffer that receives the converted
   characters.

   The largest multiple-of-four prefix is copied four characters per
   iteration; the remaining zero to three characters are copied one at a
   time. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                \
        to_type *_to = (to_type *)(to);                 \
        const from_type *_iter = (from_type *)(begin);  \
        const from_type *_end = (from_type *)(end);     \
        Py_ssize_t n = (_end) - (_iter);                \
        const from_type *_unrolled_end =                \
            _iter + _Py_SIZE_ROUND_DOWN(n, 4);          \
        for (; _iter < (_unrolled_end); _iter += 4, _to += 4) { \
            _to[0] = (to_type) _iter[0];                \
            _to[1] = (to_type) _iter[1];                \
            _to[2] = (to_type) _iter[2];                \
            _to[3] = (to_type) _iter[3];                \
        }                                               \
        for (; _iter < (_end); ++_iter, ++_to) {        \
            *_to = (to_type) *_iter;                    \
        }                                               \
    } while (0)
Victor Stinner829c0ad2011-10-03 01:08:02 +0200189
Victor Stinnerfdfbf782015-10-09 00:33:49 +0200190#ifdef MS_WINDOWS
191 /* On Windows, overallocate by 50% is the best factor */
192# define OVERALLOCATE_FACTOR 2
193#else
194 /* On Linux, overallocate by 25% is the best factor */
195# define OVERALLOCATE_FACTOR 4
196#endif
197
Walter Dörwald16807132007-05-25 13:52:07 +0000198/* This dictionary holds all interned unicode strings. Note that references
199 to strings in this dictionary are *not* counted in the string's ob_refcnt.
200 When the interned string reaches a refcnt of 0 the string deallocation
201 function will delete the reference from this dictionary.
202
   Another way to look at this is to say that the actual reference
   count of a string is:  s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000205*/
Serhiy Storchaka05997252013-01-26 12:14:02 +0200206static PyObject *interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +0000207
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000208/* The empty Unicode object is shared to improve performance. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200209static PyObject *unicode_empty = NULL;
Serhiy Storchaka05997252013-01-26 12:14:02 +0200210
/* Take a new reference to the shared empty string, creating it with
   PyUnicode_New(0, 0) on first use.  If that creation fails,
   unicode_empty is left NULL and no reference is taken. */
#define _Py_INCREF_UNICODE_EMPTY()                      \
    do {                                                \
        if (unicode_empty == NULL) {                    \
            unicode_empty = PyUnicode_New(0, 0);        \
            if (unicode_empty != NULL) {                \
                Py_INCREF(unicode_empty);               \
                assert(_PyUnicode_CheckConsistency(unicode_empty, 1)); \
            }                                           \
        }                                               \
        else {                                          \
            Py_INCREF(unicode_empty);                   \
        }                                               \
    } while (0)

/* Return a new reference to the shared empty string from the enclosing
   function; the returned value is whatever unicode_empty holds after
   _Py_INCREF_UNICODE_EMPTY() (NULL if its creation failed). */
#define _Py_RETURN_UNICODE_EMPTY()                      \
    do {                                                \
        _Py_INCREF_UNICODE_EMPTY();                     \
        return unicode_empty;                           \
    } while (0)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000229
Victor Stinner59423e32018-11-26 13:40:01 +0100230static inline void
231unicode_fill(enum PyUnicode_Kind kind, void *data, Py_UCS4 value,
232 Py_ssize_t start, Py_ssize_t length)
233{
234 assert(0 <= start);
235 assert(kind != PyUnicode_WCHAR_KIND);
236 switch (kind) {
237 case PyUnicode_1BYTE_KIND: {
Victor Stinner163403a2018-11-27 12:41:17 +0100238 assert(value <= 0xff);
Victor Stinner59423e32018-11-26 13:40:01 +0100239 Py_UCS1 ch = (unsigned char)value;
240 Py_UCS1 *to = (Py_UCS1 *)data + start;
241 memset(to, ch, length);
242 break;
243 }
244 case PyUnicode_2BYTE_KIND: {
Victor Stinner163403a2018-11-27 12:41:17 +0100245 assert(value <= 0xffff);
Victor Stinner59423e32018-11-26 13:40:01 +0100246 Py_UCS2 ch = (Py_UCS2)value;
247 Py_UCS2 *to = (Py_UCS2 *)data + start;
248 const Py_UCS2 *end = to + length;
249 for (; to < end; ++to) *to = ch;
250 break;
251 }
252 case PyUnicode_4BYTE_KIND: {
Victor Stinner163403a2018-11-27 12:41:17 +0100253 assert(value <= MAX_UNICODE);
Victor Stinner59423e32018-11-26 13:40:01 +0100254 Py_UCS4 ch = value;
255 Py_UCS4 * to = (Py_UCS4 *)data + start;
256 const Py_UCS4 *end = to + length;
257 for (; to < end; ++to) *to = ch;
258 break;
259 }
260 default: Py_UNREACHABLE();
261 }
262}
263
264
Victor Stinner8a1a6cf2013-04-14 02:35:33 +0200265/* Forward declaration */
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700266static inline int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +0200267_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch);
Inada Naoki770847a2019-06-24 12:30:24 +0900268static inline void
269_PyUnicodeWriter_InitWithBuffer(_PyUnicodeWriter *writer, PyObject *buffer);
Victor Stinner709d23d2019-05-02 14:56:30 -0400270static PyObject *
271unicode_encode_utf8(PyObject *unicode, _Py_error_handler error_handler,
272 const char *errors);
273static PyObject *
274unicode_decode_utf8(const char *s, Py_ssize_t size,
275 _Py_error_handler error_handler, const char *errors,
276 Py_ssize_t *consumed);
Victor Stinner8a1a6cf2013-04-14 02:35:33 +0200277
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200278/* List of static strings. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200279static _Py_Identifier *static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200280
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000281/* Single character Unicode strings in the Latin-1 range are being
282 shared as well. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200283static PyObject *unicode_latin1[256] = {NULL};
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000284
/* Fast detection of the most frequent whitespace characters.

   Lookup table indexed by a 7-bit code point: an entry is 1 for the
   characters listed below and 0 for every other index in 0..127. */
const unsigned char _Py_ascii_whitespace[128] = {
    [0x09] = 1,     /* CHARACTER TABULATION */
    [0x0A] = 1,     /* LINE FEED */
    [0x0B] = 1,     /* LINE TABULATION */
    [0x0C] = 1,     /* FORM FEED */
    [0x0D] = 1,     /* CARRIAGE RETURN */
    [0x1C] = 1,     /* FILE SEPARATOR */
    [0x1D] = 1,     /* GROUP SEPARATOR */
    [0x1E] = 1,     /* RECORD SEPARATOR */
    [0x1F] = 1,     /* UNIT SEPARATOR */
    [0x20] = 1      /* SPACE */
};
315
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200316/* forward */
Victor Stinnerfe226c02011-10-03 03:52:20 +0200317static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200318static PyObject* get_latin1_char(unsigned char ch);
Victor Stinner488fa492011-12-12 00:01:39 +0100319static int unicode_modifiable(PyObject *unicode);
320
Victor Stinnerfe226c02011-10-03 03:52:20 +0200321
Alexander Belopolsky40018472011-02-26 01:02:56 +0000322static PyObject *
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100323_PyUnicode_FromUCS1(const Py_UCS1 *s, Py_ssize_t size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200324static PyObject *
325_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
326static PyObject *
327_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
328
329static PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +0000330unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000331 PyObject **errorHandler,const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +0100332 PyObject *unicode, PyObject **exceptionObject,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000333 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
334
Alexander Belopolsky40018472011-02-26 01:02:56 +0000335static void
336raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300337 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +0100338 PyObject *unicode,
339 Py_ssize_t startpos, Py_ssize_t endpos,
340 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000341
/* Same for linebreaks.

   Lookup table indexed by a 7-bit code point: an entry is 1 for the
   characters listed below and 0 for every other index in 0..127. */
static const unsigned char ascii_linebreak[128] = {
    [0x0A] = 1,     /* LINE FEED */
    [0x0B] = 1,     /* LINE TABULATION */
    [0x0C] = 1,     /* FORM FEED */
    [0x0D] = 1,     /* CARRIAGE RETURN */
    [0x1C] = 1,     /* FILE SEPARATOR */
    [0x1D] = 1,     /* GROUP SEPARATOR */
    [0x1E] = 1      /* RECORD SEPARATOR */
};
369
INADA Naoki3ae20562017-01-16 20:41:20 +0900370static int convert_uc(PyObject *obj, void *addr);
371
Serhiy Storchaka1009bf12015-04-03 23:53:51 +0300372#include "clinic/unicodeobject.c.h"
373
Victor Stinner3d4226a2018-08-29 22:21:32 +0200374_Py_error_handler
375_Py_GetErrorHandler(const char *errors)
Victor Stinner50149202015-09-22 00:26:54 +0200376{
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200377 if (errors == NULL || strcmp(errors, "strict") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200378 return _Py_ERROR_STRICT;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200379 }
380 if (strcmp(errors, "surrogateescape") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200381 return _Py_ERROR_SURROGATEESCAPE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200382 }
383 if (strcmp(errors, "replace") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200384 return _Py_ERROR_REPLACE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200385 }
386 if (strcmp(errors, "ignore") == 0) {
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200387 return _Py_ERROR_IGNORE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200388 }
389 if (strcmp(errors, "backslashreplace") == 0) {
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200390 return _Py_ERROR_BACKSLASHREPLACE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200391 }
392 if (strcmp(errors, "surrogatepass") == 0) {
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200393 return _Py_ERROR_SURROGATEPASS;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200394 }
395 if (strcmp(errors, "xmlcharrefreplace") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200396 return _Py_ERROR_XMLCHARREFREPLACE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200397 }
Victor Stinner50149202015-09-22 00:26:54 +0200398 return _Py_ERROR_OTHER;
399}
400
Victor Stinner709d23d2019-05-02 14:56:30 -0400401
402static _Py_error_handler
403get_error_handler_wide(const wchar_t *errors)
404{
405 if (errors == NULL || wcscmp(errors, L"strict") == 0) {
406 return _Py_ERROR_STRICT;
407 }
408 if (wcscmp(errors, L"surrogateescape") == 0) {
409 return _Py_ERROR_SURROGATEESCAPE;
410 }
411 if (wcscmp(errors, L"replace") == 0) {
412 return _Py_ERROR_REPLACE;
413 }
414 if (wcscmp(errors, L"ignore") == 0) {
415 return _Py_ERROR_IGNORE;
416 }
417 if (wcscmp(errors, L"backslashreplace") == 0) {
418 return _Py_ERROR_BACKSLASHREPLACE;
419 }
420 if (wcscmp(errors, L"surrogatepass") == 0) {
421 return _Py_ERROR_SURROGATEPASS;
422 }
423 if (wcscmp(errors, L"xmlcharrefreplace") == 0) {
424 return _Py_ERROR_XMLCHARREFREPLACE;
425 }
426 return _Py_ERROR_OTHER;
427}
428
429
Victor Stinner22eb6892019-06-26 00:51:05 +0200430static inline int
431unicode_check_encoding_errors(const char *encoding, const char *errors)
432{
433 if (encoding == NULL && errors == NULL) {
434 return 0;
435 }
436
437 PyInterpreterState *interp = _PyInterpreterState_GET_UNSAFE();
438#ifndef Py_DEBUG
439 /* In release mode, only check in development mode (-X dev) */
440 if (!interp->config.dev_mode) {
441 return 0;
442 }
443#else
444 /* Always check in debug mode */
445#endif
446
447 /* Avoid calling _PyCodec_Lookup() and PyCodec_LookupError() before the
448 codec registry is ready: before_PyUnicode_InitEncodings() is called. */
449 if (!interp->fs_codec.encoding) {
450 return 0;
451 }
452
453 if (encoding != NULL) {
454 PyObject *handler = _PyCodec_Lookup(encoding);
455 if (handler == NULL) {
456 return -1;
457 }
458 Py_DECREF(handler);
459 }
460
461 if (errors != NULL) {
462 PyObject *handler = PyCodec_LookupError(errors);
463 if (handler == NULL) {
464 return -1;
465 }
466 Py_DECREF(handler);
467 }
468 return 0;
469}
470
471
Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300472/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
473 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000474Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000475PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000476{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000477#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000478 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000479#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000480 /* This is actually an illegal character, so it should
481 not be passed to unichr. */
482 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000483#endif
484}
485
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +0200486int
Victor Stinner7931d9a2011-11-04 00:22:48 +0100487_PyUnicode_CheckConsistency(PyObject *op, int check_content)
Victor Stinner910337b2011-10-03 03:20:16 +0200488{
489 PyASCIIObject *ascii;
490 unsigned int kind;
491
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200492 _PyObject_ASSERT(op, PyUnicode_Check(op));
Victor Stinner910337b2011-10-03 03:20:16 +0200493
494 ascii = (PyASCIIObject *)op;
495 kind = ascii->state.kind;
496
Victor Stinnera3b334d2011-10-03 13:53:37 +0200497 if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200498 _PyObject_ASSERT(op, kind == PyUnicode_1BYTE_KIND);
499 _PyObject_ASSERT(op, ascii->state.ready == 1);
Victor Stinner910337b2011-10-03 03:20:16 +0200500 }
Victor Stinnera41463c2011-10-04 01:05:08 +0200501 else {
Victor Stinner85041a52011-10-03 14:42:39 +0200502 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
Raymond Hettinger0138c4c2019-08-27 09:55:13 -0700503#ifndef NDEBUG
Victor Stinner7f11ad42011-10-04 00:00:20 +0200504 void *data;
Raymond Hettinger0138c4c2019-08-27 09:55:13 -0700505#endif
Victor Stinner910337b2011-10-03 03:20:16 +0200506
Victor Stinnera41463c2011-10-04 01:05:08 +0200507 if (ascii->state.compact == 1) {
Raymond Hettinger0138c4c2019-08-27 09:55:13 -0700508#ifndef NDEBUG
Victor Stinnera41463c2011-10-04 01:05:08 +0200509 data = compact + 1;
Raymond Hettinger0138c4c2019-08-27 09:55:13 -0700510#endif
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200511 _PyObject_ASSERT(op, kind == PyUnicode_1BYTE_KIND
512 || kind == PyUnicode_2BYTE_KIND
513 || kind == PyUnicode_4BYTE_KIND);
514 _PyObject_ASSERT(op, ascii->state.ascii == 0);
515 _PyObject_ASSERT(op, ascii->state.ready == 1);
516 _PyObject_ASSERT(op, compact->utf8 != data);
Victor Stinnere30c0a12011-11-04 20:54:05 +0100517 }
518 else {
Raymond Hettinger0138c4c2019-08-27 09:55:13 -0700519#ifndef NDEBUG
Victor Stinnera41463c2011-10-04 01:05:08 +0200520 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
521
522 data = unicode->data.any;
Raymond Hettinger0138c4c2019-08-27 09:55:13 -0700523#endif
Victor Stinnera41463c2011-10-04 01:05:08 +0200524 if (kind == PyUnicode_WCHAR_KIND) {
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200525 _PyObject_ASSERT(op, ascii->length == 0);
526 _PyObject_ASSERT(op, ascii->hash == -1);
527 _PyObject_ASSERT(op, ascii->state.compact == 0);
528 _PyObject_ASSERT(op, ascii->state.ascii == 0);
529 _PyObject_ASSERT(op, ascii->state.ready == 0);
530 _PyObject_ASSERT(op, ascii->state.interned == SSTATE_NOT_INTERNED);
531 _PyObject_ASSERT(op, ascii->wstr != NULL);
532 _PyObject_ASSERT(op, data == NULL);
533 _PyObject_ASSERT(op, compact->utf8 == NULL);
Victor Stinnera41463c2011-10-04 01:05:08 +0200534 }
535 else {
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200536 _PyObject_ASSERT(op, kind == PyUnicode_1BYTE_KIND
537 || kind == PyUnicode_2BYTE_KIND
538 || kind == PyUnicode_4BYTE_KIND);
539 _PyObject_ASSERT(op, ascii->state.compact == 0);
540 _PyObject_ASSERT(op, ascii->state.ready == 1);
541 _PyObject_ASSERT(op, data != NULL);
Victor Stinnera41463c2011-10-04 01:05:08 +0200542 if (ascii->state.ascii) {
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200543 _PyObject_ASSERT(op, compact->utf8 == data);
544 _PyObject_ASSERT(op, compact->utf8_length == ascii->length);
Victor Stinnera41463c2011-10-04 01:05:08 +0200545 }
546 else
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200547 _PyObject_ASSERT(op, compact->utf8 != data);
Victor Stinnera41463c2011-10-04 01:05:08 +0200548 }
549 }
550 if (kind != PyUnicode_WCHAR_KIND) {
Victor Stinner7f11ad42011-10-04 00:00:20 +0200551 if (
552#if SIZEOF_WCHAR_T == 2
553 kind == PyUnicode_2BYTE_KIND
554#else
555 kind == PyUnicode_4BYTE_KIND
556#endif
557 )
Victor Stinnera41463c2011-10-04 01:05:08 +0200558 {
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200559 _PyObject_ASSERT(op, ascii->wstr == data);
560 _PyObject_ASSERT(op, compact->wstr_length == ascii->length);
Victor Stinnera41463c2011-10-04 01:05:08 +0200561 } else
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200562 _PyObject_ASSERT(op, ascii->wstr != data);
Victor Stinner910337b2011-10-03 03:20:16 +0200563 }
Victor Stinnera41463c2011-10-04 01:05:08 +0200564
565 if (compact->utf8 == NULL)
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200566 _PyObject_ASSERT(op, compact->utf8_length == 0);
Victor Stinnera41463c2011-10-04 01:05:08 +0200567 if (ascii->wstr == NULL)
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200568 _PyObject_ASSERT(op, compact->wstr_length == 0);
Victor Stinner910337b2011-10-03 03:20:16 +0200569 }
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200570
571 /* check that the best kind is used: O(n) operation */
572 if (check_content && kind != PyUnicode_WCHAR_KIND) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200573 Py_ssize_t i;
574 Py_UCS4 maxchar = 0;
Victor Stinner718fbf02012-04-26 00:39:37 +0200575 void *data;
576 Py_UCS4 ch;
577
578 data = PyUnicode_DATA(ascii);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200579 for (i=0; i < ascii->length; i++)
580 {
Victor Stinner718fbf02012-04-26 00:39:37 +0200581 ch = PyUnicode_READ(kind, data, i);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200582 if (ch > maxchar)
583 maxchar = ch;
584 }
585 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinner77faf692011-11-20 18:56:05 +0100586 if (ascii->state.ascii == 0) {
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200587 _PyObject_ASSERT(op, maxchar >= 128);
588 _PyObject_ASSERT(op, maxchar <= 255);
Victor Stinner77faf692011-11-20 18:56:05 +0100589 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200590 else
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200591 _PyObject_ASSERT(op, maxchar < 128);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200592 }
Victor Stinner77faf692011-11-20 18:56:05 +0100593 else if (kind == PyUnicode_2BYTE_KIND) {
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200594 _PyObject_ASSERT(op, maxchar >= 0x100);
595 _PyObject_ASSERT(op, maxchar <= 0xFFFF);
Victor Stinner77faf692011-11-20 18:56:05 +0100596 }
597 else {
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200598 _PyObject_ASSERT(op, maxchar >= 0x10000);
599 _PyObject_ASSERT(op, maxchar <= MAX_UNICODE);
Victor Stinner77faf692011-11-20 18:56:05 +0100600 }
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200601 _PyObject_ASSERT(op, PyUnicode_READ(kind, data, ascii->length) == 0);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200602 }
Benjamin Petersonccc51c12011-10-03 19:34:12 -0400603 return 1;
604}
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200605
Victor Stinner910337b2011-10-03 03:20:16 +0200606
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100607static PyObject*
608unicode_result_wchar(PyObject *unicode)
609{
610#ifndef Py_DEBUG
611 Py_ssize_t len;
612
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100613 len = _PyUnicode_WSTR_LENGTH(unicode);
614 if (len == 0) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100615 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200616 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100617 }
618
619 if (len == 1) {
620 wchar_t ch = _PyUnicode_WSTR(unicode)[0];
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100621 if ((Py_UCS4)ch < 256) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100622 PyObject *latin1_char = get_latin1_char((unsigned char)ch);
623 Py_DECREF(unicode);
624 return latin1_char;
625 }
626 }
627
628 if (_PyUnicode_Ready(unicode) < 0) {
Victor Stinneraa771272012-10-04 02:32:58 +0200629 Py_DECREF(unicode);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100630 return NULL;
631 }
632#else
Victor Stinneraa771272012-10-04 02:32:58 +0200633 assert(Py_REFCNT(unicode) == 1);
634
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100635 /* don't make the result ready in debug mode to ensure that the caller
636 makes the string ready before using it */
637 assert(_PyUnicode_CheckConsistency(unicode, 1));
638#endif
639 return unicode;
640}
641
642static PyObject*
643unicode_result_ready(PyObject *unicode)
644{
645 Py_ssize_t length;
646
647 length = PyUnicode_GET_LENGTH(unicode);
648 if (length == 0) {
649 if (unicode != unicode_empty) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100650 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200651 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100652 }
653 return unicode_empty;
654 }
655
656 if (length == 1) {
Victor Stinner69ed0f42013-04-09 21:48:24 +0200657 void *data = PyUnicode_DATA(unicode);
658 int kind = PyUnicode_KIND(unicode);
659 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100660 if (ch < 256) {
661 PyObject *latin1_char = unicode_latin1[ch];
662 if (latin1_char != NULL) {
663 if (unicode != latin1_char) {
664 Py_INCREF(latin1_char);
665 Py_DECREF(unicode);
666 }
667 return latin1_char;
668 }
669 else {
670 assert(_PyUnicode_CheckConsistency(unicode, 1));
671 Py_INCREF(unicode);
672 unicode_latin1[ch] = unicode;
673 return unicode;
674 }
675 }
676 }
677
678 assert(_PyUnicode_CheckConsistency(unicode, 1));
679 return unicode;
680}
681
682static PyObject*
683unicode_result(PyObject *unicode)
684{
685 assert(_PyUnicode_CHECK(unicode));
686 if (PyUnicode_IS_READY(unicode))
687 return unicode_result_ready(unicode);
688 else
689 return unicode_result_wchar(unicode);
690}
691
Victor Stinnerc4b49542011-12-11 22:44:26 +0100692static PyObject*
693unicode_result_unchanged(PyObject *unicode)
694{
695 if (PyUnicode_CheckExact(unicode)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -0500696 if (PyUnicode_READY(unicode) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +0100697 return NULL;
698 Py_INCREF(unicode);
699 return unicode;
700 }
701 else
702 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +0100703 return _PyUnicode_Copy(unicode);
Victor Stinnerc4b49542011-12-11 22:44:26 +0100704}
705
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200706/* Implementation of the "backslashreplace" error handler for 8-bit encodings:
707 ASCII, Latin1, UTF-8, etc. */
708static char*
Victor Stinnerad771582015-10-09 12:38:53 +0200709backslashreplace(_PyBytesWriter *writer, char *str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200710 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
711{
Victor Stinnerad771582015-10-09 12:38:53 +0200712 Py_ssize_t size, i;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200713 Py_UCS4 ch;
714 enum PyUnicode_Kind kind;
715 void *data;
716
717 assert(PyUnicode_IS_READY(unicode));
718 kind = PyUnicode_KIND(unicode);
719 data = PyUnicode_DATA(unicode);
720
721 size = 0;
722 /* determine replacement size */
723 for (i = collstart; i < collend; ++i) {
724 Py_ssize_t incr;
725
726 ch = PyUnicode_READ(kind, data, i);
727 if (ch < 0x100)
728 incr = 2+2;
729 else if (ch < 0x10000)
730 incr = 2+4;
731 else {
732 assert(ch <= MAX_UNICODE);
Victor Stinner3fa36ff2015-10-09 03:37:11 +0200733 incr = 2+8;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200734 }
735 if (size > PY_SSIZE_T_MAX - incr) {
736 PyErr_SetString(PyExc_OverflowError,
737 "encoded result is too long for a Python string");
738 return NULL;
739 }
740 size += incr;
741 }
742
Victor Stinnerad771582015-10-09 12:38:53 +0200743 str = _PyBytesWriter_Prepare(writer, str, size);
744 if (str == NULL)
745 return NULL;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200746
747 /* generate replacement */
748 for (i = collstart; i < collend; ++i) {
749 ch = PyUnicode_READ(kind, data, i);
Victor Stinner797485e2015-10-09 03:17:30 +0200750 *str++ = '\\';
751 if (ch >= 0x00010000) {
752 *str++ = 'U';
753 *str++ = Py_hexdigits[(ch>>28)&0xf];
754 *str++ = Py_hexdigits[(ch>>24)&0xf];
755 *str++ = Py_hexdigits[(ch>>20)&0xf];
756 *str++ = Py_hexdigits[(ch>>16)&0xf];
757 *str++ = Py_hexdigits[(ch>>12)&0xf];
758 *str++ = Py_hexdigits[(ch>>8)&0xf];
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200759 }
Victor Stinner797485e2015-10-09 03:17:30 +0200760 else if (ch >= 0x100) {
761 *str++ = 'u';
762 *str++ = Py_hexdigits[(ch>>12)&0xf];
763 *str++ = Py_hexdigits[(ch>>8)&0xf];
764 }
765 else
766 *str++ = 'x';
767 *str++ = Py_hexdigits[(ch>>4)&0xf];
768 *str++ = Py_hexdigits[ch&0xf];
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200769 }
770 return str;
771}
772
773/* Implementation of the "xmlcharrefreplace" error handler for 8-bit encodings:
774 ASCII, Latin1, UTF-8, etc. */
775static char*
Victor Stinnerad771582015-10-09 12:38:53 +0200776xmlcharrefreplace(_PyBytesWriter *writer, char *str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200777 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
778{
Victor Stinnerad771582015-10-09 12:38:53 +0200779 Py_ssize_t size, i;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200780 Py_UCS4 ch;
781 enum PyUnicode_Kind kind;
782 void *data;
783
784 assert(PyUnicode_IS_READY(unicode));
785 kind = PyUnicode_KIND(unicode);
786 data = PyUnicode_DATA(unicode);
787
788 size = 0;
789 /* determine replacement size */
790 for (i = collstart; i < collend; ++i) {
791 Py_ssize_t incr;
792
793 ch = PyUnicode_READ(kind, data, i);
794 if (ch < 10)
795 incr = 2+1+1;
796 else if (ch < 100)
797 incr = 2+2+1;
798 else if (ch < 1000)
799 incr = 2+3+1;
800 else if (ch < 10000)
801 incr = 2+4+1;
802 else if (ch < 100000)
803 incr = 2+5+1;
804 else if (ch < 1000000)
805 incr = 2+6+1;
806 else {
807 assert(ch <= MAX_UNICODE);
808 incr = 2+7+1;
809 }
810 if (size > PY_SSIZE_T_MAX - incr) {
811 PyErr_SetString(PyExc_OverflowError,
812 "encoded result is too long for a Python string");
813 return NULL;
814 }
815 size += incr;
816 }
817
Victor Stinnerad771582015-10-09 12:38:53 +0200818 str = _PyBytesWriter_Prepare(writer, str, size);
819 if (str == NULL)
820 return NULL;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200821
822 /* generate replacement */
823 for (i = collstart; i < collend; ++i) {
824 str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
825 }
826 return str;
827}
828
Thomas Wouters477c8d52006-05-27 19:21:47 +0000829/* --- Bloom Filters ----------------------------------------------------- */
830
/* Helpers implementing simple "bloom filters" for Unicode characters.
   To keep things simple, we use a single bitmask, using the low-order
   bits of each Unicode character (5, 6 or 7 bits, depending on
   BLOOM_WIDTH) as the bit index. */
834
835/* the linebreak mask is set up by Unicode_Init below */
836
/* Number of bits used in a bloom mask: the largest of 128, 64 and 32 that
   does not exceed LONG_BIT.  Compilation stops if LONG_BIT is below 32. */
#if LONG_BIT >= 128
#define BLOOM_WIDTH 128
#elif LONG_BIT >= 64
#define BLOOM_WIDTH 64
#elif LONG_BIT >= 32
#define BLOOM_WIDTH 32
#else
#error "LONG_BIT is smaller than 32"
#endif

/* Integer type holding a bloom mask. */
#define BLOOM_MASK unsigned long

/* Bloom mask for line break characters; initially every bit is set. */
static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000850
/* Non-zero if the bit selected by the low-order bits of `ch` is set in
   `mask`.  Both arguments are parenthesized so that any expression
   (e.g. `a | b`) can be passed as the mask. */
#define BLOOM(mask, ch)     ((mask) & (1UL << ((ch) & (BLOOM_WIDTH - 1))))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000852
/* Line break test for `ch`.  Values below 128 are looked up directly in
   ascii_linebreak[]; other values must first pass the bloom_linebreak
   filter before Py_UNICODE_ISLINEBREAK() is consulted.
   Note: `ch` is evaluated more than once. */
#define BLOOM_LINEBREAK(ch)                                             \
    ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000856
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700857static inline BLOOM_MASK
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200858make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000859{
Victor Stinnera85af502013-04-09 21:53:54 +0200860#define BLOOM_UPDATE(TYPE, MASK, PTR, LEN) \
861 do { \
862 TYPE *data = (TYPE *)PTR; \
863 TYPE *end = data + LEN; \
864 Py_UCS4 ch; \
865 for (; data != end; data++) { \
866 ch = *data; \
867 MASK |= (1UL << (ch & (BLOOM_WIDTH - 1))); \
868 } \
869 break; \
870 } while (0)
871
Thomas Wouters477c8d52006-05-27 19:21:47 +0000872 /* calculate simple bloom-style bitmask for a given unicode string */
873
Antoine Pitrouf068f942010-01-13 14:19:12 +0000874 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000875
876 mask = 0;
Victor Stinnera85af502013-04-09 21:53:54 +0200877 switch (kind) {
878 case PyUnicode_1BYTE_KIND:
879 BLOOM_UPDATE(Py_UCS1, mask, ptr, len);
880 break;
881 case PyUnicode_2BYTE_KIND:
882 BLOOM_UPDATE(Py_UCS2, mask, ptr, len);
883 break;
884 case PyUnicode_4BYTE_KIND:
885 BLOOM_UPDATE(Py_UCS4, mask, ptr, len);
886 break;
887 default:
Barry Warsawb2e57942017-09-14 18:13:16 -0700888 Py_UNREACHABLE();
Victor Stinnera85af502013-04-09 21:53:54 +0200889 }
Thomas Wouters477c8d52006-05-27 19:21:47 +0000890 return mask;
Victor Stinnera85af502013-04-09 21:53:54 +0200891
892#undef BLOOM_UPDATE
Thomas Wouters477c8d52006-05-27 19:21:47 +0000893}
894
Serhiy Storchaka21a663e2016-04-13 15:37:23 +0300895static int
896ensure_unicode(PyObject *obj)
897{
898 if (!PyUnicode_Check(obj)) {
899 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +0200900 "must be str, not %.100s",
901 Py_TYPE(obj)->tp_name);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +0300902 return -1;
903 }
904 return PyUnicode_READY(obj);
905}
906
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200907/* Compilation of templated routines */
908
909#include "stringlib/asciilib.h"
910#include "stringlib/fastsearch.h"
911#include "stringlib/partition.h"
912#include "stringlib/split.h"
913#include "stringlib/count.h"
914#include "stringlib/find.h"
915#include "stringlib/find_max_char.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200916#include "stringlib/undef.h"
917
918#include "stringlib/ucs1lib.h"
919#include "stringlib/fastsearch.h"
920#include "stringlib/partition.h"
921#include "stringlib/split.h"
922#include "stringlib/count.h"
923#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300924#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200925#include "stringlib/find_max_char.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200926#include "stringlib/undef.h"
927
928#include "stringlib/ucs2lib.h"
929#include "stringlib/fastsearch.h"
930#include "stringlib/partition.h"
931#include "stringlib/split.h"
932#include "stringlib/count.h"
933#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300934#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200935#include "stringlib/find_max_char.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200936#include "stringlib/undef.h"
937
938#include "stringlib/ucs4lib.h"
939#include "stringlib/fastsearch.h"
940#include "stringlib/partition.h"
941#include "stringlib/split.h"
942#include "stringlib/count.h"
943#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300944#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200945#include "stringlib/find_max_char.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200946#include "stringlib/undef.h"
947
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200948#include "stringlib/unicodedefs.h"
949#include "stringlib/fastsearch.h"
950#include "stringlib/count.h"
951#include "stringlib/find.h"
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100952#include "stringlib/undef.h"
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200953
Guido van Rossumd57fd912000-03-10 22:53:23 +0000954/* --- Unicode Object ----------------------------------------------------- */
955
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700956static inline Py_ssize_t
957findchar(const void *s, int kind,
958 Py_ssize_t size, Py_UCS4 ch,
959 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200960{
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200961 switch (kind) {
962 case PyUnicode_1BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200963 if ((Py_UCS1) ch != ch)
964 return -1;
965 if (direction > 0)
966 return ucs1lib_find_char((Py_UCS1 *) s, size, (Py_UCS1) ch);
967 else
968 return ucs1lib_rfind_char((Py_UCS1 *) s, size, (Py_UCS1) ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200969 case PyUnicode_2BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200970 if ((Py_UCS2) ch != ch)
971 return -1;
972 if (direction > 0)
973 return ucs2lib_find_char((Py_UCS2 *) s, size, (Py_UCS2) ch);
974 else
975 return ucs2lib_rfind_char((Py_UCS2 *) s, size, (Py_UCS2) ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200976 case PyUnicode_4BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200977 if (direction > 0)
978 return ucs4lib_find_char((Py_UCS4 *) s, size, ch);
979 else
980 return ucs4lib_rfind_char((Py_UCS4 *) s, size, ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200981 default:
Barry Warsawb2e57942017-09-14 18:13:16 -0700982 Py_UNREACHABLE();
Victor Stinner9e7a1bc2011-10-13 00:18:12 +0200983 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200984}
985
#ifdef Py_DEBUG
/* Overwrite the characters added by a resize with 0xff bytes, so that code
   using them before they are initialized is detected earlier.

   _PyUnicode_CheckConsistency(str, 1) detects invalid characters, at least for
   ASCII and UCS-4 strings. U+00FF is invalid in ASCII and U+FFFFFFFF is an
   invalid character in Unicode 6.0.

   `old_length` is the length before the resize; nothing is written when
   the string did not grow. */
static void
unicode_fill_invalid(PyObject *unicode, Py_ssize_t old_length)
{
    const int char_size = PyUnicode_KIND(unicode);
    Py_UCS1 *base = PyUnicode_1BYTE_DATA(unicode);
    const Py_ssize_t new_length = _PyUnicode_LENGTH(unicode);

    if (new_length > old_length) {
        /* Offsets and sizes are in bytes, hence the char_size factor. */
        memset(base + old_length * char_size, 0xff,
               (new_length - old_length) * char_size);
    }
}
#endif
1004
Victor Stinnerfe226c02011-10-03 03:52:20 +02001005static PyObject*
1006resize_compact(PyObject *unicode, Py_ssize_t length)
1007{
1008 Py_ssize_t char_size;
1009 Py_ssize_t struct_size;
1010 Py_ssize_t new_size;
1011 int share_wstr;
Victor Stinner84def372011-12-11 20:04:56 +01001012 PyObject *new_unicode;
Victor Stinnerafffce42012-10-03 23:03:17 +02001013#ifdef Py_DEBUG
1014 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
1015#endif
1016
Victor Stinner79891572012-05-03 13:43:07 +02001017 assert(unicode_modifiable(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001018 assert(PyUnicode_IS_READY(unicode));
Victor Stinner488fa492011-12-12 00:01:39 +01001019 assert(PyUnicode_IS_COMPACT(unicode));
1020
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001021 char_size = PyUnicode_KIND(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +01001022 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerfe226c02011-10-03 03:52:20 +02001023 struct_size = sizeof(PyASCIIObject);
1024 else
1025 struct_size = sizeof(PyCompactUnicodeObject);
Victor Stinnerc379ead2011-10-03 12:52:27 +02001026 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001027
Victor Stinnerfe226c02011-10-03 03:52:20 +02001028 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
1029 PyErr_NoMemory();
1030 return NULL;
1031 }
1032 new_size = (struct_size + (length + 1) * char_size);
1033
Serhiy Storchaka7aa69082015-12-03 01:02:03 +02001034 if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) {
1035 PyObject_DEL(_PyUnicode_UTF8(unicode));
1036 _PyUnicode_UTF8(unicode) = NULL;
1037 _PyUnicode_UTF8_LENGTH(unicode) = 0;
1038 }
Victor Stinner84def372011-12-11 20:04:56 +01001039 _Py_DEC_REFTOTAL;
1040 _Py_ForgetReference(unicode);
1041
Serhiy Storchaka20b39b22014-09-28 11:27:24 +03001042 new_unicode = (PyObject *)PyObject_REALLOC(unicode, new_size);
Victor Stinner84def372011-12-11 20:04:56 +01001043 if (new_unicode == NULL) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001044 _Py_NewReference(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001045 PyErr_NoMemory();
1046 return NULL;
1047 }
Victor Stinner84def372011-12-11 20:04:56 +01001048 unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001049 _Py_NewReference(unicode);
Victor Stinner84def372011-12-11 20:04:56 +01001050
Victor Stinnerfe226c02011-10-03 03:52:20 +02001051 _PyUnicode_LENGTH(unicode) = length;
Victor Stinnerc379ead2011-10-03 12:52:27 +02001052 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001053 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +01001054 if (!PyUnicode_IS_ASCII(unicode))
Victor Stinnerc379ead2011-10-03 12:52:27 +02001055 _PyUnicode_WSTR_LENGTH(unicode) = length;
1056 }
Victor Stinnerbbbac2e2013-02-07 23:12:46 +01001057 else if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) {
1058 PyObject_DEL(_PyUnicode_WSTR(unicode));
1059 _PyUnicode_WSTR(unicode) = NULL;
Victor Stinner5bc03a62016-01-27 16:56:53 +01001060 if (!PyUnicode_IS_ASCII(unicode))
1061 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Victor Stinnerbbbac2e2013-02-07 23:12:46 +01001062 }
Victor Stinnerafffce42012-10-03 23:03:17 +02001063#ifdef Py_DEBUG
1064 unicode_fill_invalid(unicode, old_length);
1065#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001066 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
1067 length, 0);
Victor Stinner79891572012-05-03 13:43:07 +02001068 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001069 return unicode;
1070}
1071
Alexander Belopolsky40018472011-02-26 01:02:56 +00001072static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001073resize_inplace(PyObject *unicode, Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001074{
Victor Stinner95663112011-10-04 01:03:50 +02001075 wchar_t *wstr;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001076 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001077 assert(!PyUnicode_IS_COMPACT(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001078 assert(Py_REFCNT(unicode) == 1);
Tim Petersced69f82003-09-16 20:30:58 +00001079
Victor Stinnerfe226c02011-10-03 03:52:20 +02001080 if (PyUnicode_IS_READY(unicode)) {
1081 Py_ssize_t char_size;
Victor Stinner1c8d0c72011-10-03 12:11:00 +02001082 int share_wstr, share_utf8;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001083 void *data;
Victor Stinnerafffce42012-10-03 23:03:17 +02001084#ifdef Py_DEBUG
1085 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
1086#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001087
1088 data = _PyUnicode_DATA_ANY(unicode);
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001089 char_size = PyUnicode_KIND(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +02001090 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
1091 share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001092
1093 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
1094 PyErr_NoMemory();
1095 return -1;
1096 }
1097 new_size = (length + 1) * char_size;
1098
Victor Stinner7a9105a2011-12-12 00:13:42 +01001099 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
1100 {
1101 PyObject_DEL(_PyUnicode_UTF8(unicode));
1102 _PyUnicode_UTF8(unicode) = NULL;
1103 _PyUnicode_UTF8_LENGTH(unicode) = 0;
1104 }
1105
Victor Stinnerfe226c02011-10-03 03:52:20 +02001106 data = (PyObject *)PyObject_REALLOC(data, new_size);
1107 if (data == NULL) {
1108 PyErr_NoMemory();
1109 return -1;
1110 }
1111 _PyUnicode_DATA_ANY(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +02001112 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001113 _PyUnicode_WSTR(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +02001114 _PyUnicode_WSTR_LENGTH(unicode) = length;
1115 }
1116 if (share_utf8) {
Victor Stinner1c8d0c72011-10-03 12:11:00 +02001117 _PyUnicode_UTF8(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +02001118 _PyUnicode_UTF8_LENGTH(unicode) = length;
1119 }
Victor Stinnerfe226c02011-10-03 03:52:20 +02001120 _PyUnicode_LENGTH(unicode) = length;
1121 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
Victor Stinnerafffce42012-10-03 23:03:17 +02001122#ifdef Py_DEBUG
1123 unicode_fill_invalid(unicode, old_length);
1124#endif
Victor Stinner95663112011-10-04 01:03:50 +02001125 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001126 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001127 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001128 }
Victor Stinnerfe226c02011-10-03 03:52:20 +02001129 }
Victor Stinner95663112011-10-04 01:03:50 +02001130 assert(_PyUnicode_WSTR(unicode) != NULL);
1131
1132 /* check for integer overflow */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -07001133 if (length > PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) - 1) {
Victor Stinner95663112011-10-04 01:03:50 +02001134 PyErr_NoMemory();
1135 return -1;
1136 }
Victor Stinner7a9105a2011-12-12 00:13:42 +01001137 new_size = sizeof(wchar_t) * (length + 1);
Victor Stinner95663112011-10-04 01:03:50 +02001138 wstr = _PyUnicode_WSTR(unicode);
Victor Stinner7a9105a2011-12-12 00:13:42 +01001139 wstr = PyObject_REALLOC(wstr, new_size);
Victor Stinner95663112011-10-04 01:03:50 +02001140 if (!wstr) {
1141 PyErr_NoMemory();
1142 return -1;
1143 }
1144 _PyUnicode_WSTR(unicode) = wstr;
1145 _PyUnicode_WSTR(unicode)[length] = 0;
1146 _PyUnicode_WSTR_LENGTH(unicode) = length;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001147 assert(_PyUnicode_CheckConsistency(unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001148 return 0;
1149}
1150
Victor Stinnerfe226c02011-10-03 03:52:20 +02001151static PyObject*
1152resize_copy(PyObject *unicode, Py_ssize_t length)
1153{
1154 Py_ssize_t copy_length;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001155 if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001156 PyObject *copy;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001157
Serhiy Storchaka2e58f1a2016-10-09 23:44:48 +03001158 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001159
1160 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
1161 if (copy == NULL)
1162 return NULL;
1163
1164 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
Victor Stinnerd3f08822012-05-29 12:57:52 +02001165 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, copy_length);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001166 return copy;
Victor Stinner8cfcbed2011-10-03 23:19:21 +02001167 }
1168 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001169 PyObject *w;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001170
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001171 w = (PyObject*)_PyUnicode_New(length);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001172 if (w == NULL)
1173 return NULL;
1174 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
1175 copy_length = Py_MIN(copy_length, length);
Christian Heimesf051e432016-09-13 20:22:02 +02001176 memcpy(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
Victor Stinnerc6cf1ba2012-10-23 02:54:47 +02001177 copy_length * sizeof(wchar_t));
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001178 return w;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001179 }
1180}
1181
Guido van Rossumd57fd912000-03-10 22:53:23 +00001182/* We allocate one more byte to make sure the string is
Martin v. Löwis47383402007-08-15 07:32:56 +00001183 Ux0000 terminated; some code (e.g. new_identifier)
1184 relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +00001185
1186 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +00001187 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +00001188
1189*/
1190
Alexander Belopolsky40018472011-02-26 01:02:56 +00001191static PyUnicodeObject *
1192_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001193{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001194 PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001195 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001196
Thomas Wouters477c8d52006-05-27 19:21:47 +00001197 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001198 if (length == 0 && unicode_empty != NULL) {
1199 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001200 return (PyUnicodeObject*)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001201 }
1202
Neal Norwitz3ce5d922008-08-24 07:08:55 +00001203 /* Ensure we won't overflow the size. */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -07001204 if (length > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00001205 return (PyUnicodeObject *)PyErr_NoMemory();
1206 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001207 if (length < 0) {
1208 PyErr_SetString(PyExc_SystemError,
1209 "Negative size passed to _PyUnicode_New");
1210 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001211 }
1212
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001213 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
1214 if (unicode == NULL)
1215 return NULL;
1216 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
Victor Stinner68b674c2013-10-29 19:31:43 +01001217
1218 _PyUnicode_WSTR_LENGTH(unicode) = length;
1219 _PyUnicode_HASH(unicode) = -1;
1220 _PyUnicode_STATE(unicode).interned = 0;
1221 _PyUnicode_STATE(unicode).kind = 0;
1222 _PyUnicode_STATE(unicode).compact = 0;
1223 _PyUnicode_STATE(unicode).ready = 0;
1224 _PyUnicode_STATE(unicode).ascii = 0;
1225 _PyUnicode_DATA_ANY(unicode) = NULL;
1226 _PyUnicode_LENGTH(unicode) = 0;
1227 _PyUnicode_UTF8(unicode) = NULL;
1228 _PyUnicode_UTF8_LENGTH(unicode) = 0;
1229
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001230 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
1231 if (!_PyUnicode_WSTR(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001232 Py_DECREF(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00001233 PyErr_NoMemory();
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001234 return NULL;
Guido van Rossum3c1bb802000-04-27 20:13:50 +00001235 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001236
Jeremy Hyltond8082792003-09-16 19:41:39 +00001237 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +00001238 * the caller fails before initializing str -- unicode_resize()
1239 * reads str[0], and the Keep-Alive optimization can keep memory
1240 * allocated for str alive across a call to unicode_dealloc(unicode).
1241 * We don't want unicode_resize to read uninitialized memory in
1242 * that case.
1243 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001244 _PyUnicode_WSTR(unicode)[0] = 0;
1245 _PyUnicode_WSTR(unicode)[length] = 0;
Victor Stinner68b674c2013-10-29 19:31:43 +01001246
Victor Stinner7931d9a2011-11-04 00:22:48 +01001247 assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001248 return unicode;
1249}
1250
Victor Stinnerf42dc442011-10-02 23:33:16 +02001251static const char*
1252unicode_kind_name(PyObject *unicode)
1253{
Victor Stinner42dfd712011-10-03 14:41:45 +02001254 /* don't check consistency: unicode_kind_name() is called from
1255 _PyUnicode_Dump() */
Victor Stinnerf42dc442011-10-02 23:33:16 +02001256 if (!PyUnicode_IS_COMPACT(unicode))
1257 {
1258 if (!PyUnicode_IS_READY(unicode))
1259 return "wstr";
Benjamin Petersonead6b532011-12-20 17:23:42 -06001260 switch (PyUnicode_KIND(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001261 {
1262 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001263 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001264 return "legacy ascii";
1265 else
1266 return "legacy latin1";
1267 case PyUnicode_2BYTE_KIND:
1268 return "legacy UCS2";
1269 case PyUnicode_4BYTE_KIND:
1270 return "legacy UCS4";
1271 default:
1272 return "<legacy invalid kind>";
1273 }
1274 }
1275 assert(PyUnicode_IS_READY(unicode));
Benjamin Petersonead6b532011-12-20 17:23:42 -06001276 switch (PyUnicode_KIND(unicode)) {
Victor Stinnerf42dc442011-10-02 23:33:16 +02001277 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001278 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001279 return "ascii";
1280 else
Victor Stinnera3b334d2011-10-03 13:53:37 +02001281 return "latin1";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001282 case PyUnicode_2BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001283 return "UCS2";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001284 case PyUnicode_4BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001285 return "UCS4";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001286 default:
1287 return "<invalid compact kind>";
1288 }
1289}
1290
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001291#ifdef Py_DEBUG
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001292/* Functions wrapping macros for use in debugger */
Victor Stinnera42de742018-11-22 10:25:22 +01001293char *_PyUnicode_utf8(void *unicode_raw){
1294 PyObject *unicode = _PyObject_CAST(unicode_raw);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001295 return PyUnicode_UTF8(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001296}
1297
Victor Stinnera42de742018-11-22 10:25:22 +01001298void *_PyUnicode_compact_data(void *unicode_raw) {
1299 PyObject *unicode = _PyObject_CAST(unicode_raw);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001300 return _PyUnicode_COMPACT_DATA(unicode);
1301}
Victor Stinnera42de742018-11-22 10:25:22 +01001302void *_PyUnicode_data(void *unicode_raw) {
1303 PyObject *unicode = _PyObject_CAST(unicode_raw);
Zackery Spytz1a2252e2019-05-06 10:56:51 -06001304 printf("obj %p\n", (void*)unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001305 printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
1306 printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
1307 printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
1308 printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
1309 printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
1310 return PyUnicode_DATA(unicode);
1311}
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001312
1313void
1314_PyUnicode_Dump(PyObject *op)
1315{
1316 PyASCIIObject *ascii = (PyASCIIObject *)op;
Victor Stinnera849a4b2011-10-03 12:12:11 +02001317 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
1318 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
1319 void *data;
Victor Stinner0d60e872011-10-23 19:47:19 +02001320
Victor Stinnera849a4b2011-10-03 12:12:11 +02001321 if (ascii->state.compact)
Victor Stinner0d60e872011-10-23 19:47:19 +02001322 {
1323 if (ascii->state.ascii)
1324 data = (ascii + 1);
1325 else
1326 data = (compact + 1);
1327 }
Victor Stinnera849a4b2011-10-03 12:12:11 +02001328 else
1329 data = unicode->data.any;
Victor Stinner293f3f52014-07-01 08:57:10 +02001330 printf("%s: len=%" PY_FORMAT_SIZE_T "u, ",
1331 unicode_kind_name(op), ascii->length);
Victor Stinner0d60e872011-10-23 19:47:19 +02001332
Victor Stinnera849a4b2011-10-03 12:12:11 +02001333 if (ascii->wstr == data)
1334 printf("shared ");
Zackery Spytz1a2252e2019-05-06 10:56:51 -06001335 printf("wstr=%p", (void *)ascii->wstr);
Victor Stinner0d60e872011-10-23 19:47:19 +02001336
Victor Stinnera3b334d2011-10-03 13:53:37 +02001337 if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
Victor Stinner293f3f52014-07-01 08:57:10 +02001338 printf(" (%" PY_FORMAT_SIZE_T "u), ", compact->wstr_length);
Victor Stinnera849a4b2011-10-03 12:12:11 +02001339 if (!ascii->state.compact && compact->utf8 == unicode->data.any)
1340 printf("shared ");
Victor Stinner293f3f52014-07-01 08:57:10 +02001341 printf("utf8=%p (%" PY_FORMAT_SIZE_T "u)",
Zackery Spytz1a2252e2019-05-06 10:56:51 -06001342 (void *)compact->utf8, compact->utf8_length);
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001343 }
Victor Stinnera849a4b2011-10-03 12:12:11 +02001344 printf(", data=%p\n", data);
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001345}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001346#endif
1347
1348PyObject *
1349PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
1350{
1351 PyObject *obj;
1352 PyCompactUnicodeObject *unicode;
1353 void *data;
Victor Stinner8f825062012-04-27 13:55:39 +02001354 enum PyUnicode_Kind kind;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001355 int is_sharing, is_ascii;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001356 Py_ssize_t char_size;
1357 Py_ssize_t struct_size;
1358
1359 /* Optimization for empty strings */
1360 if (size == 0 && unicode_empty != NULL) {
1361 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001362 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001363 }
1364
Victor Stinner9e9d6892011-10-04 01:02:02 +02001365 is_ascii = 0;
1366 is_sharing = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001367 struct_size = sizeof(PyCompactUnicodeObject);
1368 if (maxchar < 128) {
Victor Stinner8f825062012-04-27 13:55:39 +02001369 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001370 char_size = 1;
1371 is_ascii = 1;
1372 struct_size = sizeof(PyASCIIObject);
1373 }
1374 else if (maxchar < 256) {
Victor Stinner8f825062012-04-27 13:55:39 +02001375 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001376 char_size = 1;
1377 }
1378 else if (maxchar < 65536) {
Victor Stinner8f825062012-04-27 13:55:39 +02001379 kind = PyUnicode_2BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001380 char_size = 2;
1381 if (sizeof(wchar_t) == 2)
1382 is_sharing = 1;
1383 }
1384 else {
Victor Stinnerc9590ad2012-03-04 01:34:37 +01001385 if (maxchar > MAX_UNICODE) {
1386 PyErr_SetString(PyExc_SystemError,
1387 "invalid maximum character passed to PyUnicode_New");
1388 return NULL;
1389 }
Victor Stinner8f825062012-04-27 13:55:39 +02001390 kind = PyUnicode_4BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001391 char_size = 4;
1392 if (sizeof(wchar_t) == 4)
1393 is_sharing = 1;
1394 }
1395
1396 /* Ensure we won't overflow the size. */
1397 if (size < 0) {
1398 PyErr_SetString(PyExc_SystemError,
1399 "Negative size passed to PyUnicode_New");
1400 return NULL;
1401 }
1402 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1403 return PyErr_NoMemory();
1404
1405 /* Duplicated allocation code from _PyObject_New() instead of a call to
1406 * PyObject_New() so we are able to allocate space for the object and
1407 * it's data buffer.
1408 */
1409 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
1410 if (obj == NULL)
1411 return PyErr_NoMemory();
1412 obj = PyObject_INIT(obj, &PyUnicode_Type);
1413 if (obj == NULL)
1414 return NULL;
1415
1416 unicode = (PyCompactUnicodeObject *)obj;
1417 if (is_ascii)
1418 data = ((PyASCIIObject*)obj) + 1;
1419 else
1420 data = unicode + 1;
1421 _PyUnicode_LENGTH(unicode) = size;
1422 _PyUnicode_HASH(unicode) = -1;
1423 _PyUnicode_STATE(unicode).interned = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001424 _PyUnicode_STATE(unicode).kind = kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001425 _PyUnicode_STATE(unicode).compact = 1;
1426 _PyUnicode_STATE(unicode).ready = 1;
1427 _PyUnicode_STATE(unicode).ascii = is_ascii;
1428 if (is_ascii) {
1429 ((char*)data)[size] = 0;
1430 _PyUnicode_WSTR(unicode) = NULL;
1431 }
Victor Stinner8f825062012-04-27 13:55:39 +02001432 else if (kind == PyUnicode_1BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001433 ((char*)data)[size] = 0;
1434 _PyUnicode_WSTR(unicode) = NULL;
1435 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001436 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001437 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001438 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001439 else {
1440 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001441 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001442 if (kind == PyUnicode_2BYTE_KIND)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001443 ((Py_UCS2*)data)[size] = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001444 else /* kind == PyUnicode_4BYTE_KIND */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001445 ((Py_UCS4*)data)[size] = 0;
1446 if (is_sharing) {
1447 _PyUnicode_WSTR_LENGTH(unicode) = size;
1448 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
1449 }
1450 else {
1451 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1452 _PyUnicode_WSTR(unicode) = NULL;
1453 }
1454 }
Victor Stinner8f825062012-04-27 13:55:39 +02001455#ifdef Py_DEBUG
Victor Stinnerafffce42012-10-03 23:03:17 +02001456 unicode_fill_invalid((PyObject*)unicode, 0);
Victor Stinner8f825062012-04-27 13:55:39 +02001457#endif
Victor Stinner7931d9a2011-11-04 00:22:48 +01001458 assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001459 return obj;
1460}
1461
1462#if SIZEOF_WCHAR_T == 2
1463/* Helper function to convert a 16-bits wchar_t representation to UCS4, this
1464 will decode surrogate pairs, the other conversions are implemented as macros
Georg Brandl7597add2011-10-05 16:36:47 +02001465 for efficiency.
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001466
1467 This function assumes that unicode can hold one more code point than wstr
1468 characters for a terminating null character. */
Victor Stinnerc53be962011-10-02 21:33:54 +02001469static void
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001470unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001471 PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001472{
1473 const wchar_t *iter;
1474 Py_UCS4 *ucs4_out;
1475
Victor Stinner910337b2011-10-03 03:20:16 +02001476 assert(unicode != NULL);
1477 assert(_PyUnicode_CHECK(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001478 assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
1479 ucs4_out = PyUnicode_4BYTE_DATA(unicode);
1480
1481 for (iter = begin; iter < end; ) {
1482 assert(ucs4_out < (PyUnicode_4BYTE_DATA(unicode) +
1483 _PyUnicode_GET_LENGTH(unicode)));
Victor Stinner551ac952011-11-29 22:58:13 +01001484 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1485 && (iter+1) < end
1486 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001487 {
Victor Stinner551ac952011-11-29 22:58:13 +01001488 *ucs4_out++ = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001489 iter += 2;
1490 }
1491 else {
1492 *ucs4_out++ = *iter;
1493 iter++;
1494 }
1495 }
1496 assert(ucs4_out == (PyUnicode_4BYTE_DATA(unicode) +
1497 _PyUnicode_GET_LENGTH(unicode)));
1498
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001499}
1500#endif
1501
Victor Stinnercd9950f2011-10-02 00:34:53 +02001502static int
Victor Stinner488fa492011-12-12 00:01:39 +01001503unicode_check_modifiable(PyObject *unicode)
Victor Stinnercd9950f2011-10-02 00:34:53 +02001504{
Victor Stinner488fa492011-12-12 00:01:39 +01001505 if (!unicode_modifiable(unicode)) {
Victor Stinner01698042011-10-04 00:04:26 +02001506 PyErr_SetString(PyExc_SystemError,
Victor Stinner488fa492011-12-12 00:01:39 +01001507 "Cannot modify a string currently used");
Victor Stinnercd9950f2011-10-02 00:34:53 +02001508 return -1;
1509 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02001510 return 0;
1511}
1512
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001513static int
1514_copy_characters(PyObject *to, Py_ssize_t to_start,
1515 PyObject *from, Py_ssize_t from_start,
1516 Py_ssize_t how_many, int check_maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001517{
Victor Stinnera0702ab2011-09-29 14:14:38 +02001518 unsigned int from_kind, to_kind;
1519 void *from_data, *to_data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001520
Victor Stinneree4544c2012-05-09 22:24:08 +02001521 assert(0 <= how_many);
1522 assert(0 <= from_start);
1523 assert(0 <= to_start);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001524 assert(PyUnicode_Check(from));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001525 assert(PyUnicode_IS_READY(from));
Victor Stinneree4544c2012-05-09 22:24:08 +02001526 assert(from_start + how_many <= PyUnicode_GET_LENGTH(from));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001527
Victor Stinnerd3f08822012-05-29 12:57:52 +02001528 assert(PyUnicode_Check(to));
1529 assert(PyUnicode_IS_READY(to));
1530 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1531
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001532 if (how_many == 0)
1533 return 0;
1534
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001535 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001536 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001537 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001538 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001539
Victor Stinnerf1852262012-06-16 16:38:26 +02001540#ifdef Py_DEBUG
1541 if (!check_maxchar
1542 && PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to))
1543 {
1544 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1545 Py_UCS4 ch;
1546 Py_ssize_t i;
1547 for (i=0; i < how_many; i++) {
1548 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1549 assert(ch <= to_maxchar);
1550 }
1551 }
1552#endif
1553
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001554 if (from_kind == to_kind) {
Victor Stinnerf1852262012-06-16 16:38:26 +02001555 if (check_maxchar
1556 && !PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))
1557 {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001558 /* Writing Latin-1 characters into an ASCII string requires to
1559 check that all written characters are pure ASCII */
Victor Stinnerf1852262012-06-16 16:38:26 +02001560 Py_UCS4 max_char;
1561 max_char = ucs1lib_find_max_char(from_data,
1562 (Py_UCS1*)from_data + how_many);
1563 if (max_char >= 128)
1564 return -1;
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001565 }
Christian Heimesf051e432016-09-13 20:22:02 +02001566 memcpy((char*)to_data + to_kind * to_start,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001567 (char*)from_data + from_kind * from_start,
1568 to_kind * how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001569 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001570 else if (from_kind == PyUnicode_1BYTE_KIND
1571 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001572 {
1573 _PyUnicode_CONVERT_BYTES(
1574 Py_UCS1, Py_UCS2,
1575 PyUnicode_1BYTE_DATA(from) + from_start,
1576 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1577 PyUnicode_2BYTE_DATA(to) + to_start
1578 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001579 }
Victor Stinner157f83f2011-09-28 21:41:31 +02001580 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001581 && to_kind == PyUnicode_4BYTE_KIND)
1582 {
1583 _PyUnicode_CONVERT_BYTES(
1584 Py_UCS1, Py_UCS4,
1585 PyUnicode_1BYTE_DATA(from) + from_start,
1586 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1587 PyUnicode_4BYTE_DATA(to) + to_start
1588 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001589 }
1590 else if (from_kind == PyUnicode_2BYTE_KIND
1591 && to_kind == PyUnicode_4BYTE_KIND)
1592 {
1593 _PyUnicode_CONVERT_BYTES(
1594 Py_UCS2, Py_UCS4,
1595 PyUnicode_2BYTE_DATA(from) + from_start,
1596 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1597 PyUnicode_4BYTE_DATA(to) + to_start
1598 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001599 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001600 else {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001601 assert (PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to));
1602
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001603 if (!check_maxchar) {
1604 if (from_kind == PyUnicode_2BYTE_KIND
1605 && to_kind == PyUnicode_1BYTE_KIND)
1606 {
1607 _PyUnicode_CONVERT_BYTES(
1608 Py_UCS2, Py_UCS1,
1609 PyUnicode_2BYTE_DATA(from) + from_start,
1610 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1611 PyUnicode_1BYTE_DATA(to) + to_start
1612 );
1613 }
1614 else if (from_kind == PyUnicode_4BYTE_KIND
1615 && to_kind == PyUnicode_1BYTE_KIND)
1616 {
1617 _PyUnicode_CONVERT_BYTES(
1618 Py_UCS4, Py_UCS1,
1619 PyUnicode_4BYTE_DATA(from) + from_start,
1620 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1621 PyUnicode_1BYTE_DATA(to) + to_start
1622 );
1623 }
1624 else if (from_kind == PyUnicode_4BYTE_KIND
1625 && to_kind == PyUnicode_2BYTE_KIND)
1626 {
1627 _PyUnicode_CONVERT_BYTES(
1628 Py_UCS4, Py_UCS2,
1629 PyUnicode_4BYTE_DATA(from) + from_start,
1630 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1631 PyUnicode_2BYTE_DATA(to) + to_start
1632 );
1633 }
1634 else {
Barry Warsawb2e57942017-09-14 18:13:16 -07001635 Py_UNREACHABLE();
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001636 }
1637 }
Victor Stinnerf1852262012-06-16 16:38:26 +02001638 else {
Victor Stinnera0702ab2011-09-29 14:14:38 +02001639 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001640 Py_UCS4 ch;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001641 Py_ssize_t i;
1642
Victor Stinnera0702ab2011-09-29 14:14:38 +02001643 for (i=0; i < how_many; i++) {
1644 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001645 if (ch > to_maxchar)
1646 return -1;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001647 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1648 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001649 }
1650 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001651 return 0;
1652}
1653
Victor Stinnerd3f08822012-05-29 12:57:52 +02001654void
1655_PyUnicode_FastCopyCharacters(
1656 PyObject *to, Py_ssize_t to_start,
1657 PyObject *from, Py_ssize_t from_start, Py_ssize_t how_many)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001658{
1659 (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1660}
1661
1662Py_ssize_t
1663PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1664 PyObject *from, Py_ssize_t from_start,
1665 Py_ssize_t how_many)
1666{
1667 int err;
1668
1669 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1670 PyErr_BadInternalCall();
1671 return -1;
1672 }
1673
Benjamin Petersonbac79492012-01-14 13:34:47 -05001674 if (PyUnicode_READY(from) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001675 return -1;
Benjamin Petersonbac79492012-01-14 13:34:47 -05001676 if (PyUnicode_READY(to) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001677 return -1;
1678
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001679 if ((size_t)from_start > (size_t)PyUnicode_GET_LENGTH(from)) {
Victor Stinnerd3f08822012-05-29 12:57:52 +02001680 PyErr_SetString(PyExc_IndexError, "string index out of range");
1681 return -1;
1682 }
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001683 if ((size_t)to_start > (size_t)PyUnicode_GET_LENGTH(to)) {
Victor Stinnerd3f08822012-05-29 12:57:52 +02001684 PyErr_SetString(PyExc_IndexError, "string index out of range");
1685 return -1;
1686 }
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001687 if (how_many < 0) {
1688 PyErr_SetString(PyExc_SystemError, "how_many cannot be negative");
1689 return -1;
1690 }
1691 how_many = Py_MIN(PyUnicode_GET_LENGTH(from)-from_start, how_many);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001692 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1693 PyErr_Format(PyExc_SystemError,
Victor Stinnera33bce02014-07-04 22:47:46 +02001694 "Cannot write %zi characters at %zi "
1695 "in a string of %zi characters",
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001696 how_many, to_start, PyUnicode_GET_LENGTH(to));
1697 return -1;
1698 }
1699
1700 if (how_many == 0)
1701 return 0;
1702
Victor Stinner488fa492011-12-12 00:01:39 +01001703 if (unicode_check_modifiable(to))
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001704 return -1;
1705
1706 err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1707 if (err) {
1708 PyErr_Format(PyExc_SystemError,
1709 "Cannot copy %s characters "
1710 "into a string of %s characters",
1711 unicode_kind_name(from),
1712 unicode_kind_name(to));
1713 return -1;
1714 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001715 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001716}
1717
Victor Stinner17222162011-09-28 22:15:37 +02001718/* Find the maximum code point and count the number of surrogate pairs so a
1719 correct string length can be computed before converting a string to UCS4.
1720 This function counts single surrogates as a character and not as a pair.
1721
1722 Return 0 on success, or -1 on error. */
1723static int
1724find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1725 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001726{
1727 const wchar_t *iter;
Victor Stinner8faf8212011-12-08 22:14:11 +01001728 Py_UCS4 ch;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001729
Victor Stinnerc53be962011-10-02 21:33:54 +02001730 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001731 *num_surrogates = 0;
1732 *maxchar = 0;
1733
1734 for (iter = begin; iter < end; ) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001735#if SIZEOF_WCHAR_T == 2
Victor Stinnercf77da92013-03-06 01:09:24 +01001736 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1737 && (iter+1) < end
1738 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
1739 {
1740 ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
1741 ++(*num_surrogates);
1742 iter += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001743 }
1744 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001745#endif
Victor Stinner8faf8212011-12-08 22:14:11 +01001746 {
1747 ch = *iter;
1748 iter++;
1749 }
1750 if (ch > *maxchar) {
1751 *maxchar = ch;
1752 if (*maxchar > MAX_UNICODE) {
1753 PyErr_Format(PyExc_ValueError,
1754 "character U+%x is not in range [U+0000; U+10ffff]",
1755 ch);
1756 return -1;
1757 }
1758 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001759 }
1760 return 0;
1761}
1762
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001763int
1764_PyUnicode_Ready(PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001765{
1766 wchar_t *end;
1767 Py_UCS4 maxchar = 0;
1768 Py_ssize_t num_surrogates;
1769#if SIZEOF_WCHAR_T == 2
1770 Py_ssize_t length_wo_surrogates;
1771#endif
1772
Georg Brandl7597add2011-10-05 16:36:47 +02001773 /* _PyUnicode_Ready() is only intended for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +02001774 strings were created using _PyObject_New() and where no canonical
1775 representation (the str field) has been set yet aka strings
1776 which are not yet ready. */
Victor Stinner910337b2011-10-03 03:20:16 +02001777 assert(_PyUnicode_CHECK(unicode));
1778 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001779 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerc3c74152011-10-02 20:39:55 +02001780 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001781 assert(_PyUnicode_UTF8(unicode) == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +02001782 /* Actually, it should neither be interned nor be anything else: */
1783 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001784
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001785 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +02001786 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +02001787 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001788 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001789
1790 if (maxchar < 256) {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001791 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1792 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001793 PyErr_NoMemory();
1794 return -1;
1795 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001796 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001797 _PyUnicode_WSTR(unicode), end,
1798 PyUnicode_1BYTE_DATA(unicode));
1799 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1800 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1801 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1802 if (maxchar < 128) {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001803 _PyUnicode_STATE(unicode).ascii = 1;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001804 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001805 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001806 }
1807 else {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001808 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001809 _PyUnicode_UTF8(unicode) = NULL;
1810 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001811 }
1812 PyObject_FREE(_PyUnicode_WSTR(unicode));
1813 _PyUnicode_WSTR(unicode) = NULL;
1814 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1815 }
1816 /* In this case we might have to convert down from 4-byte native
1817 wchar_t to 2-byte unicode. */
1818 else if (maxchar < 65536) {
1819 assert(num_surrogates == 0 &&
1820 "FindMaxCharAndNumSurrogatePairs() messed up");
1821
Victor Stinner506f5922011-09-28 22:34:18 +02001822#if SIZEOF_WCHAR_T == 2
1823 /* We can share representations and are done. */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001824 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Victor Stinner506f5922011-09-28 22:34:18 +02001825 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1826 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1827 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001828 _PyUnicode_UTF8(unicode) = NULL;
1829 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001830#else
1831 /* sizeof(wchar_t) == 4 */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001832 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
Victor Stinner506f5922011-09-28 22:34:18 +02001833 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
Victor Stinnerc3c74152011-10-02 20:39:55 +02001834 if (!_PyUnicode_DATA_ANY(unicode)) {
Victor Stinner506f5922011-09-28 22:34:18 +02001835 PyErr_NoMemory();
1836 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001837 }
Victor Stinner506f5922011-09-28 22:34:18 +02001838 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1839 _PyUnicode_WSTR(unicode), end,
1840 PyUnicode_2BYTE_DATA(unicode));
1841 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1842 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1843 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001844 _PyUnicode_UTF8(unicode) = NULL;
1845 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001846 PyObject_FREE(_PyUnicode_WSTR(unicode));
1847 _PyUnicode_WSTR(unicode) = NULL;
1848 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1849#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001850 }
1851 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
1852 else {
1853#if SIZEOF_WCHAR_T == 2
1854 /* in case the native representation is 2-bytes, we need to allocate a
1855 new normalized 4-byte version. */
1856 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
Serhiy Storchakae55181f2015-02-20 21:34:06 +02001857 if (length_wo_surrogates > PY_SSIZE_T_MAX / 4 - 1) {
1858 PyErr_NoMemory();
1859 return -1;
1860 }
Victor Stinnerc3c74152011-10-02 20:39:55 +02001861 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1862 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001863 PyErr_NoMemory();
1864 return -1;
1865 }
1866 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1867 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001868 _PyUnicode_UTF8(unicode) = NULL;
1869 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner126c5592011-10-03 04:17:10 +02001870 /* unicode_convert_wchar_to_ucs4() requires a ready string */
1871 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerc53be962011-10-02 21:33:54 +02001872 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001873 PyObject_FREE(_PyUnicode_WSTR(unicode));
1874 _PyUnicode_WSTR(unicode) = NULL;
1875 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1876#else
1877 assert(num_surrogates == 0);
1878
Victor Stinnerc3c74152011-10-02 20:39:55 +02001879 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001880 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001881 _PyUnicode_UTF8(unicode) = NULL;
1882 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001883 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1884#endif
1885 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1886 }
1887 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001888 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001889 return 0;
1890}
1891
Alexander Belopolsky40018472011-02-26 01:02:56 +00001892static void
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001893unicode_dealloc(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001894{
Walter Dörwald16807132007-05-25 13:52:07 +00001895 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001896 case SSTATE_NOT_INTERNED:
1897 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001898
Benjamin Peterson29060642009-01-31 22:14:21 +00001899 case SSTATE_INTERNED_MORTAL:
1900 /* revive dead object temporarily for DelItem */
1901 Py_REFCNT(unicode) = 3;
Victor Stinner7931d9a2011-11-04 00:22:48 +01001902 if (PyDict_DelItem(interned, unicode) != 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00001903 Py_FatalError(
1904 "deletion of interned string failed");
1905 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001906
Benjamin Peterson29060642009-01-31 22:14:21 +00001907 case SSTATE_INTERNED_IMMORTAL:
1908 Py_FatalError("Immortal interned string died.");
Stefan Krahf432a322017-08-21 13:09:59 +02001909 /* fall through */
Walter Dörwald16807132007-05-25 13:52:07 +00001910
Benjamin Peterson29060642009-01-31 22:14:21 +00001911 default:
1912 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +00001913 }
1914
Victor Stinner03490912011-10-03 23:45:12 +02001915 if (_PyUnicode_HAS_WSTR_MEMORY(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001916 PyObject_DEL(_PyUnicode_WSTR(unicode));
Victor Stinner829c0ad2011-10-03 01:08:02 +02001917 if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001918 PyObject_DEL(_PyUnicode_UTF8(unicode));
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001919 if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode))
1920 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001921
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001922 Py_TYPE(unicode)->tp_free(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001923}
1924
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001925#ifdef Py_DEBUG
1926static int
1927unicode_is_singleton(PyObject *unicode)
1928{
1929 PyASCIIObject *ascii = (PyASCIIObject *)unicode;
1930 if (unicode == unicode_empty)
1931 return 1;
1932 if (ascii->state.kind != PyUnicode_WCHAR_KIND && ascii->length == 1)
1933 {
1934 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0);
1935 if (ch < 256 && unicode_latin1[ch] == unicode)
1936 return 1;
1937 }
1938 return 0;
1939}
1940#endif
1941
Alexander Belopolsky40018472011-02-26 01:02:56 +00001942static int
Victor Stinner488fa492011-12-12 00:01:39 +01001943unicode_modifiable(PyObject *unicode)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001944{
Victor Stinner488fa492011-12-12 00:01:39 +01001945 assert(_PyUnicode_CHECK(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001946 if (Py_REFCNT(unicode) != 1)
1947 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001948 if (_PyUnicode_HASH(unicode) != -1)
1949 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001950 if (PyUnicode_CHECK_INTERNED(unicode))
1951 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001952 if (!PyUnicode_CheckExact(unicode))
1953 return 0;
Victor Stinner77bb47b2011-10-03 20:06:05 +02001954#ifdef Py_DEBUG
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001955 /* singleton refcount is greater than 1 */
1956 assert(!unicode_is_singleton(unicode));
Victor Stinner77bb47b2011-10-03 20:06:05 +02001957#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001958 return 1;
1959}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001960
Victor Stinnerfe226c02011-10-03 03:52:20 +02001961static int
1962unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1963{
1964 PyObject *unicode;
1965 Py_ssize_t old_length;
1966
1967 assert(p_unicode != NULL);
1968 unicode = *p_unicode;
1969
1970 assert(unicode != NULL);
1971 assert(PyUnicode_Check(unicode));
1972 assert(0 <= length);
1973
Victor Stinner910337b2011-10-03 03:20:16 +02001974 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001975 old_length = PyUnicode_WSTR_LENGTH(unicode);
1976 else
1977 old_length = PyUnicode_GET_LENGTH(unicode);
1978 if (old_length == length)
1979 return 0;
1980
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001981 if (length == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +02001982 _Py_INCREF_UNICODE_EMPTY();
1983 if (!unicode_empty)
Benjamin Peterson29060642009-01-31 22:14:21 +00001984 return -1;
Serhiy Storchaka57a01d32016-04-10 18:05:40 +03001985 Py_SETREF(*p_unicode, unicode_empty);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001986 return 0;
1987 }
1988
Victor Stinner488fa492011-12-12 00:01:39 +01001989 if (!unicode_modifiable(unicode)) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001990 PyObject *copy = resize_copy(unicode, length);
1991 if (copy == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001992 return -1;
Serhiy Storchaka57a01d32016-04-10 18:05:40 +03001993 Py_SETREF(*p_unicode, copy);
Benjamin Peterson29060642009-01-31 22:14:21 +00001994 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001995 }
1996
Victor Stinnerfe226c02011-10-03 03:52:20 +02001997 if (PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001998 PyObject *new_unicode = resize_compact(unicode, length);
1999 if (new_unicode == NULL)
Victor Stinnerfe226c02011-10-03 03:52:20 +02002000 return -1;
Victor Stinnerb0a82a62011-12-12 13:08:33 +01002001 *p_unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +02002002 return 0;
Benjamin Peterson4bfce8f2011-10-03 19:35:07 -04002003 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002004 return resize_inplace(unicode, length);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002005}
2006
Alexander Belopolsky40018472011-02-26 01:02:56 +00002007int
Victor Stinnerfe226c02011-10-03 03:52:20 +02002008PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00002009{
Victor Stinnerfe226c02011-10-03 03:52:20 +02002010 PyObject *unicode;
2011 if (p_unicode == NULL) {
2012 PyErr_BadInternalCall();
2013 return -1;
2014 }
2015 unicode = *p_unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01002016 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
Victor Stinnerfe226c02011-10-03 03:52:20 +02002017 {
2018 PyErr_BadInternalCall();
2019 return -1;
2020 }
2021 return unicode_resize(p_unicode, length);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00002022}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002023
Serhiy Storchakad65c9492015-11-02 14:10:23 +02002024/* Copy an ASCII or latin1 char* string into a Python Unicode string.
Victor Stinnerc5166102012-02-22 13:55:02 +01002025
Victor Stinnerb429d3b2012-02-22 21:22:20 +01002026 WARNING: The function doesn't copy the terminating null character and
2027 doesn't check the maximum character (may write a latin1 character in an
2028 ASCII string). */
Victor Stinner184252a2012-06-16 02:57:41 +02002029static void
2030unicode_write_cstr(PyObject *unicode, Py_ssize_t index,
2031 const char *str, Py_ssize_t len)
Victor Stinnerc5166102012-02-22 13:55:02 +01002032{
2033 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
2034 void *data = PyUnicode_DATA(unicode);
Victor Stinner184252a2012-06-16 02:57:41 +02002035 const char *end = str + len;
Victor Stinnerc5166102012-02-22 13:55:02 +01002036
2037 switch (kind) {
2038 case PyUnicode_1BYTE_KIND: {
Victor Stinnerc5166102012-02-22 13:55:02 +01002039 assert(index + len <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner8c6db452012-10-06 00:40:45 +02002040#ifdef Py_DEBUG
2041 if (PyUnicode_IS_ASCII(unicode)) {
2042 Py_UCS4 maxchar = ucs1lib_find_max_char(
2043 (const Py_UCS1*)str,
2044 (const Py_UCS1*)str + len);
2045 assert(maxchar < 128);
2046 }
2047#endif
Antoine Pitrouba6bafc2012-02-22 16:41:50 +01002048 memcpy((char *) data + index, str, len);
Victor Stinner184252a2012-06-16 02:57:41 +02002049 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01002050 }
2051 case PyUnicode_2BYTE_KIND: {
2052 Py_UCS2 *start = (Py_UCS2 *)data + index;
2053 Py_UCS2 *ucs2 = start;
2054 assert(index <= PyUnicode_GET_LENGTH(unicode));
2055
Victor Stinner184252a2012-06-16 02:57:41 +02002056 for (; str < end; ++ucs2, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01002057 *ucs2 = (Py_UCS2)*str;
2058
2059 assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner184252a2012-06-16 02:57:41 +02002060 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01002061 }
2062 default: {
2063 Py_UCS4 *start = (Py_UCS4 *)data + index;
2064 Py_UCS4 *ucs4 = start;
2065 assert(kind == PyUnicode_4BYTE_KIND);
2066 assert(index <= PyUnicode_GET_LENGTH(unicode));
2067
Victor Stinner184252a2012-06-16 02:57:41 +02002068 for (; str < end; ++ucs4, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01002069 *ucs4 = (Py_UCS4)*str;
2070
2071 assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinnerc5166102012-02-22 13:55:02 +01002072 }
2073 }
2074}
2075
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002076static PyObject*
2077get_latin1_char(unsigned char ch)
2078{
Victor Stinnera464fc12011-10-02 20:39:30 +02002079 PyObject *unicode = unicode_latin1[ch];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002080 if (!unicode) {
Victor Stinnera464fc12011-10-02 20:39:30 +02002081 unicode = PyUnicode_New(1, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002082 if (!unicode)
2083 return NULL;
2084 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002085 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002086 unicode_latin1[ch] = unicode;
2087 }
2088 Py_INCREF(unicode);
Victor Stinnera464fc12011-10-02 20:39:30 +02002089 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002090}
2091
Victor Stinner985a82a2014-01-03 12:53:47 +01002092static PyObject*
2093unicode_char(Py_UCS4 ch)
2094{
2095 PyObject *unicode;
2096
2097 assert(ch <= MAX_UNICODE);
2098
Victor Stinnerf3b46b42014-01-03 13:16:00 +01002099 if (ch < 256)
2100 return get_latin1_char(ch);
2101
Victor Stinner985a82a2014-01-03 12:53:47 +01002102 unicode = PyUnicode_New(1, ch);
2103 if (unicode == NULL)
2104 return NULL;
Serhiy Storchaka2e58f1a2016-10-09 23:44:48 +03002105
2106 assert(PyUnicode_KIND(unicode) != PyUnicode_1BYTE_KIND);
2107 if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
Victor Stinner985a82a2014-01-03 12:53:47 +01002108 PyUnicode_2BYTE_DATA(unicode)[0] = (Py_UCS2)ch;
Serhiy Storchaka2e58f1a2016-10-09 23:44:48 +03002109 } else {
Victor Stinner985a82a2014-01-03 12:53:47 +01002110 assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
2111 PyUnicode_4BYTE_DATA(unicode)[0] = ch;
2112 }
2113 assert(_PyUnicode_CheckConsistency(unicode, 1));
2114 return unicode;
2115}
2116
Alexander Belopolsky40018472011-02-26 01:02:56 +00002117PyObject *
2118PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002119{
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02002120 if (u == NULL)
2121 return (PyObject*)_PyUnicode_New(size);
2122
2123 if (size < 0) {
2124 PyErr_BadInternalCall();
2125 return NULL;
2126 }
2127
2128 return PyUnicode_FromWideChar(u, size);
2129}
2130
2131PyObject *
2132PyUnicode_FromWideChar(const wchar_t *u, Py_ssize_t size)
2133{
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002134 PyObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002135 Py_UCS4 maxchar = 0;
2136 Py_ssize_t num_surrogates;
2137
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02002138 if (u == NULL && size != 0) {
2139 PyErr_BadInternalCall();
2140 return NULL;
2141 }
2142
2143 if (size == -1) {
2144 size = wcslen(u);
2145 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002146
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002147 /* If the Unicode data is known at construction time, we can apply
2148 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002149
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002150 /* Optimization for empty strings */
Serhiy Storchaka678db842013-01-26 12:16:36 +02002151 if (size == 0)
2152 _Py_RETURN_UNICODE_EMPTY();
Tim Petersced69f82003-09-16 20:30:58 +00002153
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002154 /* Single character Unicode objects in the Latin-1 range are
2155 shared when using this constructor */
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002156 if (size == 1 && (Py_UCS4)*u < 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002157 return get_latin1_char((unsigned char)*u);
2158
2159 /* If not empty and not single character, copy the Unicode data
2160 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02002161 if (find_maxchar_surrogates(u, u + size,
2162 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002163 return NULL;
2164
Victor Stinner8faf8212011-12-08 22:14:11 +01002165 unicode = PyUnicode_New(size - num_surrogates, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002166 if (!unicode)
2167 return NULL;
2168
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002169 switch (PyUnicode_KIND(unicode)) {
2170 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02002171 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002172 u, u + size, PyUnicode_1BYTE_DATA(unicode));
2173 break;
2174 case PyUnicode_2BYTE_KIND:
2175#if Py_UNICODE_SIZE == 2
Christian Heimesf051e432016-09-13 20:22:02 +02002176 memcpy(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002177#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02002178 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002179 u, u + size, PyUnicode_2BYTE_DATA(unicode));
2180#endif
2181 break;
2182 case PyUnicode_4BYTE_KIND:
2183#if SIZEOF_WCHAR_T == 2
2184 /* This is the only case which has to process surrogates, thus
2185 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02002186 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002187#else
2188 assert(num_surrogates == 0);
Christian Heimesf051e432016-09-13 20:22:02 +02002189 memcpy(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002190#endif
2191 break;
2192 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07002193 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002194 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002195
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002196 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002197}
2198
Alexander Belopolsky40018472011-02-26 01:02:56 +00002199PyObject *
2200PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00002201{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002202 if (size < 0) {
2203 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00002204 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00002205 return NULL;
2206 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002207 if (u != NULL)
2208 return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
2209 else
2210 return (PyObject *)_PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00002211}
2212
Alexander Belopolsky40018472011-02-26 01:02:56 +00002213PyObject *
2214PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00002215{
2216 size_t size = strlen(u);
2217 if (size > PY_SSIZE_T_MAX) {
2218 PyErr_SetString(PyExc_OverflowError, "input too long");
2219 return NULL;
2220 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002221 return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002222}
2223
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002224PyObject *
2225_PyUnicode_FromId(_Py_Identifier *id)
2226{
2227 if (!id->object) {
Victor Stinnerd1cd99b2012-02-07 23:05:55 +01002228 id->object = PyUnicode_DecodeUTF8Stateful(id->string,
2229 strlen(id->string),
2230 NULL, NULL);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002231 if (!id->object)
2232 return NULL;
2233 PyUnicode_InternInPlace(&id->object);
2234 assert(!id->next);
2235 id->next = static_strings;
2236 static_strings = id;
2237 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002238 return id->object;
2239}
2240
2241void
2242_PyUnicode_ClearStaticStrings()
2243{
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002244 _Py_Identifier *tmp, *s = static_strings;
2245 while (s) {
Serhiy Storchaka505ff752014-02-09 13:33:53 +02002246 Py_CLEAR(s->object);
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002247 tmp = s->next;
2248 s->next = NULL;
2249 s = tmp;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002250 }
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002251 static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002252}
2253
Benjamin Peterson0df54292012-03-26 14:50:32 -04002254/* Internal function, doesn't check maximum character */
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002255
Victor Stinnerd3f08822012-05-29 12:57:52 +02002256PyObject*
2257_PyUnicode_FromASCII(const char *buffer, Py_ssize_t size)
Victor Stinner702c7342011-10-05 13:50:52 +02002258{
Victor Stinnerd3f08822012-05-29 12:57:52 +02002259 const unsigned char *s = (const unsigned char *)buffer;
Victor Stinner785938e2011-12-11 20:09:03 +01002260 PyObject *unicode;
Victor Stinnere6b2d442011-12-11 21:54:30 +01002261 if (size == 1) {
Victor Stinner0617b6e2011-10-05 23:26:01 +02002262#ifdef Py_DEBUG
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002263 assert((unsigned char)s[0] < 128);
Victor Stinner0617b6e2011-10-05 23:26:01 +02002264#endif
Antoine Pitrou7c46da72011-10-06 22:07:51 +02002265 return get_latin1_char(s[0]);
Victor Stinnere6b2d442011-12-11 21:54:30 +01002266 }
Victor Stinner785938e2011-12-11 20:09:03 +01002267 unicode = PyUnicode_New(size, 127);
2268 if (!unicode)
Victor Stinner702c7342011-10-05 13:50:52 +02002269 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01002270 memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
2271 assert(_PyUnicode_CheckConsistency(unicode, 1));
2272 return unicode;
Victor Stinner702c7342011-10-05 13:50:52 +02002273}
2274
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002275static Py_UCS4
2276kind_maxchar_limit(unsigned int kind)
2277{
Benjamin Petersonead6b532011-12-20 17:23:42 -06002278 switch (kind) {
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002279 case PyUnicode_1BYTE_KIND:
2280 return 0x80;
2281 case PyUnicode_2BYTE_KIND:
2282 return 0x100;
2283 case PyUnicode_4BYTE_KIND:
2284 return 0x10000;
2285 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07002286 Py_UNREACHABLE();
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002287 }
2288}
2289
Victor Stinner702c7342011-10-05 13:50:52 +02002290static PyObject*
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002291_PyUnicode_FromUCS1(const Py_UCS1* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00002292{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002293 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002294 unsigned char max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002295
Serhiy Storchaka678db842013-01-26 12:16:36 +02002296 if (size == 0)
2297 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002298 assert(size > 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02002299 if (size == 1)
2300 return get_latin1_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002301
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002302 max_char = ucs1lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002303 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002304 if (!res)
2305 return NULL;
2306 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002307 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002308 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00002309}
2310
Victor Stinnere57b1c02011-09-28 22:20:48 +02002311static PyObject*
2312_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002313{
2314 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002315 Py_UCS2 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002316
Serhiy Storchaka678db842013-01-26 12:16:36 +02002317 if (size == 0)
2318 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002319 assert(size > 0);
Victor Stinner985a82a2014-01-03 12:53:47 +01002320 if (size == 1)
2321 return unicode_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002322
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002323 max_char = ucs2lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002324 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002325 if (!res)
2326 return NULL;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002327 if (max_char >= 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002328 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002329 else {
2330 _PyUnicode_CONVERT_BYTES(
2331 Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
2332 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002333 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002334 return res;
2335}
2336
Victor Stinnere57b1c02011-09-28 22:20:48 +02002337static PyObject*
2338_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002339{
2340 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002341 Py_UCS4 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002342
Serhiy Storchaka678db842013-01-26 12:16:36 +02002343 if (size == 0)
2344 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002345 assert(size > 0);
Victor Stinner985a82a2014-01-03 12:53:47 +01002346 if (size == 1)
2347 return unicode_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002348
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002349 max_char = ucs4lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002350 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002351 if (!res)
2352 return NULL;
Antoine Pitrou950468e2011-10-11 22:45:48 +02002353 if (max_char < 256)
2354 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
2355 PyUnicode_1BYTE_DATA(res));
2356 else if (max_char < 0x10000)
2357 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
2358 PyUnicode_2BYTE_DATA(res));
2359 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002360 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002361 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002362 return res;
2363}
2364
2365PyObject*
2366PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
2367{
Victor Stinnercfed46e2011-11-22 01:29:14 +01002368 if (size < 0) {
2369 PyErr_SetString(PyExc_ValueError, "size must be positive");
2370 return NULL;
2371 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002372 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002373 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002374 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002375 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002376 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002377 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002378 return _PyUnicode_FromUCS4(buffer, size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002379 default:
Victor Stinnerb9275c12011-10-05 14:01:42 +02002380 PyErr_SetString(PyExc_SystemError, "invalid kind");
2381 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002382 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002383}
2384
Victor Stinnerece58de2012-04-23 23:36:38 +02002385Py_UCS4
2386_PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end)
2387{
2388 enum PyUnicode_Kind kind;
2389 void *startptr, *endptr;
2390
2391 assert(PyUnicode_IS_READY(unicode));
2392 assert(0 <= start);
2393 assert(end <= PyUnicode_GET_LENGTH(unicode));
2394 assert(start <= end);
2395
2396 if (start == 0 && end == PyUnicode_GET_LENGTH(unicode))
2397 return PyUnicode_MAX_CHAR_VALUE(unicode);
2398
2399 if (start == end)
2400 return 127;
2401
Victor Stinner94d558b2012-04-27 22:26:58 +02002402 if (PyUnicode_IS_ASCII(unicode))
2403 return 127;
2404
Victor Stinnerece58de2012-04-23 23:36:38 +02002405 kind = PyUnicode_KIND(unicode);
Benjamin Petersonf3b7d862012-04-23 18:07:01 -04002406 startptr = PyUnicode_DATA(unicode);
Benjamin Petersonb9f4c9d2012-04-23 21:45:40 -04002407 endptr = (char *)startptr + end * kind;
2408 startptr = (char *)startptr + start * kind;
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002409 switch(kind) {
2410 case PyUnicode_1BYTE_KIND:
2411 return ucs1lib_find_max_char(startptr, endptr);
2412 case PyUnicode_2BYTE_KIND:
2413 return ucs2lib_find_max_char(startptr, endptr);
2414 case PyUnicode_4BYTE_KIND:
2415 return ucs4lib_find_max_char(startptr, endptr);
Victor Stinnerece58de2012-04-23 23:36:38 +02002416 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07002417 Py_UNREACHABLE();
Victor Stinnerece58de2012-04-23 23:36:38 +02002418 }
2419}
2420
Victor Stinner25a4b292011-10-06 12:31:55 +02002421/* Ensure that a string uses the most efficient storage, if it is not the
2422 case: create a new string with of the right kind. Write NULL into *p_unicode
2423 on error. */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02002424static void
Victor Stinner25a4b292011-10-06 12:31:55 +02002425unicode_adjust_maxchar(PyObject **p_unicode)
2426{
2427 PyObject *unicode, *copy;
2428 Py_UCS4 max_char;
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002429 Py_ssize_t len;
Victor Stinner25a4b292011-10-06 12:31:55 +02002430 unsigned int kind;
2431
2432 assert(p_unicode != NULL);
2433 unicode = *p_unicode;
2434 assert(PyUnicode_IS_READY(unicode));
2435 if (PyUnicode_IS_ASCII(unicode))
2436 return;
2437
2438 len = PyUnicode_GET_LENGTH(unicode);
2439 kind = PyUnicode_KIND(unicode);
2440 if (kind == PyUnicode_1BYTE_KIND) {
2441 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002442 max_char = ucs1lib_find_max_char(u, u + len);
2443 if (max_char >= 128)
2444 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002445 }
2446 else if (kind == PyUnicode_2BYTE_KIND) {
2447 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002448 max_char = ucs2lib_find_max_char(u, u + len);
2449 if (max_char >= 256)
2450 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002451 }
2452 else {
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002453 const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
Victor Stinner25a4b292011-10-06 12:31:55 +02002454 assert(kind == PyUnicode_4BYTE_KIND);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002455 max_char = ucs4lib_find_max_char(u, u + len);
2456 if (max_char >= 0x10000)
2457 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002458 }
Victor Stinner25a4b292011-10-06 12:31:55 +02002459 copy = PyUnicode_New(len, max_char);
Victor Stinnerca439ee2012-06-16 03:17:34 +02002460 if (copy != NULL)
2461 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, len);
Victor Stinner25a4b292011-10-06 12:31:55 +02002462 Py_DECREF(unicode);
2463 *p_unicode = copy;
2464}
2465
Victor Stinner034f6cf2011-09-30 02:26:44 +02002466PyObject*
Victor Stinnerbf6e5602011-12-12 01:53:47 +01002467_PyUnicode_Copy(PyObject *unicode)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002468{
Victor Stinner87af4f22011-11-21 23:03:47 +01002469 Py_ssize_t length;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002470 PyObject *copy;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002471
Victor Stinner034f6cf2011-09-30 02:26:44 +02002472 if (!PyUnicode_Check(unicode)) {
2473 PyErr_BadInternalCall();
2474 return NULL;
2475 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05002476 if (PyUnicode_READY(unicode) == -1)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002477 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002478
Victor Stinner87af4f22011-11-21 23:03:47 +01002479 length = PyUnicode_GET_LENGTH(unicode);
2480 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002481 if (!copy)
2482 return NULL;
2483 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
2484
Christian Heimesf051e432016-09-13 20:22:02 +02002485 memcpy(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
Victor Stinner87af4f22011-11-21 23:03:47 +01002486 length * PyUnicode_KIND(unicode));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002487 assert(_PyUnicode_CheckConsistency(copy, 1));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002488 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02002489}
2490
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002491
Victor Stinnerbc603d12011-10-02 01:00:40 +02002492/* Widen Unicode objects to larger buffers. Don't write terminating null
2493 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002494
2495void*
2496_PyUnicode_AsKind(PyObject *s, unsigned int kind)
2497{
Victor Stinnerbc603d12011-10-02 01:00:40 +02002498 Py_ssize_t len;
2499 void *result;
2500 unsigned int skind;
2501
Benjamin Petersonbac79492012-01-14 13:34:47 -05002502 if (PyUnicode_READY(s) == -1)
Victor Stinnerbc603d12011-10-02 01:00:40 +02002503 return NULL;
2504
2505 len = PyUnicode_GET_LENGTH(s);
2506 skind = PyUnicode_KIND(s);
2507 if (skind >= kind) {
Victor Stinner01698042011-10-04 00:04:26 +02002508 PyErr_SetString(PyExc_SystemError, "invalid widening attempt");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002509 return NULL;
2510 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002511 switch (kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02002512 case PyUnicode_2BYTE_KIND:
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002513 result = PyMem_New(Py_UCS2, len);
Victor Stinnerbc603d12011-10-02 01:00:40 +02002514 if (!result)
2515 return PyErr_NoMemory();
2516 assert(skind == PyUnicode_1BYTE_KIND);
2517 _PyUnicode_CONVERT_BYTES(
2518 Py_UCS1, Py_UCS2,
2519 PyUnicode_1BYTE_DATA(s),
2520 PyUnicode_1BYTE_DATA(s) + len,
2521 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002522 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002523 case PyUnicode_4BYTE_KIND:
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002524 result = PyMem_New(Py_UCS4, len);
Victor Stinnerbc603d12011-10-02 01:00:40 +02002525 if (!result)
2526 return PyErr_NoMemory();
2527 if (skind == PyUnicode_2BYTE_KIND) {
2528 _PyUnicode_CONVERT_BYTES(
2529 Py_UCS2, Py_UCS4,
2530 PyUnicode_2BYTE_DATA(s),
2531 PyUnicode_2BYTE_DATA(s) + len,
2532 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002533 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02002534 else {
2535 assert(skind == PyUnicode_1BYTE_KIND);
2536 _PyUnicode_CONVERT_BYTES(
2537 Py_UCS1, Py_UCS4,
2538 PyUnicode_1BYTE_DATA(s),
2539 PyUnicode_1BYTE_DATA(s) + len,
2540 result);
2541 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002542 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002543 default:
2544 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002545 }
Victor Stinner01698042011-10-04 00:04:26 +02002546 PyErr_SetString(PyExc_SystemError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002547 return NULL;
2548}
2549
2550static Py_UCS4*
2551as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2552 int copy_null)
2553{
2554 int kind;
2555 void *data;
2556 Py_ssize_t len, targetlen;
2557 if (PyUnicode_READY(string) == -1)
2558 return NULL;
2559 kind = PyUnicode_KIND(string);
2560 data = PyUnicode_DATA(string);
2561 len = PyUnicode_GET_LENGTH(string);
2562 targetlen = len;
2563 if (copy_null)
2564 targetlen++;
2565 if (!target) {
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002566 target = PyMem_New(Py_UCS4, targetlen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002567 if (!target) {
2568 PyErr_NoMemory();
2569 return NULL;
2570 }
2571 }
2572 else {
2573 if (targetsize < targetlen) {
2574 PyErr_Format(PyExc_SystemError,
2575 "string is longer than the buffer");
2576 if (copy_null && 0 < targetsize)
2577 target[0] = 0;
2578 return NULL;
2579 }
2580 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002581 if (kind == PyUnicode_1BYTE_KIND) {
2582 Py_UCS1 *start = (Py_UCS1 *) data;
2583 _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002584 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002585 else if (kind == PyUnicode_2BYTE_KIND) {
2586 Py_UCS2 *start = (Py_UCS2 *) data;
2587 _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2588 }
2589 else {
2590 assert(kind == PyUnicode_4BYTE_KIND);
Christian Heimesf051e432016-09-13 20:22:02 +02002591 memcpy(target, data, len * sizeof(Py_UCS4));
Antoine Pitrou950468e2011-10-11 22:45:48 +02002592 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002593 if (copy_null)
2594 target[len] = 0;
2595 return target;
2596}
2597
2598Py_UCS4*
2599PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2600 int copy_null)
2601{
Antoine Pitroude20b0b2011-11-10 21:47:38 +01002602 if (target == NULL || targetsize < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002603 PyErr_BadInternalCall();
2604 return NULL;
2605 }
2606 return as_ucs4(string, target, targetsize, copy_null);
2607}
2608
2609Py_UCS4*
2610PyUnicode_AsUCS4Copy(PyObject *string)
2611{
2612 return as_ucs4(string, NULL, 0, 1);
2613}
2614
/* maximum number of characters required for output of %lld or %p.
   We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
   plus 1 for the sign. 53/22 is an upper bound for log10(256).
   The ceiling is computed in integer arithmetic as (n*53 - 1) / 22 + 1;
   that final "+ 1" and the sign together give the leading 2 below. */
#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
Victor Stinner96865452011-03-01 23:44:09 +00002619
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002620static int
2621unicode_fromformat_write_str(_PyUnicodeWriter *writer, PyObject *str,
2622 Py_ssize_t width, Py_ssize_t precision)
2623{
2624 Py_ssize_t length, fill, arglen;
2625 Py_UCS4 maxchar;
2626
2627 if (PyUnicode_READY(str) == -1)
2628 return -1;
2629
2630 length = PyUnicode_GET_LENGTH(str);
2631 if ((precision == -1 || precision >= length)
2632 && width <= length)
2633 return _PyUnicodeWriter_WriteStr(writer, str);
2634
2635 if (precision != -1)
2636 length = Py_MIN(precision, length);
2637
2638 arglen = Py_MAX(length, width);
2639 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
2640 maxchar = _PyUnicode_FindMaxChar(str, 0, length);
2641 else
2642 maxchar = writer->maxchar;
2643
2644 if (_PyUnicodeWriter_Prepare(writer, arglen, maxchar) == -1)
2645 return -1;
2646
2647 if (width > length) {
2648 fill = width - length;
2649 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, ' ') == -1)
2650 return -1;
2651 writer->pos += fill;
2652 }
2653
2654 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
2655 str, 0, length);
2656 writer->pos += length;
2657 return 0;
2658}
2659
2660static int
Victor Stinner998b8062018-09-12 00:23:25 +02002661unicode_fromformat_write_cstr(_PyUnicodeWriter *writer, const char *str,
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002662 Py_ssize_t width, Py_ssize_t precision)
2663{
2664 /* UTF-8 */
2665 Py_ssize_t length;
2666 PyObject *unicode;
2667 int res;
2668
Serhiy Storchakad586ccb2019-01-12 10:30:35 +02002669 if (precision == -1) {
2670 length = strlen(str);
2671 }
2672 else {
2673 length = 0;
2674 while (length < precision && str[length]) {
2675 length++;
2676 }
2677 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002678 unicode = PyUnicode_DecodeUTF8Stateful(str, length, "replace", NULL);
2679 if (unicode == NULL)
2680 return -1;
2681
2682 res = unicode_fromformat_write_str(writer, unicode, width, -1);
2683 Py_DECREF(unicode);
2684 return res;
2685}
2686
Victor Stinner96865452011-03-01 23:44:09 +00002687static const char*
Victor Stinnere215d962012-10-06 23:03:36 +02002688unicode_fromformat_arg(_PyUnicodeWriter *writer,
2689 const char *f, va_list *vargs)
Victor Stinner96865452011-03-01 23:44:09 +00002690{
Victor Stinnere215d962012-10-06 23:03:36 +02002691 const char *p;
2692 Py_ssize_t len;
2693 int zeropad;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002694 Py_ssize_t width;
2695 Py_ssize_t precision;
Victor Stinnere215d962012-10-06 23:03:36 +02002696 int longflag;
2697 int longlongflag;
2698 int size_tflag;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002699 Py_ssize_t fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002700
2701 p = f;
2702 f++;
Victor Stinner4c63a972012-10-06 23:55:33 +02002703 zeropad = 0;
2704 if (*f == '0') {
2705 zeropad = 1;
2706 f++;
2707 }
Victor Stinner96865452011-03-01 23:44:09 +00002708
2709 /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002710 width = -1;
2711 if (Py_ISDIGIT((unsigned)*f)) {
2712 width = *f - '0';
Victor Stinner96865452011-03-01 23:44:09 +00002713 f++;
Victor Stinnere215d962012-10-06 23:03:36 +02002714 while (Py_ISDIGIT((unsigned)*f)) {
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002715 if (width > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
Victor Stinner3921e902012-10-06 23:05:00 +02002716 PyErr_SetString(PyExc_ValueError,
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002717 "width too big");
Victor Stinner3921e902012-10-06 23:05:00 +02002718 return NULL;
2719 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002720 width = (width * 10) + (*f - '0');
Victor Stinnere215d962012-10-06 23:03:36 +02002721 f++;
2722 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002723 }
2724 precision = -1;
2725 if (*f == '.') {
2726 f++;
2727 if (Py_ISDIGIT((unsigned)*f)) {
2728 precision = (*f - '0');
2729 f++;
2730 while (Py_ISDIGIT((unsigned)*f)) {
2731 if (precision > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
2732 PyErr_SetString(PyExc_ValueError,
2733 "precision too big");
2734 return NULL;
2735 }
2736 precision = (precision * 10) + (*f - '0');
2737 f++;
2738 }
2739 }
Victor Stinner96865452011-03-01 23:44:09 +00002740 if (*f == '%') {
2741 /* "%.3%s" => f points to "3" */
2742 f--;
2743 }
2744 }
2745 if (*f == '\0') {
Victor Stinnere215d962012-10-06 23:03:36 +02002746 /* bogus format "%.123" => go backward, f points to "3" */
Victor Stinner96865452011-03-01 23:44:09 +00002747 f--;
2748 }
Victor Stinner96865452011-03-01 23:44:09 +00002749
2750 /* Handle %ld, %lu, %lld and %llu. */
2751 longflag = 0;
2752 longlongflag = 0;
Victor Stinnere7faec12011-03-02 00:01:53 +00002753 size_tflag = 0;
Victor Stinner96865452011-03-01 23:44:09 +00002754 if (*f == 'l') {
Victor Stinner6d970f42011-03-02 00:04:25 +00002755 if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
Victor Stinner96865452011-03-01 23:44:09 +00002756 longflag = 1;
2757 ++f;
2758 }
Victor Stinner96865452011-03-01 23:44:09 +00002759 else if (f[1] == 'l' &&
Victor Stinner6d970f42011-03-02 00:04:25 +00002760 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002761 longlongflag = 1;
2762 f += 2;
2763 }
Victor Stinner96865452011-03-01 23:44:09 +00002764 }
2765 /* handle the size_t flag. */
Victor Stinner6d970f42011-03-02 00:04:25 +00002766 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002767 size_tflag = 1;
2768 ++f;
2769 }
Victor Stinnere215d962012-10-06 23:03:36 +02002770
2771 if (f[1] == '\0')
2772 writer->overallocate = 0;
2773
2774 switch (*f) {
2775 case 'c':
2776 {
2777 int ordinal = va_arg(*vargs, int);
Victor Stinnerff5a8482012-10-06 23:05:45 +02002778 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Serhiy Storchakac89533f2013-06-23 20:21:16 +03002779 PyErr_SetString(PyExc_OverflowError,
Victor Stinnerff5a8482012-10-06 23:05:45 +02002780 "character argument not in range(0x110000)");
2781 return NULL;
2782 }
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02002783 if (_PyUnicodeWriter_WriteCharInline(writer, ordinal) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002784 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002785 break;
2786 }
2787
2788 case 'i':
2789 case 'd':
2790 case 'u':
2791 case 'x':
2792 {
2793 /* used by sprintf */
Victor Stinner15a11362012-10-06 23:48:20 +02002794 char buffer[MAX_LONG_LONG_CHARS];
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002795 Py_ssize_t arglen;
Victor Stinnere215d962012-10-06 23:03:36 +02002796
2797 if (*f == 'u') {
Victor Stinnere215d962012-10-06 23:03:36 +02002798 if (longflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002799 len = sprintf(buffer, "%lu",
Victor Stinnere215d962012-10-06 23:03:36 +02002800 va_arg(*vargs, unsigned long));
Victor Stinnere215d962012-10-06 23:03:36 +02002801 else if (longlongflag)
Benjamin Peterson47ff0732016-09-08 09:15:54 -07002802 len = sprintf(buffer, "%llu",
Benjamin Petersonaf580df2016-09-06 10:46:49 -07002803 va_arg(*vargs, unsigned long long));
Victor Stinnere215d962012-10-06 23:03:36 +02002804 else if (size_tflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002805 len = sprintf(buffer, "%" PY_FORMAT_SIZE_T "u",
Victor Stinnere215d962012-10-06 23:03:36 +02002806 va_arg(*vargs, size_t));
2807 else
Victor Stinner3aa979e2014-11-18 21:40:51 +01002808 len = sprintf(buffer, "%u",
Victor Stinnere215d962012-10-06 23:03:36 +02002809 va_arg(*vargs, unsigned int));
2810 }
2811 else if (*f == 'x') {
Victor Stinner3aa979e2014-11-18 21:40:51 +01002812 len = sprintf(buffer, "%x", va_arg(*vargs, int));
Victor Stinnere215d962012-10-06 23:03:36 +02002813 }
2814 else {
Victor Stinnere215d962012-10-06 23:03:36 +02002815 if (longflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002816 len = sprintf(buffer, "%li",
Victor Stinnere215d962012-10-06 23:03:36 +02002817 va_arg(*vargs, long));
Victor Stinnere215d962012-10-06 23:03:36 +02002818 else if (longlongflag)
Benjamin Peterson47ff0732016-09-08 09:15:54 -07002819 len = sprintf(buffer, "%lli",
Benjamin Petersonaf580df2016-09-06 10:46:49 -07002820 va_arg(*vargs, long long));
Victor Stinnere215d962012-10-06 23:03:36 +02002821 else if (size_tflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002822 len = sprintf(buffer, "%" PY_FORMAT_SIZE_T "i",
Victor Stinnere215d962012-10-06 23:03:36 +02002823 va_arg(*vargs, Py_ssize_t));
2824 else
Victor Stinner3aa979e2014-11-18 21:40:51 +01002825 len = sprintf(buffer, "%i",
Victor Stinnere215d962012-10-06 23:03:36 +02002826 va_arg(*vargs, int));
2827 }
2828 assert(len >= 0);
2829
Victor Stinnere215d962012-10-06 23:03:36 +02002830 if (precision < len)
2831 precision = len;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002832
2833 arglen = Py_MAX(precision, width);
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002834 if (_PyUnicodeWriter_Prepare(writer, arglen, 127) == -1)
2835 return NULL;
2836
Victor Stinnere215d962012-10-06 23:03:36 +02002837 if (width > precision) {
2838 Py_UCS4 fillchar;
2839 fill = width - precision;
2840 fillchar = zeropad?'0':' ';
Victor Stinner15a11362012-10-06 23:48:20 +02002841 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, fillchar) == -1)
2842 return NULL;
2843 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002844 }
Victor Stinner15a11362012-10-06 23:48:20 +02002845 if (precision > len) {
Victor Stinnere215d962012-10-06 23:03:36 +02002846 fill = precision - len;
Victor Stinner15a11362012-10-06 23:48:20 +02002847 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, '0') == -1)
2848 return NULL;
2849 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002850 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002851
Victor Stinner4a587072013-11-19 12:54:53 +01002852 if (_PyUnicodeWriter_WriteASCIIString(writer, buffer, len) < 0)
2853 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002854 break;
2855 }
2856
2857 case 'p':
2858 {
2859 char number[MAX_LONG_LONG_CHARS];
2860
2861 len = sprintf(number, "%p", va_arg(*vargs, void*));
2862 assert(len >= 0);
2863
2864 /* %p is ill-defined: ensure leading 0x. */
2865 if (number[1] == 'X')
2866 number[1] = 'x';
2867 else if (number[1] != 'x') {
2868 memmove(number + 2, number,
2869 strlen(number) + 1);
2870 number[0] = '0';
2871 number[1] = 'x';
2872 len += 2;
2873 }
2874
Victor Stinner4a587072013-11-19 12:54:53 +01002875 if (_PyUnicodeWriter_WriteASCIIString(writer, number, len) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002876 return NULL;
2877 break;
2878 }
2879
2880 case 's':
2881 {
2882 /* UTF-8 */
2883 const char *s = va_arg(*vargs, const char*);
Victor Stinner998b8062018-09-12 00:23:25 +02002884 if (unicode_fromformat_write_cstr(writer, s, width, precision) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002885 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002886 break;
2887 }
2888
2889 case 'U':
2890 {
2891 PyObject *obj = va_arg(*vargs, PyObject *);
2892 assert(obj && _PyUnicode_CHECK(obj));
2893
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002894 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002895 return NULL;
2896 break;
2897 }
2898
2899 case 'V':
2900 {
2901 PyObject *obj = va_arg(*vargs, PyObject *);
2902 const char *str = va_arg(*vargs, const char *);
Victor Stinnere215d962012-10-06 23:03:36 +02002903 if (obj) {
2904 assert(_PyUnicode_CHECK(obj));
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002905 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002906 return NULL;
2907 }
2908 else {
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002909 assert(str != NULL);
Victor Stinner998b8062018-09-12 00:23:25 +02002910 if (unicode_fromformat_write_cstr(writer, str, width, precision) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002911 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002912 }
2913 break;
2914 }
2915
2916 case 'S':
2917 {
2918 PyObject *obj = va_arg(*vargs, PyObject *);
2919 PyObject *str;
2920 assert(obj);
2921 str = PyObject_Str(obj);
2922 if (!str)
2923 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002924 if (unicode_fromformat_write_str(writer, str, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002925 Py_DECREF(str);
2926 return NULL;
2927 }
2928 Py_DECREF(str);
2929 break;
2930 }
2931
2932 case 'R':
2933 {
2934 PyObject *obj = va_arg(*vargs, PyObject *);
2935 PyObject *repr;
2936 assert(obj);
2937 repr = PyObject_Repr(obj);
2938 if (!repr)
2939 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002940 if (unicode_fromformat_write_str(writer, repr, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002941 Py_DECREF(repr);
2942 return NULL;
2943 }
2944 Py_DECREF(repr);
2945 break;
2946 }
2947
2948 case 'A':
2949 {
2950 PyObject *obj = va_arg(*vargs, PyObject *);
2951 PyObject *ascii;
2952 assert(obj);
2953 ascii = PyObject_ASCII(obj);
2954 if (!ascii)
2955 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002956 if (unicode_fromformat_write_str(writer, ascii, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002957 Py_DECREF(ascii);
2958 return NULL;
2959 }
2960 Py_DECREF(ascii);
2961 break;
2962 }
2963
2964 case '%':
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02002965 if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002966 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002967 break;
2968
2969 default:
2970 /* if we stumble upon an unknown formatting code, copy the rest
2971 of the format string to the output string. (we cannot just
2972 skip the code, since there's no way to know what's in the
2973 argument list) */
2974 len = strlen(p);
Victor Stinner4a587072013-11-19 12:54:53 +01002975 if (_PyUnicodeWriter_WriteLatin1String(writer, p, len) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002976 return NULL;
2977 f = p+len;
2978 return f;
2979 }
2980
2981 f++;
Victor Stinner96865452011-03-01 23:44:09 +00002982 return f;
2983}
2984
Walter Dörwaldd2034312007-05-18 16:29:38 +00002985PyObject *
2986PyUnicode_FromFormatV(const char *format, va_list vargs)
2987{
Victor Stinnere215d962012-10-06 23:03:36 +02002988 va_list vargs2;
2989 const char *f;
2990 _PyUnicodeWriter writer;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002991
Victor Stinner8f674cc2013-04-17 23:02:17 +02002992 _PyUnicodeWriter_Init(&writer);
2993 writer.min_length = strlen(format) + 100;
2994 writer.overallocate = 1;
Victor Stinnere215d962012-10-06 23:03:36 +02002995
Benjamin Peterson0c212142016-09-20 20:39:33 -07002996 // Copy varags to be able to pass a reference to a subfunction.
2997 va_copy(vargs2, vargs);
Victor Stinnere215d962012-10-06 23:03:36 +02002998
2999 for (f = format; *f; ) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00003000 if (*f == '%') {
Victor Stinnere215d962012-10-06 23:03:36 +02003001 f = unicode_fromformat_arg(&writer, f, &vargs2);
3002 if (f == NULL)
3003 goto fail;
Victor Stinner1205f272010-09-11 00:54:47 +00003004 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003005 else {
Victor Stinnere215d962012-10-06 23:03:36 +02003006 const char *p;
3007 Py_ssize_t len;
Walter Dörwaldd2034312007-05-18 16:29:38 +00003008
Victor Stinnere215d962012-10-06 23:03:36 +02003009 p = f;
3010 do
3011 {
3012 if ((unsigned char)*p > 127) {
3013 PyErr_Format(PyExc_ValueError,
3014 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
3015 "string, got a non-ASCII byte: 0x%02x",
3016 (unsigned char)*p);
Victor Stinner1ddf53d2016-09-21 14:13:14 +02003017 goto fail;
Victor Stinnere215d962012-10-06 23:03:36 +02003018 }
3019 p++;
3020 }
3021 while (*p != '\0' && *p != '%');
3022 len = p - f;
3023
3024 if (*p == '\0')
3025 writer.overallocate = 0;
Victor Stinner4a587072013-11-19 12:54:53 +01003026
3027 if (_PyUnicodeWriter_WriteASCIIString(&writer, f, len) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02003028 goto fail;
Victor Stinnere215d962012-10-06 23:03:36 +02003029
3030 f = p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00003031 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00003032 }
Christian Heimes2f2fee12016-09-21 11:37:27 +02003033 va_end(vargs2);
Victor Stinnere215d962012-10-06 23:03:36 +02003034 return _PyUnicodeWriter_Finish(&writer);
3035
3036 fail:
Christian Heimes2f2fee12016-09-21 11:37:27 +02003037 va_end(vargs2);
Victor Stinnere215d962012-10-06 23:03:36 +02003038 _PyUnicodeWriter_Dealloc(&writer);
Benjamin Peterson14339b62009-01-31 16:36:08 +00003039 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00003040}
3041
Walter Dörwaldd2034312007-05-18 16:29:38 +00003042PyObject *
3043PyUnicode_FromFormat(const char *format, ...)
3044{
Benjamin Peterson14339b62009-01-31 16:36:08 +00003045 PyObject* ret;
3046 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00003047
3048#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00003049 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00003050#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00003051 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00003052#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00003053 ret = PyUnicode_FromFormatV(format, vargs);
3054 va_end(vargs);
3055 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00003056}
3057
Serhiy Storchakac46db922018-10-23 22:58:24 +03003058static Py_ssize_t
3059unicode_get_widechar_size(PyObject *unicode)
3060{
3061 Py_ssize_t res;
3062
3063 assert(unicode != NULL);
3064 assert(_PyUnicode_CHECK(unicode));
3065
3066 if (_PyUnicode_WSTR(unicode) != NULL) {
3067 return PyUnicode_WSTR_LENGTH(unicode);
3068 }
3069 assert(PyUnicode_IS_READY(unicode));
3070
3071 res = _PyUnicode_LENGTH(unicode);
3072#if SIZEOF_WCHAR_T == 2
3073 if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
3074 const Py_UCS4 *s = PyUnicode_4BYTE_DATA(unicode);
3075 const Py_UCS4 *end = s + res;
3076 for (; s < end; ++s) {
3077 if (*s > 0xFFFF) {
3078 ++res;
3079 }
3080 }
3081 }
3082#endif
3083 return res;
3084}
3085
3086static void
3087unicode_copy_as_widechar(PyObject *unicode, wchar_t *w, Py_ssize_t size)
3088{
3089 const wchar_t *wstr;
3090
3091 assert(unicode != NULL);
3092 assert(_PyUnicode_CHECK(unicode));
3093
3094 wstr = _PyUnicode_WSTR(unicode);
3095 if (wstr != NULL) {
3096 memcpy(w, wstr, size * sizeof(wchar_t));
3097 return;
3098 }
3099 assert(PyUnicode_IS_READY(unicode));
3100
3101 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
3102 const Py_UCS1 *s = PyUnicode_1BYTE_DATA(unicode);
3103 for (; size--; ++s, ++w) {
3104 *w = *s;
3105 }
3106 }
3107 else {
3108#if SIZEOF_WCHAR_T == 4
3109 assert(PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND);
3110 const Py_UCS2 *s = PyUnicode_2BYTE_DATA(unicode);
3111 for (; size--; ++s, ++w) {
3112 *w = *s;
3113 }
3114#else
3115 assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
3116 const Py_UCS4 *s = PyUnicode_4BYTE_DATA(unicode);
3117 for (; size--; ++s, ++w) {
3118 Py_UCS4 ch = *s;
3119 if (ch > 0xFFFF) {
3120 assert(ch <= MAX_UNICODE);
3121 /* encode surrogate pair in this case */
3122 *w++ = Py_UNICODE_HIGH_SURROGATE(ch);
3123 if (!size--)
3124 break;
3125 *w = Py_UNICODE_LOW_SURROGATE(ch);
3126 }
3127 else {
3128 *w = ch;
3129 }
3130 }
3131#endif
3132 }
3133}
3134
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003135#ifdef HAVE_WCHAR_H
3136
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003137/* Convert a Unicode object to a wide character string.
Victor Stinner5593d8a2010-10-02 11:11:27 +00003138
Victor Stinnerd88d9832011-09-06 02:00:05 +02003139 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00003140 character) required to convert the unicode object. Ignore size argument.
3141
Victor Stinnerd88d9832011-09-06 02:00:05 +02003142 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00003143 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02003144 the null character). */
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003145Py_ssize_t
3146PyUnicode_AsWideChar(PyObject *unicode,
3147 wchar_t *w,
3148 Py_ssize_t size)
Victor Stinner137c34c2010-09-29 10:25:54 +00003149{
Victor Stinner5593d8a2010-10-02 11:11:27 +00003150 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003151
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003152 if (unicode == NULL) {
3153 PyErr_BadInternalCall();
3154 return -1;
3155 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003156 if (!PyUnicode_Check(unicode)) {
3157 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003158 return -1;
Victor Stinner5593d8a2010-10-02 11:11:27 +00003159 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003160
3161 res = unicode_get_widechar_size(unicode);
3162 if (w == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003163 return res + 1;
Serhiy Storchakac46db922018-10-23 22:58:24 +03003164 }
3165
3166 if (size > res) {
3167 size = res + 1;
3168 }
3169 else {
3170 res = size;
3171 }
3172 unicode_copy_as_widechar(unicode, w, size);
3173 return res;
Victor Stinner137c34c2010-09-29 10:25:54 +00003174}
3175
Victor Stinner137c34c2010-09-29 10:25:54 +00003176wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00003177PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00003178 Py_ssize_t *size)
3179{
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003180 wchar_t *buffer;
Victor Stinner137c34c2010-09-29 10:25:54 +00003181 Py_ssize_t buflen;
3182
3183 if (unicode == NULL) {
3184 PyErr_BadInternalCall();
3185 return NULL;
3186 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003187 if (!PyUnicode_Check(unicode)) {
3188 PyErr_BadArgument();
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003189 return NULL;
3190 }
3191
Serhiy Storchakac46db922018-10-23 22:58:24 +03003192 buflen = unicode_get_widechar_size(unicode);
3193 buffer = (wchar_t *) PyMem_NEW(wchar_t, (buflen + 1));
Victor Stinner137c34c2010-09-29 10:25:54 +00003194 if (buffer == NULL) {
3195 PyErr_NoMemory();
3196 return NULL;
3197 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003198 unicode_copy_as_widechar(unicode, buffer, buflen + 1);
3199 if (size != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00003200 *size = buflen;
Serhiy Storchakac46db922018-10-23 22:58:24 +03003201 }
3202 else if (wcslen(buffer) != (size_t)buflen) {
3203 PyMem_FREE(buffer);
3204 PyErr_SetString(PyExc_ValueError,
3205 "embedded null character");
3206 return NULL;
3207 }
Victor Stinner137c34c2010-09-29 10:25:54 +00003208 return buffer;
3209}
3210
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003211#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003212
Alexander Belopolsky40018472011-02-26 01:02:56 +00003213PyObject *
3214PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003215{
Victor Stinner8faf8212011-12-08 22:14:11 +01003216 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003217 PyErr_SetString(PyExc_ValueError,
3218 "chr() arg not in range(0x110000)");
3219 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003220 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00003221
Victor Stinner985a82a2014-01-03 12:53:47 +01003222 return unicode_char((Py_UCS4)ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003223}
3224
Alexander Belopolsky40018472011-02-26 01:02:56 +00003225PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02003226PyUnicode_FromObject(PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003227{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003228 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00003229 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003230 if (PyUnicode_CheckExact(obj)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05003231 if (PyUnicode_READY(obj) == -1)
Victor Stinnerd3a83d52011-10-01 03:09:33 +02003232 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00003233 Py_INCREF(obj);
3234 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003235 }
3236 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003237 /* For a Unicode subtype that's not a Unicode object,
3238 return a true Unicode object with the same data. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +01003239 return _PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003240 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003241 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003242 "Can't convert '%.100s' object to str implicitly",
3243 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00003244 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003245}
3246
Alexander Belopolsky40018472011-02-26 01:02:56 +00003247PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02003248PyUnicode_FromEncodedObject(PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003249 const char *encoding,
3250 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003251{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003252 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003253 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00003254
Guido van Rossumd57fd912000-03-10 22:53:23 +00003255 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003256 PyErr_BadInternalCall();
3257 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003258 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003259
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003260 /* Decoding bytes objects is the most common case and should be fast */
3261 if (PyBytes_Check(obj)) {
Victor Stinner22eb6892019-06-26 00:51:05 +02003262 if (PyBytes_GET_SIZE(obj) == 0) {
3263 if (unicode_check_encoding_errors(encoding, errors) < 0) {
3264 return NULL;
3265 }
Serhiy Storchaka05997252013-01-26 12:14:02 +02003266 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner22eb6892019-06-26 00:51:05 +02003267 }
3268 return PyUnicode_Decode(
Serhiy Storchaka05997252013-01-26 12:14:02 +02003269 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
3270 encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003271 }
3272
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003273 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003274 PyErr_SetString(PyExc_TypeError,
3275 "decoding str is not supported");
3276 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00003277 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003278
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003279 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
3280 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
3281 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003282 "decoding to str: need a bytes-like object, %.80s found",
3283 Py_TYPE(obj)->tp_name);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003284 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00003285 }
Tim Petersced69f82003-09-16 20:30:58 +00003286
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003287 if (buffer.len == 0) {
Serhiy Storchaka05997252013-01-26 12:14:02 +02003288 PyBuffer_Release(&buffer);
Victor Stinner22eb6892019-06-26 00:51:05 +02003289 if (unicode_check_encoding_errors(encoding, errors) < 0) {
3290 return NULL;
3291 }
Serhiy Storchaka05997252013-01-26 12:14:02 +02003292 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +00003293 }
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00003294
Serhiy Storchaka05997252013-01-26 12:14:02 +02003295 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003296 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003297 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003298}
3299
/* Normalize an encoding name: similar to encodings.normalize_encoding(), but
   also convert to lowercase.

   Alphanumeric characters and '.' are copied (lowercased) into `lower`.  Any
   run of other characters between two copied characters becomes a single
   '_'; such runs at the very start or end of the name are dropped.  The
   result is null-terminated.

   Return 1 on success, or 0 on error (encoding is longer than
   lower_len-1). */
int
_Py_normalize_encoding(const char *encoding,
                       char *lower,
                       size_t lower_len)
{
    const char *src;
    char *dst = lower;
    /* Last slot of the output buffer, reserved for the terminator. */
    char *const limit = &lower[lower_len - 1];
    int pending_sep = 0;

    assert(encoding != NULL);

    for (src = encoding; *src != 0; src++) {
        char c = *src;

        if (!Py_ISALNUM(c) && c != '.') {
            /* Punctuation: remember it, emit at most one '_' later. */
            pending_sep = 1;
            continue;
        }

        if (pending_sep && dst != lower) {
            if (dst == limit) {
                return 0;
            }
            *dst++ = '_';
        }
        pending_sep = 0;

        if (dst == limit) {
            return 0;
        }
        *dst++ = Py_TOLOWER(c);
    }

    *dst = '\0';
    return 1;
}
3348
Alexander Belopolsky40018472011-02-26 01:02:56 +00003349PyObject *
3350PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003351 Py_ssize_t size,
3352 const char *encoding,
3353 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00003354{
3355 PyObject *buffer = NULL, *unicode;
3356 Py_buffer info;
Victor Stinner942889a2016-09-05 15:40:10 -07003357 char buflower[11]; /* strlen("iso-8859-1\0") == 11, longest shortcut */
3358
Victor Stinner22eb6892019-06-26 00:51:05 +02003359 if (unicode_check_encoding_errors(encoding, errors) < 0) {
3360 return NULL;
3361 }
3362
Victor Stinnered076ed2019-06-26 01:49:32 +02003363 if (size == 0) {
3364 _Py_RETURN_UNICODE_EMPTY();
3365 }
3366
Victor Stinner942889a2016-09-05 15:40:10 -07003367 if (encoding == NULL) {
3368 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3369 }
Victor Stinner600d3be2010-06-10 12:00:55 +00003370
Fred Drakee4315f52000-05-09 19:53:39 +00003371 /* Shortcuts for common default encodings */
Victor Stinner942889a2016-09-05 15:40:10 -07003372 if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3373 char *lower = buflower;
3374
3375 /* Fast paths */
3376 if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3377 lower += 3;
3378 if (*lower == '_') {
3379 /* Match "utf8" and "utf_8" */
3380 lower++;
3381 }
3382
3383 if (lower[0] == '8' && lower[1] == 0) {
3384 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3385 }
3386 else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3387 return PyUnicode_DecodeUTF16(s, size, errors, 0);
3388 }
3389 else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3390 return PyUnicode_DecodeUTF32(s, size, errors, 0);
3391 }
3392 }
3393 else {
3394 if (strcmp(lower, "ascii") == 0
3395 || strcmp(lower, "us_ascii") == 0) {
3396 return PyUnicode_DecodeASCII(s, size, errors);
3397 }
Steve Dowercc16be82016-09-08 10:35:16 -07003398 #ifdef MS_WINDOWS
Victor Stinner942889a2016-09-05 15:40:10 -07003399 else if (strcmp(lower, "mbcs") == 0) {
3400 return PyUnicode_DecodeMBCS(s, size, errors);
3401 }
3402 #endif
3403 else if (strcmp(lower, "latin1") == 0
3404 || strcmp(lower, "latin_1") == 0
3405 || strcmp(lower, "iso_8859_1") == 0
3406 || strcmp(lower, "iso8859_1") == 0) {
3407 return PyUnicode_DecodeLatin1(s, size, errors);
3408 }
3409 }
Victor Stinner37296e82010-06-10 13:36:23 +00003410 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003411
3412 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003413 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00003414 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003415 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00003416 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003417 if (buffer == NULL)
3418 goto onError;
Nick Coghlanc72e4e62013-11-22 22:39:36 +10003419 unicode = _PyCodec_DecodeText(buffer, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003420 if (unicode == NULL)
3421 goto onError;
3422 if (!PyUnicode_Check(unicode)) {
3423 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003424 "'%.400s' decoder returned '%.400s' instead of 'str'; "
Nick Coghlan8b097b42013-11-13 23:49:21 +10003425 "use codecs.decode() to decode to arbitrary types",
Victor Stinner998b8062018-09-12 00:23:25 +02003426 encoding,
3427 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003428 Py_DECREF(unicode);
3429 goto onError;
3430 }
3431 Py_DECREF(buffer);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003432 return unicode_result(unicode);
Tim Petersced69f82003-09-16 20:30:58 +00003433
Benjamin Peterson29060642009-01-31 22:14:21 +00003434 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003435 Py_XDECREF(buffer);
3436 return NULL;
3437}
3438
Alexander Belopolsky40018472011-02-26 01:02:56 +00003439PyObject *
3440PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003441 const char *encoding,
3442 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003443{
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003444 if (!PyUnicode_Check(unicode)) {
3445 PyErr_BadArgument();
Serhiy Storchaka77eede32016-10-25 10:07:51 +03003446 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003447 }
3448
Serhiy Storchaka00939072016-10-27 21:05:49 +03003449 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3450 "PyUnicode_AsDecodedObject() is deprecated; "
3451 "use PyCodec_Decode() to decode from str", 1) < 0)
3452 return NULL;
3453
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003454 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003455 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003456
3457 /* Decode via the codec registry */
Serhiy Storchaka77eede32016-10-25 10:07:51 +03003458 return PyCodec_Decode(unicode, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003459}
3460
Alexander Belopolsky40018472011-02-26 01:02:56 +00003461PyObject *
3462PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003463 const char *encoding,
3464 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003465{
3466 PyObject *v;
3467
3468 if (!PyUnicode_Check(unicode)) {
3469 PyErr_BadArgument();
3470 goto onError;
3471 }
3472
Serhiy Storchaka00939072016-10-27 21:05:49 +03003473 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3474 "PyUnicode_AsDecodedUnicode() is deprecated; "
3475 "use PyCodec_Decode() to decode from str to str", 1) < 0)
3476 return NULL;
3477
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003478 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003479 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003480
3481 /* Decode via the codec registry */
3482 v = PyCodec_Decode(unicode, encoding, errors);
3483 if (v == NULL)
3484 goto onError;
3485 if (!PyUnicode_Check(v)) {
3486 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003487 "'%.400s' decoder returned '%.400s' instead of 'str'; "
Nick Coghlan8b097b42013-11-13 23:49:21 +10003488 "use codecs.decode() to decode to arbitrary types",
Victor Stinner998b8062018-09-12 00:23:25 +02003489 encoding,
3490 Py_TYPE(unicode)->tp_name);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003491 Py_DECREF(v);
3492 goto onError;
3493 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003494 return unicode_result(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003495
Benjamin Peterson29060642009-01-31 22:14:21 +00003496 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003497 return NULL;
3498}
3499
Alexander Belopolsky40018472011-02-26 01:02:56 +00003500PyObject *
3501PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003502 Py_ssize_t size,
3503 const char *encoding,
3504 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003505{
3506 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00003507
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02003508 unicode = PyUnicode_FromWideChar(s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003509 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003510 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003511 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
3512 Py_DECREF(unicode);
3513 return v;
3514}
3515
Alexander Belopolsky40018472011-02-26 01:02:56 +00003516PyObject *
3517PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003518 const char *encoding,
3519 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003520{
3521 PyObject *v;
3522
3523 if (!PyUnicode_Check(unicode)) {
3524 PyErr_BadArgument();
3525 goto onError;
3526 }
3527
Serhiy Storchaka00939072016-10-27 21:05:49 +03003528 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3529 "PyUnicode_AsEncodedObject() is deprecated; "
3530 "use PyUnicode_AsEncodedString() to encode from str to bytes "
3531 "or PyCodec_Encode() for generic encoding", 1) < 0)
3532 return NULL;
3533
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003534 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003535 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003536
3537 /* Encode via the codec registry */
3538 v = PyCodec_Encode(unicode, encoding, errors);
3539 if (v == NULL)
3540 goto onError;
3541 return v;
3542
Benjamin Peterson29060642009-01-31 22:14:21 +00003543 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003544 return NULL;
3545}
3546
Victor Stinner1b579672011-12-17 05:47:23 +01003547
Victor Stinner2cba6b82018-01-10 22:46:15 +01003548static PyObject *
Victor Stinner709d23d2019-05-02 14:56:30 -04003549unicode_encode_locale(PyObject *unicode, _Py_error_handler error_handler,
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003550 int current_locale)
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003551{
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003552 Py_ssize_t wlen;
3553 wchar_t *wstr = PyUnicode_AsWideCharString(unicode, &wlen);
3554 if (wstr == NULL) {
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003555 return NULL;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003556 }
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003557
Victor Stinnerbde9d6b2018-11-28 10:26:20 +01003558 if ((size_t)wlen != wcslen(wstr)) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003559 PyErr_SetString(PyExc_ValueError, "embedded null character");
Victor Stinnerbde9d6b2018-11-28 10:26:20 +01003560 PyMem_Free(wstr);
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003561 return NULL;
3562 }
3563
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003564 char *str;
3565 size_t error_pos;
3566 const char *reason;
3567 int res = _Py_EncodeLocaleEx(wstr, &str, &error_pos, &reason,
Victor Stinner3d4226a2018-08-29 22:21:32 +02003568 current_locale, error_handler);
Victor Stinnerbde9d6b2018-11-28 10:26:20 +01003569 PyMem_Free(wstr);
3570
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003571 if (res != 0) {
3572 if (res == -2) {
3573 PyObject *exc;
3574 exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnns",
3575 "locale", unicode,
3576 (Py_ssize_t)error_pos,
3577 (Py_ssize_t)(error_pos+1),
3578 reason);
3579 if (exc != NULL) {
3580 PyCodec_StrictErrors(exc);
3581 Py_DECREF(exc);
3582 }
Victor Stinner2cba6b82018-01-10 22:46:15 +01003583 }
Victor Stinner3d4226a2018-08-29 22:21:32 +02003584 else if (res == -3) {
3585 PyErr_SetString(PyExc_ValueError, "unsupported error handler");
3586 }
Victor Stinner2cba6b82018-01-10 22:46:15 +01003587 else {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003588 PyErr_NoMemory();
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003589 }
Victor Stinnerbde9d6b2018-11-28 10:26:20 +01003590 return NULL;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003591 }
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003592
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003593 PyObject *bytes = PyBytes_FromString(str);
3594 PyMem_RawFree(str);
3595 return bytes;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003596}
3597
Victor Stinnerad158722010-10-27 00:25:46 +00003598PyObject *
Victor Stinner2cba6b82018-01-10 22:46:15 +01003599PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
3600{
Victor Stinner709d23d2019-05-02 14:56:30 -04003601 _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
3602 return unicode_encode_locale(unicode, error_handler, 1);
Victor Stinner2cba6b82018-01-10 22:46:15 +01003603}
3604
3605PyObject *
Victor Stinnerad158722010-10-27 00:25:46 +00003606PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00003607{
Victor Stinnercaba55b2018-08-03 15:33:52 +02003608 PyInterpreterState *interp = _PyInterpreterState_GET_UNSAFE();
Victor Stinnere2510952019-05-02 11:28:57 -04003609#ifdef _Py_FORCE_UTF8_FS_ENCODING
Victor Stinner709d23d2019-05-02 14:56:30 -04003610 if (interp->fs_codec.encoding) {
3611 return unicode_encode_utf8(unicode,
3612 interp->fs_codec.error_handler,
3613 interp->fs_codec.errors);
3614 }
3615 else {
Victor Stinner331a6a52019-05-27 16:39:22 +02003616 const wchar_t *filesystem_errors = interp->config.filesystem_errors;
Victor Stinner709d23d2019-05-02 14:56:30 -04003617 _Py_error_handler errors;
Victor Stinner331a6a52019-05-27 16:39:22 +02003618 errors = get_error_handler_wide(filesystem_errors);
Victor Stinner709d23d2019-05-02 14:56:30 -04003619 assert(errors != _Py_ERROR_UNKNOWN);
3620 return unicode_encode_utf8(unicode, errors, NULL);
3621 }
Victor Stinnerb2457ef2018-08-29 13:25:36 +02003622#else
Victor Stinner793b5312011-04-27 00:24:21 +02003623 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3624 cannot use it to encode and decode filenames before it is loaded. Load
3625 the Python codec requires to encode at least its own filename. Use the C
Victor Stinnerb2457ef2018-08-29 13:25:36 +02003626 implementation of the locale codec until the codec registry is
Victor Stinner22eb6892019-06-26 00:51:05 +02003627 initialized and the Python codec is loaded.
3628 See _PyUnicode_InitEncodings(). */
Victor Stinner709d23d2019-05-02 14:56:30 -04003629 if (interp->fs_codec.encoding) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00003630 return PyUnicode_AsEncodedString(unicode,
Victor Stinner709d23d2019-05-02 14:56:30 -04003631 interp->fs_codec.encoding,
3632 interp->fs_codec.errors);
Victor Stinnerc39211f2010-09-29 16:35:47 +00003633 }
3634 else {
Victor Stinner331a6a52019-05-27 16:39:22 +02003635 const wchar_t *filesystem_errors = interp->config.filesystem_errors;
Victor Stinner709d23d2019-05-02 14:56:30 -04003636 _Py_error_handler errors;
Victor Stinner331a6a52019-05-27 16:39:22 +02003637 errors = get_error_handler_wide(filesystem_errors);
Victor Stinner709d23d2019-05-02 14:56:30 -04003638 assert(errors != _Py_ERROR_UNKNOWN);
3639 return unicode_encode_locale(unicode, errors, 0);
Victor Stinnerc39211f2010-09-29 16:35:47 +00003640 }
Victor Stinnerad158722010-10-27 00:25:46 +00003641#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00003642}
3643
Alexander Belopolsky40018472011-02-26 01:02:56 +00003644PyObject *
3645PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003646 const char *encoding,
3647 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003648{
3649 PyObject *v;
Victor Stinner942889a2016-09-05 15:40:10 -07003650 char buflower[11]; /* strlen("iso_8859_1\0") == 11, longest shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00003651
Guido van Rossumd57fd912000-03-10 22:53:23 +00003652 if (!PyUnicode_Check(unicode)) {
3653 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003654 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003655 }
Fred Drakee4315f52000-05-09 19:53:39 +00003656
Victor Stinner22eb6892019-06-26 00:51:05 +02003657 if (unicode_check_encoding_errors(encoding, errors) < 0) {
3658 return NULL;
3659 }
3660
Victor Stinner942889a2016-09-05 15:40:10 -07003661 if (encoding == NULL) {
3662 return _PyUnicode_AsUTF8String(unicode, errors);
3663 }
3664
Fred Drakee4315f52000-05-09 19:53:39 +00003665 /* Shortcuts for common default encodings */
Victor Stinner942889a2016-09-05 15:40:10 -07003666 if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3667 char *lower = buflower;
3668
3669 /* Fast paths */
3670 if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3671 lower += 3;
3672 if (*lower == '_') {
3673 /* Match "utf8" and "utf_8" */
3674 lower++;
3675 }
3676
3677 if (lower[0] == '8' && lower[1] == 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003678 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinner942889a2016-09-05 15:40:10 -07003679 }
3680 else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3681 return _PyUnicode_EncodeUTF16(unicode, errors, 0);
3682 }
3683 else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3684 return _PyUnicode_EncodeUTF32(unicode, errors, 0);
3685 }
Victor Stinnera5c68c32011-03-02 01:03:14 +00003686 }
Victor Stinner942889a2016-09-05 15:40:10 -07003687 else {
3688 if (strcmp(lower, "ascii") == 0
3689 || strcmp(lower, "us_ascii") == 0) {
3690 return _PyUnicode_AsASCIIString(unicode, errors);
3691 }
Steve Dowercc16be82016-09-08 10:35:16 -07003692#ifdef MS_WINDOWS
Victor Stinner942889a2016-09-05 15:40:10 -07003693 else if (strcmp(lower, "mbcs") == 0) {
3694 return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
3695 }
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003696#endif
Victor Stinner942889a2016-09-05 15:40:10 -07003697 else if (strcmp(lower, "latin1") == 0 ||
3698 strcmp(lower, "latin_1") == 0 ||
3699 strcmp(lower, "iso_8859_1") == 0 ||
3700 strcmp(lower, "iso8859_1") == 0) {
3701 return _PyUnicode_AsLatin1String(unicode, errors);
3702 }
3703 }
Victor Stinner37296e82010-06-10 13:36:23 +00003704 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003705
3706 /* Encode via the codec registry */
Nick Coghlanc72e4e62013-11-22 22:39:36 +10003707 v = _PyCodec_EncodeText(unicode, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003708 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003709 return NULL;
3710
3711 /* The normal path */
3712 if (PyBytes_Check(v))
3713 return v;
3714
3715 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003716 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003717 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003718 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003719
3720 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003721 "encoder %s returned bytearray instead of bytes; "
3722 "use codecs.encode() to encode to arbitrary types",
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003723 encoding);
3724 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003725 Py_DECREF(v);
3726 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003727 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003728
Serhiy Storchakafff9a312017-03-21 08:53:25 +02003729 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v),
3730 PyByteArray_GET_SIZE(v));
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003731 Py_DECREF(v);
3732 return b;
3733 }
3734
3735 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003736 "'%.400s' encoder returned '%.400s' instead of 'bytes'; "
Nick Coghlan8b097b42013-11-13 23:49:21 +10003737 "use codecs.encode() to encode to arbitrary types",
Victor Stinner998b8062018-09-12 00:23:25 +02003738 encoding,
3739 Py_TYPE(v)->tp_name);
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003740 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003741 return NULL;
3742}
3743
Alexander Belopolsky40018472011-02-26 01:02:56 +00003744PyObject *
3745PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003746 const char *encoding,
3747 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003748{
3749 PyObject *v;
3750
3751 if (!PyUnicode_Check(unicode)) {
3752 PyErr_BadArgument();
3753 goto onError;
3754 }
3755
Serhiy Storchaka00939072016-10-27 21:05:49 +03003756 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3757 "PyUnicode_AsEncodedUnicode() is deprecated; "
3758 "use PyCodec_Encode() to encode from str to str", 1) < 0)
3759 return NULL;
3760
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003761 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003762 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003763
3764 /* Encode via the codec registry */
3765 v = PyCodec_Encode(unicode, encoding, errors);
3766 if (v == NULL)
3767 goto onError;
3768 if (!PyUnicode_Check(v)) {
3769 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003770 "'%.400s' encoder returned '%.400s' instead of 'str'; "
Nick Coghlan8b097b42013-11-13 23:49:21 +10003771 "use codecs.encode() to encode to arbitrary types",
Victor Stinner998b8062018-09-12 00:23:25 +02003772 encoding,
3773 Py_TYPE(v)->tp_name);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003774 Py_DECREF(v);
3775 goto onError;
3776 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003777 return v;
Tim Petersced69f82003-09-16 20:30:58 +00003778
Benjamin Peterson29060642009-01-31 22:14:21 +00003779 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003780 return NULL;
3781}
3782
Victor Stinner2cba6b82018-01-10 22:46:15 +01003783static PyObject*
Victor Stinner709d23d2019-05-02 14:56:30 -04003784unicode_decode_locale(const char *str, Py_ssize_t len,
3785 _Py_error_handler errors, int current_locale)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003786{
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003787 if (str[len] != '\0' || (size_t)len != strlen(str)) {
3788 PyErr_SetString(PyExc_ValueError, "embedded null byte");
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003789 return NULL;
3790 }
3791
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003792 wchar_t *wstr;
3793 size_t wlen;
3794 const char *reason;
3795 int res = _Py_DecodeLocaleEx(str, &wstr, &wlen, &reason,
Victor Stinner709d23d2019-05-02 14:56:30 -04003796 current_locale, errors);
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003797 if (res != 0) {
3798 if (res == -2) {
3799 PyObject *exc;
3800 exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nns",
3801 "locale", str, len,
3802 (Py_ssize_t)wlen,
3803 (Py_ssize_t)(wlen + 1),
3804 reason);
3805 if (exc != NULL) {
3806 PyCodec_StrictErrors(exc);
3807 Py_DECREF(exc);
3808 }
Victor Stinner2cba6b82018-01-10 22:46:15 +01003809 }
Victor Stinner3d4226a2018-08-29 22:21:32 +02003810 else if (res == -3) {
3811 PyErr_SetString(PyExc_ValueError, "unsupported error handler");
3812 }
Victor Stinner2cba6b82018-01-10 22:46:15 +01003813 else {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003814 PyErr_NoMemory();
Victor Stinner2cba6b82018-01-10 22:46:15 +01003815 }
Victor Stinner2f197072011-12-17 07:08:30 +01003816 return NULL;
Victor Stinner2f197072011-12-17 07:08:30 +01003817 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003818
3819 PyObject *unicode = PyUnicode_FromWideChar(wstr, wlen);
3820 PyMem_RawFree(wstr);
3821 return unicode;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003822}
3823
3824PyObject*
Victor Stinner2cba6b82018-01-10 22:46:15 +01003825PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
3826 const char *errors)
3827{
Victor Stinner709d23d2019-05-02 14:56:30 -04003828 _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
3829 return unicode_decode_locale(str, len, error_handler, 1);
Victor Stinner2cba6b82018-01-10 22:46:15 +01003830}
3831
3832PyObject*
Victor Stinner1b579672011-12-17 05:47:23 +01003833PyUnicode_DecodeLocale(const char *str, const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003834{
3835 Py_ssize_t size = (Py_ssize_t)strlen(str);
Victor Stinner709d23d2019-05-02 14:56:30 -04003836 _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
3837 return unicode_decode_locale(str, size, error_handler, 1);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003838}
3839
3840
3841PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00003842PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003843 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00003844 return PyUnicode_DecodeFSDefaultAndSize(s, size);
3845}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003846
Christian Heimes5894ba72007-11-04 11:43:14 +00003847PyObject*
3848PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3849{
Victor Stinnercaba55b2018-08-03 15:33:52 +02003850 PyInterpreterState *interp = _PyInterpreterState_GET_UNSAFE();
Victor Stinnere2510952019-05-02 11:28:57 -04003851#ifdef _Py_FORCE_UTF8_FS_ENCODING
Victor Stinner709d23d2019-05-02 14:56:30 -04003852 if (interp->fs_codec.encoding) {
3853 return unicode_decode_utf8(s, size,
3854 interp->fs_codec.error_handler,
3855 interp->fs_codec.errors,
3856 NULL);
3857 }
3858 else {
Victor Stinner331a6a52019-05-27 16:39:22 +02003859 const wchar_t *filesystem_errors = interp->config.filesystem_errors;
Victor Stinner709d23d2019-05-02 14:56:30 -04003860 _Py_error_handler errors;
Victor Stinner331a6a52019-05-27 16:39:22 +02003861 errors = get_error_handler_wide(filesystem_errors);
Victor Stinner709d23d2019-05-02 14:56:30 -04003862 assert(errors != _Py_ERROR_UNKNOWN);
3863 return unicode_decode_utf8(s, size, errors, NULL, NULL);
3864 }
Victor Stinnerb2457ef2018-08-29 13:25:36 +02003865#else
Victor Stinner793b5312011-04-27 00:24:21 +02003866 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3867 cannot use it to encode and decode filenames before it is loaded. Load
3868 the Python codec requires to encode at least its own filename. Use the C
Victor Stinnerb2457ef2018-08-29 13:25:36 +02003869 implementation of the locale codec until the codec registry is
Victor Stinner22eb6892019-06-26 00:51:05 +02003870 initialized and the Python codec is loaded.
3871 See _PyUnicode_InitEncodings(). */
Victor Stinner709d23d2019-05-02 14:56:30 -04003872 if (interp->fs_codec.encoding) {
Steve Dower78057b42016-11-06 19:35:08 -08003873 return PyUnicode_Decode(s, size,
Victor Stinner709d23d2019-05-02 14:56:30 -04003874 interp->fs_codec.encoding,
3875 interp->fs_codec.errors);
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003876 }
3877 else {
Victor Stinner331a6a52019-05-27 16:39:22 +02003878 const wchar_t *filesystem_errors = interp->config.filesystem_errors;
Victor Stinner709d23d2019-05-02 14:56:30 -04003879 _Py_error_handler errors;
Victor Stinner331a6a52019-05-27 16:39:22 +02003880 errors = get_error_handler_wide(filesystem_errors);
Victor Stinner709d23d2019-05-02 14:56:30 -04003881 return unicode_decode_locale(s, size, errors, 0);
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003882 }
Victor Stinnerad158722010-10-27 00:25:46 +00003883#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003884}
3885
Martin v. Löwis011e8422009-05-05 04:43:17 +00003886
3887int
3888PyUnicode_FSConverter(PyObject* arg, void* addr)
3889{
Brett Cannonec6ce872016-09-06 15:50:29 -07003890 PyObject *path = NULL;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003891 PyObject *output = NULL;
3892 Py_ssize_t size;
3893 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003894 if (arg == NULL) {
3895 Py_DECREF(*(PyObject**)addr);
Benjamin Petersona4d33b32015-11-15 21:57:39 -08003896 *(PyObject**)addr = NULL;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003897 return 1;
3898 }
Brett Cannonec6ce872016-09-06 15:50:29 -07003899 path = PyOS_FSPath(arg);
3900 if (path == NULL) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03003901 return 0;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003902 }
Brett Cannonec6ce872016-09-06 15:50:29 -07003903 if (PyBytes_Check(path)) {
3904 output = path;
3905 }
3906 else { // PyOS_FSPath() guarantees its returned value is bytes or str.
3907 output = PyUnicode_EncodeFSDefault(path);
3908 Py_DECREF(path);
3909 if (!output) {
3910 return 0;
3911 }
3912 assert(PyBytes_Check(output));
3913 }
3914
Victor Stinner0ea2a462010-04-30 00:22:08 +00003915 size = PyBytes_GET_SIZE(output);
3916 data = PyBytes_AS_STRING(output);
Victor Stinner12174a52014-08-15 23:17:38 +02003917 if ((size_t)size != strlen(data)) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003918 PyErr_SetString(PyExc_ValueError, "embedded null byte");
Martin v. Löwis011e8422009-05-05 04:43:17 +00003919 Py_DECREF(output);
3920 return 0;
3921 }
3922 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003923 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003924}
3925
3926
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003927int
3928PyUnicode_FSDecoder(PyObject* arg, void* addr)
3929{
Brett Cannona5711202016-09-06 19:36:01 -07003930 int is_buffer = 0;
3931 PyObject *path = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003932 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003933 if (arg == NULL) {
3934 Py_DECREF(*(PyObject**)addr);
Serhiy Storchaka40db90c2017-04-20 21:19:31 +03003935 *(PyObject**)addr = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003936 return 1;
3937 }
Brett Cannona5711202016-09-06 19:36:01 -07003938
3939 is_buffer = PyObject_CheckBuffer(arg);
3940 if (!is_buffer) {
3941 path = PyOS_FSPath(arg);
3942 if (path == NULL) {
Serhiy Storchakafebc3322016-08-06 23:29:29 +03003943 return 0;
3944 }
Brett Cannona5711202016-09-06 19:36:01 -07003945 }
3946 else {
3947 path = arg;
3948 Py_INCREF(arg);
3949 }
3950
3951 if (PyUnicode_Check(path)) {
Brett Cannona5711202016-09-06 19:36:01 -07003952 output = path;
3953 }
3954 else if (PyBytes_Check(path) || is_buffer) {
3955 PyObject *path_bytes = NULL;
3956
3957 if (!PyBytes_Check(path) &&
3958 PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
Victor Stinner998b8062018-09-12 00:23:25 +02003959 "path should be string, bytes, or os.PathLike, not %.200s",
3960 Py_TYPE(arg)->tp_name)) {
3961 Py_DECREF(path);
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003962 return 0;
Brett Cannona5711202016-09-06 19:36:01 -07003963 }
3964 path_bytes = PyBytes_FromObject(path);
3965 Py_DECREF(path);
3966 if (!path_bytes) {
3967 return 0;
3968 }
3969 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(path_bytes),
3970 PyBytes_GET_SIZE(path_bytes));
3971 Py_DECREF(path_bytes);
3972 if (!output) {
3973 return 0;
3974 }
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003975 }
Serhiy Storchaka9305d832016-06-18 13:53:36 +03003976 else {
3977 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003978 "path should be string, bytes, or os.PathLike, not %.200s",
3979 Py_TYPE(arg)->tp_name);
Brett Cannona5711202016-09-06 19:36:01 -07003980 Py_DECREF(path);
Serhiy Storchaka9305d832016-06-18 13:53:36 +03003981 return 0;
3982 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05003983 if (PyUnicode_READY(output) == -1) {
Victor Stinner065836e2011-10-27 01:56:33 +02003984 Py_DECREF(output);
3985 return 0;
3986 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003987 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02003988 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003989 PyErr_SetString(PyExc_ValueError, "embedded null character");
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003990 Py_DECREF(output);
3991 return 0;
3992 }
3993 *(PyObject**)addr = output;
3994 return Py_CLEANUP_SUPPORTED;
3995}
3996
3997
Serhiy Storchaka2a404b62017-01-22 23:07:07 +02003998const char *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003999PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00004000{
Christian Heimesf3863112007-11-22 07:46:41 +00004001 PyObject *bytes;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004002
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00004003 if (!PyUnicode_Check(unicode)) {
4004 PyErr_BadArgument();
4005 return NULL;
4006 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004007 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00004008 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004009
Victor Stinnere90fe6a2011-10-01 16:48:13 +02004010 if (PyUnicode_UTF8(unicode) == NULL) {
4011 assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03004012 bytes = _PyUnicode_AsUTF8String(unicode, NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004013 if (bytes == NULL)
4014 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004015 _PyUnicode_UTF8(unicode) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
4016 if (_PyUnicode_UTF8(unicode) == NULL) {
Victor Stinnera5afb582013-10-29 01:28:23 +01004017 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004018 Py_DECREF(bytes);
4019 return NULL;
4020 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004021 _PyUnicode_UTF8_LENGTH(unicode) = PyBytes_GET_SIZE(bytes);
Christian Heimesf051e432016-09-13 20:22:02 +02004022 memcpy(_PyUnicode_UTF8(unicode),
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004023 PyBytes_AS_STRING(bytes),
4024 _PyUnicode_UTF8_LENGTH(unicode) + 1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004025 Py_DECREF(bytes);
4026 }
4027
4028 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02004029 *psize = PyUnicode_UTF8_LENGTH(unicode);
4030 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00004031}
4032
Serhiy Storchaka2a404b62017-01-22 23:07:07 +02004033const char *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004034PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00004035{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004036 return PyUnicode_AsUTF8AndSize(unicode, NULL);
4037}
4038
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004039Py_UNICODE *
4040PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
4041{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004042 if (!PyUnicode_Check(unicode)) {
4043 PyErr_BadArgument();
4044 return NULL;
4045 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03004046 Py_UNICODE *w = _PyUnicode_WSTR(unicode);
4047 if (w == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004048 /* Non-ASCII compact unicode object */
Serhiy Storchakac46db922018-10-23 22:58:24 +03004049 assert(_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND);
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004050 assert(PyUnicode_IS_READY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004051
Serhiy Storchakac46db922018-10-23 22:58:24 +03004052 Py_ssize_t wlen = unicode_get_widechar_size(unicode);
4053 if ((size_t)wlen > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
4054 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004055 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004056 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03004057 w = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) * (wlen + 1));
4058 if (w == NULL) {
4059 PyErr_NoMemory();
4060 return NULL;
4061 }
4062 unicode_copy_as_widechar(unicode, w, wlen + 1);
4063 _PyUnicode_WSTR(unicode) = w;
4064 if (!PyUnicode_IS_COMPACT_ASCII(unicode)) {
4065 _PyUnicode_WSTR_LENGTH(unicode) = wlen;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004066 }
4067 }
4068 if (size != NULL)
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004069 *size = PyUnicode_WSTR_LENGTH(unicode);
Serhiy Storchakac46db922018-10-23 22:58:24 +03004070 return w;
Martin v. Löwis5b222132007-06-10 09:51:05 +00004071}
4072
Alexander Belopolsky40018472011-02-26 01:02:56 +00004073Py_UNICODE *
4074PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004075{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004076 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004077}
4078
Serhiy Storchakaf7eae0a2017-06-28 08:30:06 +03004079const Py_UNICODE *
4080_PyUnicode_AsUnicode(PyObject *unicode)
4081{
4082 Py_ssize_t size;
4083 const Py_UNICODE *wstr;
4084
4085 wstr = PyUnicode_AsUnicodeAndSize(unicode, &size);
4086 if (wstr && wcslen(wstr) != (size_t)size) {
4087 PyErr_SetString(PyExc_ValueError, "embedded null character");
4088 return NULL;
4089 }
4090 return wstr;
4091}
4092
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004093
Alexander Belopolsky40018472011-02-26 01:02:56 +00004094Py_ssize_t
4095PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004096{
4097 if (!PyUnicode_Check(unicode)) {
4098 PyErr_BadArgument();
4099 goto onError;
4100 }
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02004101 if (_PyUnicode_WSTR(unicode) == NULL) {
4102 if (PyUnicode_AsUnicode(unicode) == NULL)
4103 goto onError;
4104 }
4105 return PyUnicode_WSTR_LENGTH(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004106
Benjamin Peterson29060642009-01-31 22:14:21 +00004107 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004108 return -1;
4109}
4110
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004111Py_ssize_t
4112PyUnicode_GetLength(PyObject *unicode)
4113{
Victor Stinner07621332012-06-16 04:53:46 +02004114 if (!PyUnicode_Check(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004115 PyErr_BadArgument();
4116 return -1;
4117 }
Victor Stinner07621332012-06-16 04:53:46 +02004118 if (PyUnicode_READY(unicode) == -1)
4119 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004120 return PyUnicode_GET_LENGTH(unicode);
4121}
4122
4123Py_UCS4
4124PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
4125{
Victor Stinner69ed0f42013-04-09 21:48:24 +02004126 void *data;
4127 int kind;
4128
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +03004129 if (!PyUnicode_Check(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02004130 PyErr_BadArgument();
4131 return (Py_UCS4)-1;
4132 }
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +03004133 if (PyUnicode_READY(unicode) == -1) {
4134 return (Py_UCS4)-1;
4135 }
Victor Stinnerc4b49542011-12-11 22:44:26 +01004136 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02004137 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004138 return (Py_UCS4)-1;
4139 }
Victor Stinner69ed0f42013-04-09 21:48:24 +02004140 data = PyUnicode_DATA(unicode);
4141 kind = PyUnicode_KIND(unicode);
4142 return PyUnicode_READ(kind, data, index);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004143}
4144
4145int
4146PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
4147{
4148 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02004149 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004150 return -1;
4151 }
Victor Stinner488fa492011-12-12 00:01:39 +01004152 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerc4b49542011-12-11 22:44:26 +01004153 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02004154 PyErr_SetString(PyExc_IndexError, "string index out of range");
4155 return -1;
4156 }
Victor Stinner488fa492011-12-12 00:01:39 +01004157 if (unicode_check_modifiable(unicode))
Victor Stinnercd9950f2011-10-02 00:34:53 +02004158 return -1;
Victor Stinnerc9590ad2012-03-04 01:34:37 +01004159 if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
4160 PyErr_SetString(PyExc_ValueError, "character out of range");
4161 return -1;
4162 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004163 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
4164 index, ch);
4165 return 0;
4166}
4167
/* Return the name of the default encoding.  This implementation always
   answers "utf-8"; the returned pointer refers to static storage and must
   not be freed or modified by the caller. */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
4173
Victor Stinner554f3f02010-06-16 23:33:54 +00004174/* create or adjust a UnicodeDecodeError */
4175static void
4176make_decode_exception(PyObject **exceptionObject,
4177 const char *encoding,
4178 const char *input, Py_ssize_t length,
4179 Py_ssize_t startpos, Py_ssize_t endpos,
4180 const char *reason)
4181{
4182 if (*exceptionObject == NULL) {
4183 *exceptionObject = PyUnicodeDecodeError_Create(
4184 encoding, input, length, startpos, endpos, reason);
4185 }
4186 else {
4187 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
4188 goto onError;
4189 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
4190 goto onError;
4191 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
4192 goto onError;
4193 }
4194 return;
4195
4196onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02004197 Py_CLEAR(*exceptionObject);
Victor Stinner554f3f02010-06-16 23:33:54 +00004198}
4199
#ifdef MS_WINDOWS
/* Make sure the wchar_t buffer *buf can hold `newsize` items.

   The buffer is reallocated only when `newsize` exceeds *size; *size is set
   to `newsize` in every successful case.  A temporary pointer is resized so
   that *buf is only overwritten once the resize succeeded.
   Returns 0 on success, -1 with MemoryError set on allocation failure. */
static int
widechar_resize(wchar_t **buf, Py_ssize_t *size, Py_ssize_t newsize)
{
    if (newsize > *size) {
        wchar_t *newbuf = *buf;
        if (PyMem_Resize(newbuf, wchar_t, newsize) == NULL) {
            PyErr_NoMemory();
            return -1;
        }
        *buf = newbuf;
    }
    *size = newsize;
    return 0;
}

/* error handling callback helper:
   build arguments, call the callback and check the arguments,
   if no exception occurred, copy the replacement to the output
   and adjust various state variables.

   On success the replacement text has been appended to *buf at *outpos,
   *outpos has been advanced, *endinpos holds the position returned by the
   handler and *inptr points at that position in the (possibly replaced)
   input.  *input and *inend are refreshed from the exception object.

   return 0 on success, -1 on error
*/

static int
unicode_decode_call_errorhandler_wchar(
    const char *errors, PyObject **errorHandler,
    const char *encoding, const char *reason,
    const char **input, const char **inend, Py_ssize_t *startinpos,
    Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
    wchar_t **buf, Py_ssize_t *bufsize, Py_ssize_t *outpos)
{
    /* The first three characters are the PyArg_ParseTuple() format; the
       text after the ';' doubles as the TypeError message below. */
    static const char *argparse = "Un;decoding error handler must return (str, int) tuple";

    PyObject *restuple = NULL;
    PyObject *repunicode = NULL;
    Py_ssize_t outsize;
    Py_ssize_t insize;
    Py_ssize_t requiredsize;
    Py_ssize_t newpos;
    PyObject *inputobj = NULL;
    wchar_t *repwstr;
    Py_ssize_t repwlen;

    /* Look the handler up lazily and cache it for the caller. */
    if (*errorHandler == NULL) {
        *errorHandler = PyCodec_LookupError(errors);
        if (*errorHandler == NULL)
            goto onError;
    }

    make_decode_exception(exceptionObject,
        encoding,
        *input, *inend - *input,
        *startinpos, *endinpos,
        reason);
    if (*exceptionObject == NULL)
        goto onError;

    restuple = _PyObject_CallOneArg(*errorHandler, *exceptionObject);
    if (restuple == NULL)
        goto onError;
    if (!PyTuple_Check(restuple)) {
        PyErr_SetString(PyExc_TypeError, &argparse[3]);
        goto onError;
    }
    if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
        goto onError;

    /* Copy back the bytes variables, which might have been modified by the
       callback */
    inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
    if (!inputobj)
        goto onError;
    *input = PyBytes_AS_STRING(inputobj);
    insize = PyBytes_GET_SIZE(inputobj);
    *inend = *input + insize;
    /* we can DECREF safely, as the exception has another reference,
       so the object won't go away. */
    Py_DECREF(inputobj);

    /* A negative position counts from the end of the input. */
    if (newpos<0)
        newpos = insize+newpos;
    if (newpos<0 || newpos>insize) {
        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
        goto onError;
    }

    repwstr = PyUnicode_AsUnicodeAndSize(repunicode, &repwlen);
    if (repwstr == NULL)
        goto onError;
    /* need more space? (at least enough for what we
       have+the replacement+the rest of the string (starting
       at the new input position), so we won't have to check space
       when there are no errors in the rest of the string) */
    requiredsize = *outpos;
    if (requiredsize > PY_SSIZE_T_MAX - repwlen)
        goto overflow;
    requiredsize += repwlen;
    if (requiredsize > PY_SSIZE_T_MAX - (insize - newpos))
        goto overflow;
    requiredsize += insize - newpos;
    outsize = *bufsize;
    if (requiredsize > outsize) {
        /* Grow at least geometrically when that cannot overflow. */
        if (outsize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*outsize)
            requiredsize = 2*outsize;
        if (widechar_resize(buf, bufsize, requiredsize) < 0) {
            goto onError;
        }
    }
    /* Copy exactly repwlen wide characters.  memcpy() is used instead of
       wcsncpy() because wcsncpy() stops at the first L'\0' of the source and
       zero-fills the remainder, which would corrupt a replacement string
       containing an embedded null character. */
    if (repwlen > 0) {
        memcpy(*buf + *outpos, repwstr, (size_t)repwlen * sizeof(wchar_t));
    }
    *outpos += repwlen;
    *endinpos = newpos;
    *inptr = *input + newpos;

    /* we made it! */
    Py_DECREF(restuple);
    return 0;

  overflow:
    PyErr_SetString(PyExc_OverflowError,
                    "decoded result is too long for a Python string");

  onError:
    Py_XDECREF(restuple);
    return -1;
}
#endif   /* MS_WINDOWS */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004326
4327static int
4328unicode_decode_call_errorhandler_writer(
4329 const char *errors, PyObject **errorHandler,
4330 const char *encoding, const char *reason,
4331 const char **input, const char **inend, Py_ssize_t *startinpos,
4332 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
4333 _PyUnicodeWriter *writer /* PyObject **output, Py_ssize_t *outpos */)
4334{
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004335 static const char *argparse = "Un;decoding error handler must return (str, int) tuple";
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004336
4337 PyObject *restuple = NULL;
4338 PyObject *repunicode = NULL;
4339 Py_ssize_t insize;
4340 Py_ssize_t newpos;
Victor Stinner170ca6f2013-04-18 00:25:28 +02004341 Py_ssize_t replen;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004342 Py_ssize_t remain;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004343 PyObject *inputobj = NULL;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004344 int need_to_grow = 0;
4345 const char *new_inptr;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004346
4347 if (*errorHandler == NULL) {
4348 *errorHandler = PyCodec_LookupError(errors);
4349 if (*errorHandler == NULL)
4350 goto onError;
4351 }
4352
4353 make_decode_exception(exceptionObject,
4354 encoding,
4355 *input, *inend - *input,
4356 *startinpos, *endinpos,
4357 reason);
4358 if (*exceptionObject == NULL)
4359 goto onError;
4360
Jeroen Demeyer196a5302019-07-04 12:31:34 +02004361 restuple = _PyObject_CallOneArg(*errorHandler, *exceptionObject);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004362 if (restuple == NULL)
4363 goto onError;
4364 if (!PyTuple_Check(restuple)) {
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004365 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004366 goto onError;
4367 }
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004368 if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004369 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004370
4371 /* Copy back the bytes variables, which might have been modified by the
4372 callback */
4373 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4374 if (!inputobj)
4375 goto onError;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004376 remain = *inend - *input - *endinpos;
Christian Heimes72b710a2008-05-26 13:28:38 +00004377 *input = PyBytes_AS_STRING(inputobj);
4378 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004379 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00004380 /* we can DECREF safely, as the exception has another reference,
4381 so the object won't go away. */
4382 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004383
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004384 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004385 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004386 if (newpos<0 || newpos>insize) {
Victor Stinnera33bce02014-07-04 22:47:46 +02004387 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00004388 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004389 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004390
Victor Stinner170ca6f2013-04-18 00:25:28 +02004391 replen = PyUnicode_GET_LENGTH(repunicode);
Serhiy Storchaka7e4b9052015-01-26 01:22:54 +02004392 if (replen > 1) {
4393 writer->min_length += replen - 1;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004394 need_to_grow = 1;
4395 }
4396 new_inptr = *input + newpos;
4397 if (*inend - new_inptr > remain) {
4398 /* We don't know the decoding algorithm here so we make the worst
4399 assumption that one byte decodes to one unicode character.
4400 If unfortunately one byte could decode to more unicode characters,
4401 the decoder may write out-of-bound then. Is it possible for the
4402 algorithms using this function? */
4403 writer->min_length += *inend - new_inptr - remain;
4404 need_to_grow = 1;
4405 }
4406 if (need_to_grow) {
Victor Stinner8f674cc2013-04-17 23:02:17 +02004407 writer->overallocate = 1;
Serhiy Storchakab7e2d672018-02-13 08:27:33 +02004408 if (_PyUnicodeWriter_Prepare(writer, writer->min_length - writer->pos,
Serhiy Storchaka7e4b9052015-01-26 01:22:54 +02004409 PyUnicode_MAX_CHAR_VALUE(repunicode)) == -1)
4410 goto onError;
4411 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004412 if (_PyUnicodeWriter_WriteStr(writer, repunicode) == -1)
Victor Stinner376cfa12013-04-17 23:58:16 +02004413 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004414
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004415 *endinpos = newpos;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004416 *inptr = new_inptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004417
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004418 /* we made it! */
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004419 Py_DECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004420 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004421
Benjamin Peterson29060642009-01-31 22:14:21 +00004422 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004423 Py_XDECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004424 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004425}
4426
/* --- UTF-7 Codec -------------------------------------------------------- */

/* See RFC2152 for details.  We encode conservatively and decode liberally. */

/* Three simple macros defining base-64.
 *
 * NOTE: these macros evaluate their argument more than once; pass only
 * expressions without side effects. */

/* Is c a base-64 character? */

#define IS_BASE64(c) \
    (((c) >= 'A' && (c) <= 'Z') ||     \
     ((c) >= 'a' && (c) <= 'z') ||     \
     ((c) >= '0' && (c) <= '9') ||     \
     (c) == '+' || (c) == '/')

/* given that c is a base-64 character, what is its base-64 value? */

#define FROM_BASE64(c)                                                  \
    (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :                           \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :                      \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :                      \
     (c) == '+' ? 62 : 63)

/* What is the base-64 character of the bottom 6 bits of n? */

#define TO_BASE64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
 * decoded as itself.  We are permissive on decoding; the only ASCII
 * byte not decoding to itself is the + which begins a base64
 * string. */

#define DECODE_DIRECT(c)                                \
    ((c) <= 127 && (c) != '+')

/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is only ever read, so it is declared const.
 */

static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * c must lie in 1..127 to be encoded directly; directO and directWS are
 * parenthesized so that compound argument expressions keep their
 * meaning inside the expansion. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Alexander Belopolsky40018472011-02-26 01:02:56 +00004508PyObject *
4509PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004510 Py_ssize_t size,
4511 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004512{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004513 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
4514}
4515
Antoine Pitrou244651a2009-05-04 18:56:13 +00004516/* The decoder. The only state we preserve is our read position,
4517 * i.e. how many characters we have consumed. So if we end in the
4518 * middle of a shift sequence we have to back off the read position
4519 * and the output to the beginning of the sequence, otherwise we lose
4520 * all the shift state (seen bits, number of bits seen, high
4521 * surrogate). */
4522
Alexander Belopolsky40018472011-02-26 01:02:56 +00004523PyObject *
4524PyUnicode_DecodeUTF7Stateful(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004525 Py_ssize_t size,
4526 const char *errors,
4527 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004528{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004529 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004530 Py_ssize_t startinpos;
4531 Py_ssize_t endinpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004532 const char *e;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004533 _PyUnicodeWriter writer;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004534 const char *errmsg = "";
4535 int inShift = 0;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004536 Py_ssize_t shiftOutStart;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004537 unsigned int base64bits = 0;
4538 unsigned long base64buffer = 0;
Victor Stinner24729f32011-11-10 20:31:37 +01004539 Py_UCS4 surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004540 PyObject *errorHandler = NULL;
4541 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004542
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004543 if (size == 0) {
4544 if (consumed)
4545 *consumed = 0;
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02004546 _Py_RETURN_UNICODE_EMPTY();
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004547 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004548
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004549 /* Start off assuming it's all ASCII. Widen later as necessary. */
Victor Stinner8f674cc2013-04-17 23:02:17 +02004550 _PyUnicodeWriter_Init(&writer);
4551 writer.min_length = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004552
4553 shiftOutStart = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004554 e = s + size;
4555
4556 while (s < e) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004557 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00004558 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00004559 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004560
Antoine Pitrou244651a2009-05-04 18:56:13 +00004561 if (inShift) { /* in a base-64 section */
4562 if (IS_BASE64(ch)) { /* consume a base-64 character */
4563 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
4564 base64bits += 6;
4565 s++;
4566 if (base64bits >= 16) {
4567 /* we have enough bits for a UTF-16 value */
Victor Stinner24729f32011-11-10 20:31:37 +01004568 Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
Antoine Pitrou244651a2009-05-04 18:56:13 +00004569 base64bits -= 16;
4570 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
Serhiy Storchaka35804e42013-10-19 20:38:19 +03004571 assert(outCh <= 0xffff);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004572 if (surrogate) {
4573 /* expecting a second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004574 if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
4575 Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004576 if (_PyUnicodeWriter_WriteCharInline(&writer, ch2) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004577 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004578 surrogate = 0;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01004579 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004580 }
4581 else {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004582 if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
Antoine Pitrou78edf752011-11-15 01:44:16 +01004583 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004584 surrogate = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004585 }
4586 }
Victor Stinner551ac952011-11-29 22:58:13 +01004587 if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004588 /* first surrogate */
4589 surrogate = outCh;
4590 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004591 else {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004592 if (_PyUnicodeWriter_WriteCharInline(&writer, outCh) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004593 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004594 }
4595 }
4596 }
4597 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004598 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004599 if (base64bits > 0) { /* left-over bits */
4600 if (base64bits >= 6) {
4601 /* We've seen at least one base-64 character */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004602 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004603 errmsg = "partial character in shift sequence";
4604 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004605 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004606 else {
4607 /* Some bits remain; they should be zero */
4608 if (base64buffer != 0) {
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004609 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004610 errmsg = "non-zero padding bits in shift sequence";
4611 goto utf7Error;
4612 }
4613 }
4614 }
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004615 if (surrogate && DECODE_DIRECT(ch)) {
4616 if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
4617 goto onError;
4618 }
4619 surrogate = 0;
4620 if (ch == '-') {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004621 /* '-' is absorbed; other terminating
4622 characters are preserved */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004623 s++;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004624 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004625 }
4626 }
4627 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004628 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004629 s++; /* consume '+' */
4630 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004631 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004632 if (_PyUnicodeWriter_WriteCharInline(&writer, '+') < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004633 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004634 }
Zackery Spytze349bf22018-08-18 22:43:38 -06004635 else if (s < e && !IS_BASE64(*s)) {
4636 s++;
4637 errmsg = "ill-formed sequence";
4638 goto utf7Error;
4639 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004640 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004641 inShift = 1;
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004642 surrogate = 0;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004643 shiftOutStart = writer.pos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004644 base64bits = 0;
Serhiy Storchaka35804e42013-10-19 20:38:19 +03004645 base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004646 }
4647 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004648 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004649 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004650 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004651 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004652 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004653 else {
4654 startinpos = s-starts;
4655 s++;
4656 errmsg = "unexpected special character";
4657 goto utf7Error;
4658 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004659 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004660utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004661 endinpos = s-starts;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004662 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00004663 errors, &errorHandler,
4664 "utf7", errmsg,
4665 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004666 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00004667 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004668 }
4669
Antoine Pitrou244651a2009-05-04 18:56:13 +00004670 /* end of string */
4671
4672 if (inShift && !consumed) { /* in shift sequence, no more to follow */
4673 /* if we're in an inconsistent state, that's an error */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004674 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004675 if (surrogate ||
4676 (base64bits >= 6) ||
4677 (base64bits > 0 && base64buffer != 0)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004678 endinpos = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004679 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrou244651a2009-05-04 18:56:13 +00004680 errors, &errorHandler,
4681 "utf7", "unterminated shift sequence",
4682 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004683 &writer))
Antoine Pitrou244651a2009-05-04 18:56:13 +00004684 goto onError;
4685 if (s < e)
4686 goto restart;
4687 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004688 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004689
4690 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004691 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004692 if (inShift) {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004693 *consumed = startinpos;
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004694 if (writer.pos != shiftOutStart && writer.maxchar > 127) {
Serhiy Storchaka016a3f32014-02-08 14:01:29 +02004695 PyObject *result = PyUnicode_FromKindAndData(
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004696 writer.kind, writer.data, shiftOutStart);
4697 Py_XDECREF(errorHandler);
4698 Py_XDECREF(exc);
4699 _PyUnicodeWriter_Dealloc(&writer);
4700 return result;
Serhiy Storchaka016a3f32014-02-08 14:01:29 +02004701 }
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004702 writer.pos = shiftOutStart; /* back off output */
Antoine Pitrou244651a2009-05-04 18:56:13 +00004703 }
4704 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004705 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004706 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004707 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004708
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004709 Py_XDECREF(errorHandler);
4710 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004711 return _PyUnicodeWriter_Finish(&writer);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004712
Benjamin Peterson29060642009-01-31 22:14:21 +00004713 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004714 Py_XDECREF(errorHandler);
4715 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004716 _PyUnicodeWriter_Dealloc(&writer);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004717 return NULL;
4718}
4719
4720
Alexander Belopolsky40018472011-02-26 01:02:56 +00004721PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004722_PyUnicode_EncodeUTF7(PyObject *str,
4723 int base64SetO,
4724 int base64WhiteSpace,
4725 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004726{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004727 int kind;
4728 void *data;
4729 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004730 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004731 int inShift = 0;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004732 Py_ssize_t i;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004733 unsigned int base64bits = 0;
4734 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004735 char * out;
4736 char * start;
4737
Benjamin Petersonbac79492012-01-14 13:34:47 -05004738 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004739 return NULL;
4740 kind = PyUnicode_KIND(str);
4741 data = PyUnicode_DATA(str);
4742 len = PyUnicode_GET_LENGTH(str);
4743
4744 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004745 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004746
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004747 /* It might be possible to tighten this worst case */
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004748 if (len > PY_SSIZE_T_MAX / 8)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004749 return PyErr_NoMemory();
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004750 v = PyBytes_FromStringAndSize(NULL, len * 8);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004751 if (v == NULL)
4752 return NULL;
4753
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004754 start = out = PyBytes_AS_STRING(v);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004755 for (i = 0; i < len; ++i) {
Victor Stinner0e368262011-11-10 20:12:49 +01004756 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004757
Antoine Pitrou244651a2009-05-04 18:56:13 +00004758 if (inShift) {
4759 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4760 /* shifting out */
4761 if (base64bits) { /* output remaining bits */
4762 *out++ = TO_BASE64(base64buffer << (6-base64bits));
4763 base64buffer = 0;
4764 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004765 }
4766 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004767 /* Characters not in the BASE64 set implicitly unshift the sequence
4768 so no '-' is required, except if the character is itself a '-' */
4769 if (IS_BASE64(ch) || ch == '-') {
4770 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004771 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004772 *out++ = (char) ch;
4773 }
4774 else {
4775 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00004776 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004777 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004778 else { /* not in a shift sequence */
4779 if (ch == '+') {
4780 *out++ = '+';
4781 *out++ = '-';
4782 }
4783 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4784 *out++ = (char) ch;
4785 }
4786 else {
4787 *out++ = '+';
4788 inShift = 1;
4789 goto encode_char;
4790 }
4791 }
4792 continue;
4793encode_char:
Antoine Pitrou244651a2009-05-04 18:56:13 +00004794 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01004795 assert(ch <= MAX_UNICODE);
Victor Stinner0d3721d2011-11-22 03:27:53 +01004796
Antoine Pitrou244651a2009-05-04 18:56:13 +00004797 /* code first surrogate */
4798 base64bits += 16;
Victor Stinner76df43d2012-10-30 01:42:39 +01004799 base64buffer = (base64buffer << 16) | Py_UNICODE_HIGH_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004800 while (base64bits >= 6) {
4801 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4802 base64bits -= 6;
4803 }
4804 /* prepare second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004805 ch = Py_UNICODE_LOW_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004806 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004807 base64bits += 16;
4808 base64buffer = (base64buffer << 16) | ch;
4809 while (base64bits >= 6) {
4810 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4811 base64bits -= 6;
4812 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00004813 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004814 if (base64bits)
4815 *out++= TO_BASE64(base64buffer << (6-base64bits) );
4816 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004817 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004818 if (_PyBytes_Resize(&v, out - start) < 0)
4819 return NULL;
4820 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004821}
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004822PyObject *
4823PyUnicode_EncodeUTF7(const Py_UNICODE *s,
4824 Py_ssize_t size,
4825 int base64SetO,
4826 int base64WhiteSpace,
4827 const char *errors)
4828{
4829 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02004830 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004831 if (tmp == NULL)
4832 return NULL;
Victor Stinner0e368262011-11-10 20:12:49 +01004833 result = _PyUnicode_EncodeUTF7(tmp, base64SetO,
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004834 base64WhiteSpace, errors);
4835 Py_DECREF(tmp);
4836 return result;
4837}
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004838
Antoine Pitrou244651a2009-05-04 18:56:13 +00004839#undef IS_BASE64
4840#undef FROM_BASE64
4841#undef TO_BASE64
4842#undef DECODE_DIRECT
4843#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004844
Guido van Rossumd57fd912000-03-10 22:53:23 +00004845/* --- UTF-8 Codec -------------------------------------------------------- */
4846
Alexander Belopolsky40018472011-02-26 01:02:56 +00004847PyObject *
4848PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004849 Py_ssize_t size,
4850 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004851{
Walter Dörwald69652032004-09-07 20:24:22 +00004852 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4853}
4854
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004855#include "stringlib/asciilib.h"
4856#include "stringlib/codecs.h"
4857#include "stringlib/undef.h"
4858
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004859#include "stringlib/ucs1lib.h"
4860#include "stringlib/codecs.h"
4861#include "stringlib/undef.h"
4862
4863#include "stringlib/ucs2lib.h"
4864#include "stringlib/codecs.h"
4865#include "stringlib/undef.h"
4866
4867#include "stringlib/ucs4lib.h"
4868#include "stringlib/codecs.h"
4869#include "stringlib/undef.h"
4870
Antoine Pitrouab868312009-01-10 15:40:25 +00004871/* Mask to quickly check whether a C 'long' contains a
4872 non-ASCII, UTF8-encoded char. */
4873#if (SIZEOF_LONG == 8)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004874# define ASCII_CHAR_MASK 0x8080808080808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004875#elif (SIZEOF_LONG == 4)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004876# define ASCII_CHAR_MASK 0x80808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004877#else
4878# error C 'long' size should be either 4 or 8!
4879#endif
4880
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004881static Py_ssize_t
4882ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004883{
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004884 const char *p = start;
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004885 const char *aligned_end = (const char *) _Py_ALIGN_DOWN(end, SIZEOF_LONG);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004886
Antoine Pitrou8b0e9842013-05-11 15:58:34 +02004887 /*
4888 * Issue #17237: m68k is a bit different from most architectures in
4889 * that objects do not use "natural alignment" - for example, int and
4890 * long are only aligned at 2-byte boundaries. Therefore the assert()
4891 * won't work; also, tests have shown that skipping the "optimised
4892 * version" will even speed up m68k.
4893 */
4894#if !defined(__m68k__)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004895#if SIZEOF_LONG <= SIZEOF_VOID_P
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004896 assert(_Py_IS_ALIGNED(dest, SIZEOF_LONG));
4897 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004898 /* Fast path, see in STRINGLIB(utf8_decode) for
4899 an explanation. */
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02004900 /* Help allocation */
4901 const char *_p = p;
4902 Py_UCS1 * q = dest;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004903 while (_p < aligned_end) {
4904 unsigned long value = *(const unsigned long *) _p;
4905 if (value & ASCII_CHAR_MASK)
Benjamin Peterson29060642009-01-31 22:14:21 +00004906 break;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004907 *((unsigned long *)q) = value;
4908 _p += SIZEOF_LONG;
4909 q += SIZEOF_LONG;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004910 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004911 p = _p;
4912 while (p < end) {
4913 if ((unsigned char)*p & 0x80)
4914 break;
4915 *q++ = *p++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004916 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004917 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004918 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004919#endif
Antoine Pitrou8b0e9842013-05-11 15:58:34 +02004920#endif
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004921 while (p < end) {
4922 /* Fast path, see in STRINGLIB(utf8_decode) in stringlib/codecs.h
4923 for an explanation. */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004924 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02004925 /* Help allocation */
4926 const char *_p = p;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004927 while (_p < aligned_end) {
4928 unsigned long value = *(unsigned long *) _p;
4929 if (value & ASCII_CHAR_MASK)
4930 break;
4931 _p += SIZEOF_LONG;
4932 }
4933 p = _p;
4934 if (_p == end)
4935 break;
4936 }
4937 if ((unsigned char)*p & 0x80)
4938 break;
4939 ++p;
4940 }
4941 memcpy(dest, start, p - start);
4942 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004943}
Antoine Pitrouab868312009-01-10 15:40:25 +00004944
Victor Stinner709d23d2019-05-02 14:56:30 -04004945static PyObject *
4946unicode_decode_utf8(const char *s, Py_ssize_t size,
4947 _Py_error_handler error_handler, const char *errors,
4948 Py_ssize_t *consumed)
Victor Stinner785938e2011-12-11 20:09:03 +01004949{
Victor Stinner785938e2011-12-11 20:09:03 +01004950 if (size == 0) {
4951 if (consumed)
4952 *consumed = 0;
Serhiy Storchaka678db842013-01-26 12:16:36 +02004953 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner785938e2011-12-11 20:09:03 +01004954 }
4955
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004956 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
4957 if (size == 1 && (unsigned char)s[0] < 128) {
Victor Stinner785938e2011-12-11 20:09:03 +01004958 if (consumed)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004959 *consumed = 1;
4960 return get_latin1_char((unsigned char)s[0]);
Victor Stinner785938e2011-12-11 20:09:03 +01004961 }
4962
Inada Naoki770847a2019-06-24 12:30:24 +09004963 const char *starts = s;
4964 const char *end = s + size;
Victor Stinner785938e2011-12-11 20:09:03 +01004965
Inada Naoki770847a2019-06-24 12:30:24 +09004966 // fast path: try ASCII string.
4967 PyObject *u = PyUnicode_New(size, 127);
4968 if (u == NULL) {
4969 return NULL;
4970 }
4971 s += ascii_decode(s, end, PyUnicode_DATA(u));
4972 if (s == end) {
4973 return u;
4974 }
4975
4976 // Use _PyUnicodeWriter after fast path is failed.
4977 _PyUnicodeWriter writer;
4978 _PyUnicodeWriter_InitWithBuffer(&writer, u);
4979 writer.pos = s - starts;
4980
4981 Py_ssize_t startinpos, endinpos;
4982 const char *errmsg = "";
4983 PyObject *error_handler_obj = NULL;
4984 PyObject *exc = NULL;
4985
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004986 while (s < end) {
4987 Py_UCS4 ch;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004988 int kind = writer.kind;
Victor Stinner1d65d912015-10-05 13:43:50 +02004989
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004990 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004991 if (PyUnicode_IS_ASCII(writer.buffer))
4992 ch = asciilib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004993 else
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004994 ch = ucs1lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004995 } else if (kind == PyUnicode_2BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004996 ch = ucs2lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004997 } else {
4998 assert(kind == PyUnicode_4BYTE_KIND);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004999 ch = ucs4lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005000 }
5001
5002 switch (ch) {
5003 case 0:
5004 if (s == end || consumed)
5005 goto End;
5006 errmsg = "unexpected end of data";
5007 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02005008 endinpos = end - starts;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005009 break;
5010 case 1:
5011 errmsg = "invalid start byte";
5012 startinpos = s - starts;
5013 endinpos = startinpos + 1;
5014 break;
5015 case 2:
Serhiy Storchaka894263b2019-06-25 11:54:18 +03005016 if (consumed && (unsigned char)s[0] == 0xED && end - s == 2
5017 && (unsigned char)s[1] >= 0xA0 && (unsigned char)s[1] <= 0xBF)
5018 {
5019 /* Truncated surrogate code in range D800-DFFF */
Serhiy Storchaka7a465cb2019-03-30 08:23:38 +02005020 goto End;
5021 }
Serhiy Storchaka894263b2019-06-25 11:54:18 +03005022 /* fall through */
5023 case 3:
5024 case 4:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005025 errmsg = "invalid continuation byte";
5026 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02005027 endinpos = startinpos + ch - 1;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005028 break;
5029 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005030 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005031 goto onError;
5032 continue;
5033 }
5034
Victor Stinner1d65d912015-10-05 13:43:50 +02005035 if (error_handler == _Py_ERROR_UNKNOWN)
Victor Stinner3d4226a2018-08-29 22:21:32 +02005036 error_handler = _Py_GetErrorHandler(errors);
Victor Stinner1d65d912015-10-05 13:43:50 +02005037
5038 switch (error_handler) {
5039 case _Py_ERROR_IGNORE:
5040 s += (endinpos - startinpos);
5041 break;
5042
5043 case _Py_ERROR_REPLACE:
5044 if (_PyUnicodeWriter_WriteCharInline(&writer, 0xfffd) < 0)
5045 goto onError;
5046 s += (endinpos - startinpos);
5047 break;
5048
5049 case _Py_ERROR_SURROGATEESCAPE:
Victor Stinner74e8fac2015-10-05 13:49:26 +02005050 {
5051 Py_ssize_t i;
5052
Victor Stinner1d65d912015-10-05 13:43:50 +02005053 if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
5054 goto onError;
Victor Stinner74e8fac2015-10-05 13:49:26 +02005055 for (i=startinpos; i<endinpos; i++) {
Victor Stinner1d65d912015-10-05 13:43:50 +02005056 ch = (Py_UCS4)(unsigned char)(starts[i]);
5057 PyUnicode_WRITE(writer.kind, writer.data, writer.pos,
5058 ch + 0xdc00);
5059 writer.pos++;
5060 }
5061 s += (endinpos - startinpos);
5062 break;
Victor Stinner74e8fac2015-10-05 13:49:26 +02005063 }
Victor Stinner1d65d912015-10-05 13:43:50 +02005064
5065 default:
5066 if (unicode_decode_call_errorhandler_writer(
5067 errors, &error_handler_obj,
5068 "utf-8", errmsg,
5069 &starts, &end, &startinpos, &endinpos, &exc, &s,
5070 &writer))
5071 goto onError;
5072 }
Victor Stinner785938e2011-12-11 20:09:03 +01005073 }
5074
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005075End:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005076 if (consumed)
5077 *consumed = s - starts;
5078
Victor Stinner1d65d912015-10-05 13:43:50 +02005079 Py_XDECREF(error_handler_obj);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005080 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005081 return _PyUnicodeWriter_Finish(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005082
5083onError:
Victor Stinner1d65d912015-10-05 13:43:50 +02005084 Py_XDECREF(error_handler_obj);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005085 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005086 _PyUnicodeWriter_Dealloc(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005087 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01005088}
5089
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005090
Victor Stinner709d23d2019-05-02 14:56:30 -04005091PyObject *
5092PyUnicode_DecodeUTF8Stateful(const char *s,
5093 Py_ssize_t size,
5094 const char *errors,
5095 Py_ssize_t *consumed)
5096{
5097 return unicode_decode_utf8(s, size, _Py_ERROR_UNKNOWN, errors, consumed);
5098}
5099
5100
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005101/* UTF-8 decoder: use surrogateescape error handler if 'surrogateescape' is
5102 non-zero, use strict error handler otherwise.
Victor Stinner0d92c4f2012-11-12 23:32:21 +01005103
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005104 On success, write a pointer to a newly allocated wide character string into
5105 *wstr (use PyMem_RawFree() to free the memory) and write the output length
5106 (in number of wchar_t units) into *wlen (if wlen is set).
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005107
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005108 On memory allocation failure, return -1.
5109
5110 On decoding error (if surrogateescape is zero), return -2. If wlen is
5111 non-NULL, write the start of the illegal byte sequence into *wlen. If reason
5112 is not NULL, write the decoding error message into *reason. */
5113int
5114_Py_DecodeUTF8Ex(const char *s, Py_ssize_t size, wchar_t **wstr, size_t *wlen,
Victor Stinner3d4226a2018-08-29 22:21:32 +02005115 const char **reason, _Py_error_handler errors)
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005116{
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005117 const char *orig_s = s;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005118 const char *e;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005119 wchar_t *unicode;
5120 Py_ssize_t outpos;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005121
Victor Stinner3d4226a2018-08-29 22:21:32 +02005122 int surrogateescape = 0;
5123 int surrogatepass = 0;
5124 switch (errors)
5125 {
5126 case _Py_ERROR_STRICT:
5127 break;
5128 case _Py_ERROR_SURROGATEESCAPE:
5129 surrogateescape = 1;
5130 break;
5131 case _Py_ERROR_SURROGATEPASS:
5132 surrogatepass = 1;
5133 break;
5134 default:
5135 return -3;
5136 }
5137
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005138 /* Note: size will always be longer than the resulting Unicode
5139 character count */
Victor Stinner91106cd2017-12-13 12:29:09 +01005140 if (PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) < (size + 1)) {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005141 return -1;
Victor Stinner91106cd2017-12-13 12:29:09 +01005142 }
5143
Victor Stinner6f8eeee2013-07-07 22:57:45 +02005144 unicode = PyMem_RawMalloc((size + 1) * sizeof(wchar_t));
Victor Stinner91106cd2017-12-13 12:29:09 +01005145 if (!unicode) {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005146 return -1;
Victor Stinner91106cd2017-12-13 12:29:09 +01005147 }
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005148
5149 /* Unpack UTF-8 encoded data */
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005150 e = s + size;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005151 outpos = 0;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005152 while (s < e) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005153 Py_UCS4 ch;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005154#if SIZEOF_WCHAR_T == 4
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005155 ch = ucs4lib_utf8_decode(&s, e, (Py_UCS4 *)unicode, &outpos);
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005156#else
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005157 ch = ucs2lib_utf8_decode(&s, e, (Py_UCS2 *)unicode, &outpos);
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005158#endif
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005159 if (ch > 0xFF) {
5160#if SIZEOF_WCHAR_T == 4
Barry Warsawb2e57942017-09-14 18:13:16 -07005161 Py_UNREACHABLE();
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005162#else
Serhiy Storchakab6266432016-11-12 14:28:06 +02005163 assert(ch > 0xFFFF && ch <= MAX_UNICODE);
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005164 /* write a surrogate pair */
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005165 unicode[outpos++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
5166 unicode[outpos++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
5167#endif
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005168 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005169 else {
Victor Stinner3d4226a2018-08-29 22:21:32 +02005170 if (!ch && s == e) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005171 break;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005172 }
Victor Stinner3d4226a2018-08-29 22:21:32 +02005173
5174 if (surrogateescape) {
5175 unicode[outpos++] = 0xDC00 + (unsigned char)*s++;
5176 }
5177 else {
5178 /* Is it a valid three-byte code? */
5179 if (surrogatepass
5180 && (e - s) >= 3
5181 && (s[0] & 0xf0) == 0xe0
5182 && (s[1] & 0xc0) == 0x80
5183 && (s[2] & 0xc0) == 0x80)
5184 {
5185 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
5186 s += 3;
5187 unicode[outpos++] = ch;
5188 }
5189 else {
5190 PyMem_RawFree(unicode );
5191 if (reason != NULL) {
5192 switch (ch) {
5193 case 0:
5194 *reason = "unexpected end of data";
5195 break;
5196 case 1:
5197 *reason = "invalid start byte";
5198 break;
5199 /* 2, 3, 4 */
5200 default:
5201 *reason = "invalid continuation byte";
5202 break;
5203 }
5204 }
5205 if (wlen != NULL) {
5206 *wlen = s - orig_s;
5207 }
5208 return -2;
5209 }
5210 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005211 }
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005212 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005213 unicode[outpos] = L'\0';
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005214 if (wlen) {
5215 *wlen = outpos;
Victor Stinner91106cd2017-12-13 12:29:09 +01005216 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005217 *wstr = unicode;
5218 return 0;
5219}
5220
Victor Stinner5f9cf232019-03-19 01:46:25 +01005221
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005222wchar_t*
Victor Stinner5f9cf232019-03-19 01:46:25 +01005223_Py_DecodeUTF8_surrogateescape(const char *arg, Py_ssize_t arglen,
5224 size_t *wlen)
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005225{
5226 wchar_t *wstr;
Victor Stinner5f9cf232019-03-19 01:46:25 +01005227 int res = _Py_DecodeUTF8Ex(arg, arglen,
5228 &wstr, wlen,
5229 NULL, _Py_ERROR_SURROGATEESCAPE);
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005230 if (res != 0) {
Victor Stinner5f9cf232019-03-19 01:46:25 +01005231 /* _Py_DecodeUTF8Ex() must support _Py_ERROR_SURROGATEESCAPE */
5232 assert(res != -3);
5233 if (wlen) {
5234 *wlen = (size_t)res;
5235 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005236 return NULL;
5237 }
5238 return wstr;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005239}
5240
Antoine Pitrouab868312009-01-10 15:40:25 +00005241
/* UTF-8 encoder for wide character strings.

   'errors' must be _Py_ERROR_STRICT, _Py_ERROR_SURROGATEESCAPE or
   _Py_ERROR_SURROGATEPASS; for any other error handler, return -3.

   On success, return 0 and write the newly allocated character string into
   *str (use PyMem_RawFree() to free the memory if raw_malloc is non-zero,
   PyMem_Free() otherwise).

   On encoding failure, return -2 and write the position of the invalid
   surrogate character into *error_pos (if error_pos is set) and the encoding
   error message into *reason (if reason is set).

   On memory allocation failure, return -1. */
5252int
5253_Py_EncodeUTF8Ex(const wchar_t *text, char **str, size_t *error_pos,
Victor Stinner3d4226a2018-08-29 22:21:32 +02005254 const char **reason, int raw_malloc, _Py_error_handler errors)
Victor Stinnere47e6982017-12-21 15:45:16 +01005255{
5256 const Py_ssize_t max_char_size = 4;
5257 Py_ssize_t len = wcslen(text);
5258
5259 assert(len >= 0);
5260
Victor Stinner3d4226a2018-08-29 22:21:32 +02005261 int surrogateescape = 0;
5262 int surrogatepass = 0;
5263 switch (errors)
5264 {
5265 case _Py_ERROR_STRICT:
5266 break;
5267 case _Py_ERROR_SURROGATEESCAPE:
5268 surrogateescape = 1;
5269 break;
5270 case _Py_ERROR_SURROGATEPASS:
5271 surrogatepass = 1;
5272 break;
5273 default:
5274 return -3;
5275 }
5276
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005277 if (len > PY_SSIZE_T_MAX / max_char_size - 1) {
5278 return -1;
5279 }
Victor Stinnere47e6982017-12-21 15:45:16 +01005280 char *bytes;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005281 if (raw_malloc) {
5282 bytes = PyMem_RawMalloc((len + 1) * max_char_size);
Victor Stinnere47e6982017-12-21 15:45:16 +01005283 }
5284 else {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005285 bytes = PyMem_Malloc((len + 1) * max_char_size);
Victor Stinnere47e6982017-12-21 15:45:16 +01005286 }
5287 if (bytes == NULL) {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005288 return -1;
Victor Stinnere47e6982017-12-21 15:45:16 +01005289 }
5290
5291 char *p = bytes;
5292 Py_ssize_t i;
Victor Stinner3d4226a2018-08-29 22:21:32 +02005293 for (i = 0; i < len; ) {
5294 Py_ssize_t ch_pos = i;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005295 Py_UCS4 ch = text[i];
Victor Stinner3d4226a2018-08-29 22:21:32 +02005296 i++;
5297#if Py_UNICODE_SIZE == 2
5298 if (Py_UNICODE_IS_HIGH_SURROGATE(ch)
5299 && i < len
5300 && Py_UNICODE_IS_LOW_SURROGATE(text[i]))
5301 {
5302 ch = Py_UNICODE_JOIN_SURROGATES(ch, text[i]);
5303 i++;
5304 }
5305#endif
Victor Stinnere47e6982017-12-21 15:45:16 +01005306
5307 if (ch < 0x80) {
5308 /* Encode ASCII */
5309 *p++ = (char) ch;
5310
5311 }
5312 else if (ch < 0x0800) {
5313 /* Encode Latin-1 */
5314 *p++ = (char)(0xc0 | (ch >> 6));
5315 *p++ = (char)(0x80 | (ch & 0x3f));
5316 }
Victor Stinner3d4226a2018-08-29 22:21:32 +02005317 else if (Py_UNICODE_IS_SURROGATE(ch) && !surrogatepass) {
Victor Stinnere47e6982017-12-21 15:45:16 +01005318 /* surrogateescape error handler */
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005319 if (!surrogateescape || !(0xDC80 <= ch && ch <= 0xDCFF)) {
Victor Stinnere47e6982017-12-21 15:45:16 +01005320 if (error_pos != NULL) {
Victor Stinner3d4226a2018-08-29 22:21:32 +02005321 *error_pos = (size_t)ch_pos;
Victor Stinnere47e6982017-12-21 15:45:16 +01005322 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005323 if (reason != NULL) {
5324 *reason = "encoding error";
5325 }
5326 if (raw_malloc) {
5327 PyMem_RawFree(bytes);
5328 }
5329 else {
5330 PyMem_Free(bytes);
5331 }
5332 return -2;
Victor Stinnere47e6982017-12-21 15:45:16 +01005333 }
5334 *p++ = (char)(ch & 0xff);
5335 }
5336 else if (ch < 0x10000) {
5337 *p++ = (char)(0xe0 | (ch >> 12));
5338 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
5339 *p++ = (char)(0x80 | (ch & 0x3f));
5340 }
5341 else { /* ch >= 0x10000 */
5342 assert(ch <= MAX_UNICODE);
5343 /* Encode UCS4 Unicode ordinals */
5344 *p++ = (char)(0xf0 | (ch >> 18));
5345 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
5346 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
5347 *p++ = (char)(0x80 | (ch & 0x3f));
5348 }
5349 }
5350 *p++ = '\0';
5351
5352 size_t final_size = (p - bytes);
Victor Stinner9dd76202017-12-21 16:20:32 +01005353 char *bytes2;
5354 if (raw_malloc) {
5355 bytes2 = PyMem_RawRealloc(bytes, final_size);
5356 }
5357 else {
5358 bytes2 = PyMem_Realloc(bytes, final_size);
5359 }
Victor Stinnere47e6982017-12-21 15:45:16 +01005360 if (bytes2 == NULL) {
5361 if (error_pos != NULL) {
5362 *error_pos = (size_t)-1;
5363 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005364 if (raw_malloc) {
5365 PyMem_RawFree(bytes);
5366 }
5367 else {
5368 PyMem_Free(bytes);
5369 }
5370 return -1;
Victor Stinnere47e6982017-12-21 15:45:16 +01005371 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005372 *str = bytes2;
5373 return 0;
Victor Stinnere47e6982017-12-21 15:45:16 +01005374}
5375
5376
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005377/* Primary internal function which creates utf8 encoded bytes objects.
5378
5379 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00005380 and allocate exactly as much space needed at the end. Else allocate the
5381 maximum possible needed (4 result bytes per Unicode character), and return
5382 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00005383*/
Victor Stinner709d23d2019-05-02 14:56:30 -04005384static PyObject *
5385unicode_encode_utf8(PyObject *unicode, _Py_error_handler error_handler,
5386 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005387{
Victor Stinner6099a032011-12-18 14:22:26 +01005388 enum PyUnicode_Kind kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005389 void *data;
5390 Py_ssize_t size;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00005391
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005392 if (!PyUnicode_Check(unicode)) {
5393 PyErr_BadArgument();
5394 return NULL;
5395 }
5396
5397 if (PyUnicode_READY(unicode) == -1)
5398 return NULL;
5399
Victor Stinnere90fe6a2011-10-01 16:48:13 +02005400 if (PyUnicode_UTF8(unicode))
5401 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
5402 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005403
5404 kind = PyUnicode_KIND(unicode);
5405 data = PyUnicode_DATA(unicode);
5406 size = PyUnicode_GET_LENGTH(unicode);
5407
Benjamin Petersonead6b532011-12-20 17:23:42 -06005408 switch (kind) {
Victor Stinner6099a032011-12-18 14:22:26 +01005409 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07005410 Py_UNREACHABLE();
Victor Stinner6099a032011-12-18 14:22:26 +01005411 case PyUnicode_1BYTE_KIND:
5412 /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
5413 assert(!PyUnicode_IS_ASCII(unicode));
Victor Stinner709d23d2019-05-02 14:56:30 -04005414 return ucs1lib_utf8_encoder(unicode, data, size, error_handler, errors);
Victor Stinner6099a032011-12-18 14:22:26 +01005415 case PyUnicode_2BYTE_KIND:
Victor Stinner709d23d2019-05-02 14:56:30 -04005416 return ucs2lib_utf8_encoder(unicode, data, size, error_handler, errors);
Victor Stinner6099a032011-12-18 14:22:26 +01005417 case PyUnicode_4BYTE_KIND:
Victor Stinner709d23d2019-05-02 14:56:30 -04005418 return ucs4lib_utf8_encoder(unicode, data, size, error_handler, errors);
Tim Peters602f7402002-04-27 18:03:26 +00005419 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005420}
5421
Alexander Belopolsky40018472011-02-26 01:02:56 +00005422PyObject *
Victor Stinner709d23d2019-05-02 14:56:30 -04005423_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
5424{
5425 return unicode_encode_utf8(unicode, _Py_ERROR_UNKNOWN, errors);
5426}
5427
5428
5429PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005430PyUnicode_EncodeUTF8(const Py_UNICODE *s,
5431 Py_ssize_t size,
5432 const char *errors)
5433{
5434 PyObject *v, *unicode;
5435
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02005436 unicode = PyUnicode_FromWideChar(s, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005437 if (unicode == NULL)
5438 return NULL;
5439 v = _PyUnicode_AsUTF8String(unicode, errors);
5440 Py_DECREF(unicode);
5441 return v;
5442}
5443
5444PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005445PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005446{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005447 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005448}
5449
Walter Dörwald41980ca2007-08-16 21:55:45 +00005450/* --- UTF-32 Codec ------------------------------------------------------- */
5451
5452PyObject *
5453PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005454 Py_ssize_t size,
5455 const char *errors,
5456 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005457{
5458 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
5459}
5460
5461PyObject *
5462PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005463 Py_ssize_t size,
5464 const char *errors,
5465 int *byteorder,
5466 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005467{
5468 const char *starts = s;
5469 Py_ssize_t startinpos;
5470 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005471 _PyUnicodeWriter writer;
Mark Dickinson7db923c2010-06-12 09:10:14 +00005472 const unsigned char *q, *e;
Victor Stinnere64322e2012-10-30 23:12:47 +01005473 int le, bo = 0; /* assume native ordering by default */
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005474 const char *encoding;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005475 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00005476 PyObject *errorHandler = NULL;
5477 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00005478
Walter Dörwald41980ca2007-08-16 21:55:45 +00005479 q = (unsigned char *)s;
5480 e = q + size;
5481
5482 if (byteorder)
5483 bo = *byteorder;
5484
5485 /* Check for BOM marks (U+FEFF) in the input and adjust current
5486 byte order setting accordingly. In native mode, the leading BOM
5487 mark is skipped, in all other modes, it is copied to the output
5488 stream as-is (giving a ZWNBSP character). */
Victor Stinnere64322e2012-10-30 23:12:47 +01005489 if (bo == 0 && size >= 4) {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005490 Py_UCS4 bom = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
Victor Stinnere64322e2012-10-30 23:12:47 +01005491 if (bom == 0x0000FEFF) {
5492 bo = -1;
5493 q += 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00005494 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005495 else if (bom == 0xFFFE0000) {
5496 bo = 1;
5497 q += 4;
5498 }
5499 if (byteorder)
5500 *byteorder = bo;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005501 }
5502
Victor Stinnere64322e2012-10-30 23:12:47 +01005503 if (q == e) {
5504 if (consumed)
5505 *consumed = size;
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02005506 _Py_RETURN_UNICODE_EMPTY();
Walter Dörwald41980ca2007-08-16 21:55:45 +00005507 }
5508
Victor Stinnere64322e2012-10-30 23:12:47 +01005509#ifdef WORDS_BIGENDIAN
5510 le = bo < 0;
5511#else
5512 le = bo <= 0;
5513#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005514 encoding = le ? "utf-32-le" : "utf-32-be";
Victor Stinnere64322e2012-10-30 23:12:47 +01005515
Victor Stinner8f674cc2013-04-17 23:02:17 +02005516 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02005517 writer.min_length = (e - q + 3) / 4;
5518 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005519 goto onError;
Victor Stinnere64322e2012-10-30 23:12:47 +01005520
Victor Stinnere64322e2012-10-30 23:12:47 +01005521 while (1) {
5522 Py_UCS4 ch = 0;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005523 Py_UCS4 maxch = PyUnicode_MAX_CHAR_VALUE(writer.buffer);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005524
Victor Stinnere64322e2012-10-30 23:12:47 +01005525 if (e - q >= 4) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005526 enum PyUnicode_Kind kind = writer.kind;
5527 void *data = writer.data;
Victor Stinnere64322e2012-10-30 23:12:47 +01005528 const unsigned char *last = e - 4;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005529 Py_ssize_t pos = writer.pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01005530 if (le) {
5531 do {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005532 ch = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
Victor Stinnere64322e2012-10-30 23:12:47 +01005533 if (ch > maxch)
5534 break;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005535 if (kind != PyUnicode_1BYTE_KIND &&
5536 Py_UNICODE_IS_SURROGATE(ch))
5537 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005538 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01005539 q += 4;
5540 } while (q <= last);
5541 }
5542 else {
5543 do {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005544 ch = ((unsigned int)q[0] << 24) | (q[1] << 16) | (q[2] << 8) | q[3];
Victor Stinnere64322e2012-10-30 23:12:47 +01005545 if (ch > maxch)
5546 break;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005547 if (kind != PyUnicode_1BYTE_KIND &&
5548 Py_UNICODE_IS_SURROGATE(ch))
5549 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005550 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01005551 q += 4;
5552 } while (q <= last);
5553 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005554 writer.pos = pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01005555 }
5556
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005557 if (Py_UNICODE_IS_SURROGATE(ch)) {
Serhiy Storchakad3faf432015-01-18 11:28:37 +02005558 errmsg = "code point in surrogate code point range(0xd800, 0xe000)";
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005559 startinpos = ((const char *)q) - starts;
5560 endinpos = startinpos + 4;
5561 }
5562 else if (ch <= maxch) {
Victor Stinnere64322e2012-10-30 23:12:47 +01005563 if (q == e || consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005564 break;
Victor Stinnere64322e2012-10-30 23:12:47 +01005565 /* remaining bytes at the end? (size should be divisible by 4) */
Benjamin Peterson29060642009-01-31 22:14:21 +00005566 errmsg = "truncated data";
Victor Stinnere64322e2012-10-30 23:12:47 +01005567 startinpos = ((const char *)q) - starts;
5568 endinpos = ((const char *)e) - starts;
Benjamin Peterson29060642009-01-31 22:14:21 +00005569 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005570 else {
5571 if (ch < 0x110000) {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005572 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinnere64322e2012-10-30 23:12:47 +01005573 goto onError;
5574 q += 4;
5575 continue;
5576 }
Serhiy Storchakad3faf432015-01-18 11:28:37 +02005577 errmsg = "code point not in range(0x110000)";
Victor Stinnere64322e2012-10-30 23:12:47 +01005578 startinpos = ((const char *)q) - starts;
5579 endinpos = startinpos + 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00005580 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005581
5582 /* The remaining input chars are ignored if the callback
5583 chooses to skip the input */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005584 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00005585 errors, &errorHandler,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005586 encoding, errmsg,
Benjamin Peterson29060642009-01-31 22:14:21 +00005587 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005588 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005589 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005590 }
5591
Walter Dörwald41980ca2007-08-16 21:55:45 +00005592 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005593 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005594
Walter Dörwald41980ca2007-08-16 21:55:45 +00005595 Py_XDECREF(errorHandler);
5596 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005597 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005598
Benjamin Peterson29060642009-01-31 22:14:21 +00005599 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005600 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005601 Py_XDECREF(errorHandler);
5602 Py_XDECREF(exc);
5603 return NULL;
5604}
5605
5606PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005607_PyUnicode_EncodeUTF32(PyObject *str,
5608 const char *errors,
5609 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005610{
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005611 enum PyUnicode_Kind kind;
5612 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005613 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005614 PyObject *v;
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005615 uint32_t *out;
Christian Heimes743e0cd2012-10-17 23:52:17 +02005616#if PY_LITTLE_ENDIAN
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005617 int native_ordering = byteorder <= 0;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005618#else
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005619 int native_ordering = byteorder >= 0;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005620#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005621 const char *encoding;
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005622 Py_ssize_t nsize, pos;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005623 PyObject *errorHandler = NULL;
5624 PyObject *exc = NULL;
5625 PyObject *rep = NULL;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005626
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005627 if (!PyUnicode_Check(str)) {
5628 PyErr_BadArgument();
5629 return NULL;
5630 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005631 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005632 return NULL;
5633 kind = PyUnicode_KIND(str);
5634 data = PyUnicode_DATA(str);
5635 len = PyUnicode_GET_LENGTH(str);
5636
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005637 if (len > PY_SSIZE_T_MAX / 4 - (byteorder == 0))
Serhiy Storchaka30793282014-01-04 22:44:01 +02005638 return PyErr_NoMemory();
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005639 nsize = len + (byteorder == 0);
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01005640 v = PyBytes_FromStringAndSize(NULL, nsize * 4);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005641 if (v == NULL)
5642 return NULL;
5643
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005644 /* output buffer is 4-bytes aligned */
5645 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 4));
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005646 out = (uint32_t *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005647 if (byteorder == 0)
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005648 *out++ = 0xFEFF;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005649 if (len == 0)
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005650 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005651
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005652 if (byteorder == -1)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005653 encoding = "utf-32-le";
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005654 else if (byteorder == 1)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005655 encoding = "utf-32-be";
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005656 else
5657 encoding = "utf-32";
5658
5659 if (kind == PyUnicode_1BYTE_KIND) {
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005660 ucs1lib_utf32_encode((const Py_UCS1 *)data, len, &out, native_ordering);
5661 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005662 }
5663
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005664 pos = 0;
5665 while (pos < len) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005666 Py_ssize_t repsize, moreunits;
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005667
5668 if (kind == PyUnicode_2BYTE_KIND) {
5669 pos += ucs2lib_utf32_encode((const Py_UCS2 *)data + pos, len - pos,
5670 &out, native_ordering);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005671 }
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005672 else {
5673 assert(kind == PyUnicode_4BYTE_KIND);
5674 pos += ucs4lib_utf32_encode((const Py_UCS4 *)data + pos, len - pos,
5675 &out, native_ordering);
5676 }
5677 if (pos == len)
5678 break;
Guido van Rossum98297ee2007-11-06 21:34:58 +00005679
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005680 rep = unicode_encode_call_errorhandler(
5681 errors, &errorHandler,
5682 encoding, "surrogates not allowed",
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005683 str, &exc, pos, pos + 1, &pos);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005684 if (!rep)
5685 goto error;
5686
5687 if (PyBytes_Check(rep)) {
5688 repsize = PyBytes_GET_SIZE(rep);
5689 if (repsize & 3) {
5690 raise_encode_exception(&exc, encoding,
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005691 str, pos - 1, pos,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005692 "surrogates not allowed");
5693 goto error;
5694 }
5695 moreunits = repsize / 4;
5696 }
5697 else {
5698 assert(PyUnicode_Check(rep));
5699 if (PyUnicode_READY(rep) < 0)
5700 goto error;
5701 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5702 if (!PyUnicode_IS_ASCII(rep)) {
5703 raise_encode_exception(&exc, encoding,
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005704 str, pos - 1, pos,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005705 "surrogates not allowed");
5706 goto error;
5707 }
5708 }
5709
5710 /* four bytes are reserved for each surrogate */
5711 if (moreunits > 1) {
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005712 Py_ssize_t outpos = out - (uint32_t*) PyBytes_AS_STRING(v);
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03005713 if (moreunits >= (PY_SSIZE_T_MAX - PyBytes_GET_SIZE(v)) / 4) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005714 /* integer overflow */
5715 PyErr_NoMemory();
5716 goto error;
5717 }
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03005718 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + 4 * (moreunits - 1)) < 0)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005719 goto error;
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005720 out = (uint32_t*) PyBytes_AS_STRING(v) + outpos;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005721 }
5722
5723 if (PyBytes_Check(rep)) {
Christian Heimesf051e432016-09-13 20:22:02 +02005724 memcpy(out, PyBytes_AS_STRING(rep), repsize);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005725 out += moreunits;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005726 } else /* rep is unicode */ {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005727 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005728 ucs1lib_utf32_encode(PyUnicode_1BYTE_DATA(rep), repsize,
5729 &out, native_ordering);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005730 }
5731
5732 Py_CLEAR(rep);
5733 }
5734
5735 /* Cut back to size actually needed. This is necessary for, for example,
5736 encoding of a string containing isolated surrogates and the 'ignore'
5737 handler is used. */
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005738 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005739 if (nsize != PyBytes_GET_SIZE(v))
5740 _PyBytes_Resize(&v, nsize);
5741 Py_XDECREF(errorHandler);
5742 Py_XDECREF(exc);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005743 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005744 return v;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005745 error:
5746 Py_XDECREF(rep);
5747 Py_XDECREF(errorHandler);
5748 Py_XDECREF(exc);
5749 Py_XDECREF(v);
5750 return NULL;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005751}
5752
Alexander Belopolsky40018472011-02-26 01:02:56 +00005753PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005754PyUnicode_EncodeUTF32(const Py_UNICODE *s,
5755 Py_ssize_t size,
5756 const char *errors,
5757 int byteorder)
5758{
5759 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02005760 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005761 if (tmp == NULL)
5762 return NULL;
5763 result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder);
5764 Py_DECREF(tmp);
5765 return result;
5766}
5767
5768PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005769PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005770{
Victor Stinnerb960b342011-11-20 19:12:52 +01005771 return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005772}
5773
Guido van Rossumd57fd912000-03-10 22:53:23 +00005774/* --- UTF-16 Codec ------------------------------------------------------- */
5775
Tim Peters772747b2001-08-09 22:21:55 +00005776PyObject *
5777PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005778 Py_ssize_t size,
5779 const char *errors,
5780 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005781{
Walter Dörwald69652032004-09-07 20:24:22 +00005782 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5783}
5784
5785PyObject *
5786PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005787 Py_ssize_t size,
5788 const char *errors,
5789 int *byteorder,
5790 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00005791{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005792 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005793 Py_ssize_t startinpos;
5794 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005795 _PyUnicodeWriter writer;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005796 const unsigned char *q, *e;
Tim Peters772747b2001-08-09 22:21:55 +00005797 int bo = 0; /* assume native ordering by default */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005798 int native_ordering;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00005799 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005800 PyObject *errorHandler = NULL;
5801 PyObject *exc = NULL;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005802 const char *encoding;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005803
Tim Peters772747b2001-08-09 22:21:55 +00005804 q = (unsigned char *)s;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005805 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005806
5807 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00005808 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005809
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005810 /* Check for BOM marks (U+FEFF) in the input and adjust current
5811 byte order setting accordingly. In native mode, the leading BOM
5812 mark is skipped, in all other modes, it is copied to the output
5813 stream as-is (giving a ZWNBSP character). */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005814 if (bo == 0 && size >= 2) {
5815 const Py_UCS4 bom = (q[1] << 8) | q[0];
5816 if (bom == 0xFEFF) {
5817 q += 2;
5818 bo = -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00005819 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005820 else if (bom == 0xFFFE) {
5821 q += 2;
5822 bo = 1;
5823 }
5824 if (byteorder)
5825 *byteorder = bo;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005826 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005827
Antoine Pitrou63065d72012-05-15 23:48:04 +02005828 if (q == e) {
5829 if (consumed)
5830 *consumed = size;
Serhiy Storchaka678db842013-01-26 12:16:36 +02005831 _Py_RETURN_UNICODE_EMPTY();
Tim Peters772747b2001-08-09 22:21:55 +00005832 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005833
Christian Heimes743e0cd2012-10-17 23:52:17 +02005834#if PY_LITTLE_ENDIAN
Antoine Pitrou63065d72012-05-15 23:48:04 +02005835 native_ordering = bo <= 0;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005836 encoding = bo <= 0 ? "utf-16-le" : "utf-16-be";
Antoine Pitrouab868312009-01-10 15:40:25 +00005837#else
Antoine Pitrou63065d72012-05-15 23:48:04 +02005838 native_ordering = bo >= 0;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005839 encoding = bo >= 0 ? "utf-16-be" : "utf-16-le";
Antoine Pitrouab868312009-01-10 15:40:25 +00005840#endif
Tim Peters772747b2001-08-09 22:21:55 +00005841
Antoine Pitrou63065d72012-05-15 23:48:04 +02005842 /* Note: size will always be longer than the resulting Unicode
Xiang Zhang2c7fd462018-01-31 20:48:05 +08005843 character count normally. Error handler will take care of
5844 resizing when needed. */
Victor Stinner8f674cc2013-04-17 23:02:17 +02005845 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02005846 writer.min_length = (e - q + 1) / 2;
5847 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005848 goto onError;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005849
Antoine Pitrou63065d72012-05-15 23:48:04 +02005850 while (1) {
5851 Py_UCS4 ch = 0;
5852 if (e - q >= 2) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005853 int kind = writer.kind;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005854 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005855 if (PyUnicode_IS_ASCII(writer.buffer))
Antoine Pitrou63065d72012-05-15 23:48:04 +02005856 ch = asciilib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005857 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005858 native_ordering);
5859 else
5860 ch = ucs1lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005861 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005862 native_ordering);
5863 } else if (kind == PyUnicode_2BYTE_KIND) {
5864 ch = ucs2lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005865 (Py_UCS2*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005866 native_ordering);
5867 } else {
5868 assert(kind == PyUnicode_4BYTE_KIND);
5869 ch = ucs4lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005870 (Py_UCS4*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005871 native_ordering);
Antoine Pitrouab868312009-01-10 15:40:25 +00005872 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005873 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005874
Antoine Pitrou63065d72012-05-15 23:48:04 +02005875 switch (ch)
5876 {
5877 case 0:
5878 /* remaining byte at the end? (size should be even) */
5879 if (q == e || consumed)
5880 goto End;
5881 errmsg = "truncated data";
5882 startinpos = ((const char *)q) - starts;
5883 endinpos = ((const char *)e) - starts;
5884 break;
5885 /* The remaining input chars are ignored if the callback
5886 chooses to skip the input */
5887 case 1:
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02005888 q -= 2;
5889 if (consumed)
Serhiy Storchakaae3b32a2013-01-08 23:40:52 +02005890 goto End;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005891 errmsg = "unexpected end of data";
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02005892 startinpos = ((const char *)q) - starts;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005893 endinpos = ((const char *)e) - starts;
5894 break;
5895 case 2:
5896 errmsg = "illegal encoding";
5897 startinpos = ((const char *)q) - 2 - starts;
5898 endinpos = startinpos + 2;
5899 break;
5900 case 3:
5901 errmsg = "illegal UTF-16 surrogate";
5902 startinpos = ((const char *)q) - 4 - starts;
5903 endinpos = startinpos + 2;
5904 break;
5905 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005906 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005907 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005908 continue;
5909 }
5910
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005911 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrouab868312009-01-10 15:40:25 +00005912 errors,
5913 &errorHandler,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005914 encoding, errmsg,
Antoine Pitrouab868312009-01-10 15:40:25 +00005915 &starts,
5916 (const char **)&e,
5917 &startinpos,
5918 &endinpos,
5919 &exc,
5920 (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005921 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005922 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005923 }
5924
Antoine Pitrou63065d72012-05-15 23:48:04 +02005925End:
Walter Dörwald69652032004-09-07 20:24:22 +00005926 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005927 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00005928
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005929 Py_XDECREF(errorHandler);
5930 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005931 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005932
Benjamin Peterson29060642009-01-31 22:14:21 +00005933 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005934 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005935 Py_XDECREF(errorHandler);
5936 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005937 return NULL;
5938}
5939
Tim Peters772747b2001-08-09 22:21:55 +00005940PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005941_PyUnicode_EncodeUTF16(PyObject *str,
5942 const char *errors,
5943 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005944{
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005945 enum PyUnicode_Kind kind;
5946 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005947 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005948 PyObject *v;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005949 unsigned short *out;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005950 Py_ssize_t pairs;
Christian Heimes743e0cd2012-10-17 23:52:17 +02005951#if PY_BIG_ENDIAN
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005952 int native_ordering = byteorder >= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005953#else
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005954 int native_ordering = byteorder <= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005955#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005956 const char *encoding;
5957 Py_ssize_t nsize, pos;
5958 PyObject *errorHandler = NULL;
5959 PyObject *exc = NULL;
5960 PyObject *rep = NULL;
Tim Peters772747b2001-08-09 22:21:55 +00005961
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005962 if (!PyUnicode_Check(str)) {
5963 PyErr_BadArgument();
5964 return NULL;
5965 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005966 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005967 return NULL;
5968 kind = PyUnicode_KIND(str);
5969 data = PyUnicode_DATA(str);
5970 len = PyUnicode_GET_LENGTH(str);
Victor Stinner0e368262011-11-10 20:12:49 +01005971
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005972 pairs = 0;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005973 if (kind == PyUnicode_4BYTE_KIND) {
5974 const Py_UCS4 *in = (const Py_UCS4 *)data;
5975 const Py_UCS4 *end = in + len;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005976 while (in < end) {
5977 if (*in++ >= 0x10000) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005978 pairs++;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005979 }
5980 }
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005981 }
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005982 if (len > PY_SSIZE_T_MAX / 2 - pairs - (byteorder == 0)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005983 return PyErr_NoMemory();
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005984 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005985 nsize = len + pairs + (byteorder == 0);
5986 v = PyBytes_FromStringAndSize(NULL, nsize * 2);
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005987 if (v == NULL) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005988 return NULL;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005989 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005990
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005991 /* output buffer is 2-bytes aligned */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02005992 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 2));
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005993 out = (unsigned short *)PyBytes_AS_STRING(v);
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005994 if (byteorder == 0) {
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005995 *out++ = 0xFEFF;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005996 }
5997 if (len == 0) {
Guido van Rossum98297ee2007-11-06 21:34:58 +00005998 goto done;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005999 }
Tim Peters772747b2001-08-09 22:21:55 +00006000
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006001 if (kind == PyUnicode_1BYTE_KIND) {
6002 ucs1lib_utf16_encode((const Py_UCS1 *)data, len, &out, native_ordering);
6003 goto done;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00006004 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006005
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006006 if (byteorder < 0) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006007 encoding = "utf-16-le";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006008 }
6009 else if (byteorder > 0) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006010 encoding = "utf-16-be";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006011 }
6012 else {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006013 encoding = "utf-16";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006014 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006015
6016 pos = 0;
6017 while (pos < len) {
6018 Py_ssize_t repsize, moreunits;
6019
6020 if (kind == PyUnicode_2BYTE_KIND) {
6021 pos += ucs2lib_utf16_encode((const Py_UCS2 *)data + pos, len - pos,
6022 &out, native_ordering);
6023 }
6024 else {
6025 assert(kind == PyUnicode_4BYTE_KIND);
6026 pos += ucs4lib_utf16_encode((const Py_UCS4 *)data + pos, len - pos,
6027 &out, native_ordering);
6028 }
6029 if (pos == len)
6030 break;
6031
6032 rep = unicode_encode_call_errorhandler(
6033 errors, &errorHandler,
6034 encoding, "surrogates not allowed",
6035 str, &exc, pos, pos + 1, &pos);
6036 if (!rep)
6037 goto error;
6038
6039 if (PyBytes_Check(rep)) {
6040 repsize = PyBytes_GET_SIZE(rep);
6041 if (repsize & 1) {
6042 raise_encode_exception(&exc, encoding,
6043 str, pos - 1, pos,
6044 "surrogates not allowed");
6045 goto error;
6046 }
6047 moreunits = repsize / 2;
6048 }
6049 else {
6050 assert(PyUnicode_Check(rep));
6051 if (PyUnicode_READY(rep) < 0)
6052 goto error;
6053 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
6054 if (!PyUnicode_IS_ASCII(rep)) {
6055 raise_encode_exception(&exc, encoding,
6056 str, pos - 1, pos,
6057 "surrogates not allowed");
6058 goto error;
6059 }
6060 }
6061
6062 /* two bytes are reserved for each surrogate */
6063 if (moreunits > 1) {
6064 Py_ssize_t outpos = out - (unsigned short*) PyBytes_AS_STRING(v);
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03006065 if (moreunits >= (PY_SSIZE_T_MAX - PyBytes_GET_SIZE(v)) / 2) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006066 /* integer overflow */
6067 PyErr_NoMemory();
6068 goto error;
6069 }
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03006070 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + 2 * (moreunits - 1)) < 0)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006071 goto error;
6072 out = (unsigned short*) PyBytes_AS_STRING(v) + outpos;
6073 }
6074
6075 if (PyBytes_Check(rep)) {
Christian Heimesf051e432016-09-13 20:22:02 +02006076 memcpy(out, PyBytes_AS_STRING(rep), repsize);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006077 out += moreunits;
6078 } else /* rep is unicode */ {
6079 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
6080 ucs1lib_utf16_encode(PyUnicode_1BYTE_DATA(rep), repsize,
6081 &out, native_ordering);
6082 }
6083
6084 Py_CLEAR(rep);
6085 }
6086
6087 /* Cut back to size actually needed. This is necessary for, for example,
6088 encoding of a string containing isolated surrogates and the 'ignore' handler
6089 is used. */
6090 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
6091 if (nsize != PyBytes_GET_SIZE(v))
6092 _PyBytes_Resize(&v, nsize);
6093 Py_XDECREF(errorHandler);
6094 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00006095 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006096 return v;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006097 error:
6098 Py_XDECREF(rep);
6099 Py_XDECREF(errorHandler);
6100 Py_XDECREF(exc);
6101 Py_XDECREF(v);
6102 return NULL;
6103#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00006104}
6105
Alexander Belopolsky40018472011-02-26 01:02:56 +00006106PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006107PyUnicode_EncodeUTF16(const Py_UNICODE *s,
6108 Py_ssize_t size,
6109 const char *errors,
6110 int byteorder)
6111{
6112 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02006113 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006114 if (tmp == NULL)
6115 return NULL;
6116 result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder);
6117 Py_DECREF(tmp);
6118 return result;
6119}
6120
6121PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00006122PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006123{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006124 return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006125}
6126
6127/* --- Unicode Escape Codec ----------------------------------------------- */
6128
/* File-local pointer to a _PyUnicode_Name_CAPI table; starts out NULL.
   NOTE(review): presumably filled in on demand by the unicode-escape
   decoder when it meets a named-character escape -- confirm. */
Fredrik Lundh06d12682001-01-24 07:59:11 +00006129static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00006130
Alexander Belopolsky40018472011-02-26 01:02:56 +00006131PyObject *
Eric V. Smith42454af2016-10-31 09:22:08 -04006132_PyUnicode_DecodeUnicodeEscape(const char *s,
6133 Py_ssize_t size,
6134 const char *errors,
6135 const char **first_invalid_escape)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006136{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006137 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006138 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006139 const char *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006140 PyObject *errorHandler = NULL;
6141 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006142
Eric V. Smith42454af2016-10-31 09:22:08 -04006143 // so we can remember if we've seen an invalid escape char or not
6144 *first_invalid_escape = NULL;
6145
Victor Stinner62ec3312016-09-06 17:04:34 -07006146 if (size == 0) {
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02006147 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner62ec3312016-09-06 17:04:34 -07006148 }
6149 /* Escaped strings will always be longer than the resulting
6150 Unicode string, so we start with size here and then reduce the
6151 length after conversion to the true value.
6152 (but if the error callback returns a long replacement string
6153 we'll have to allocate more space) */
Victor Stinner8f674cc2013-04-17 23:02:17 +02006154 _PyUnicodeWriter_Init(&writer);
Victor Stinner62ec3312016-09-06 17:04:34 -07006155 writer.min_length = size;
6156 if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
6157 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006158 }
6159
Guido van Rossumd57fd912000-03-10 22:53:23 +00006160 end = s + size;
6161 while (s < end) {
Victor Stinner62ec3312016-09-06 17:04:34 -07006162 unsigned char c = (unsigned char) *s++;
6163 Py_UCS4 ch;
6164 int count;
6165 Py_ssize_t startinpos;
6166 Py_ssize_t endinpos;
6167 const char *message;
6168
6169#define WRITE_ASCII_CHAR(ch) \
6170 do { \
6171 assert(ch <= 127); \
6172 assert(writer.pos < writer.size); \
6173 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6174 } while(0)
6175
6176#define WRITE_CHAR(ch) \
6177 do { \
6178 if (ch <= writer.maxchar) { \
6179 assert(writer.pos < writer.size); \
6180 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6181 } \
6182 else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
6183 goto onError; \
6184 } \
6185 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006186
6187 /* Non-escape characters are interpreted as Unicode ordinals */
Victor Stinner62ec3312016-09-06 17:04:34 -07006188 if (c != '\\') {
6189 WRITE_CHAR(c);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006190 continue;
6191 }
6192
Victor Stinner62ec3312016-09-06 17:04:34 -07006193 startinpos = s - starts - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006194 /* \ - Escapes */
Victor Stinner62ec3312016-09-06 17:04:34 -07006195 if (s >= end) {
6196 message = "\\ at end of string";
6197 goto error;
6198 }
6199 c = (unsigned char) *s++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006200
Victor Stinner62ec3312016-09-06 17:04:34 -07006201 assert(writer.pos < writer.size);
Guido van Rossum8ce8a782007-11-01 19:42:39 +00006202 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006203
Benjamin Peterson29060642009-01-31 22:14:21 +00006204 /* \x escapes */
Victor Stinner62ec3312016-09-06 17:04:34 -07006205 case '\n': continue;
6206 case '\\': WRITE_ASCII_CHAR('\\'); continue;
6207 case '\'': WRITE_ASCII_CHAR('\''); continue;
6208 case '\"': WRITE_ASCII_CHAR('\"'); continue;
6209 case 'b': WRITE_ASCII_CHAR('\b'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006210 /* FF */
Victor Stinner62ec3312016-09-06 17:04:34 -07006211 case 'f': WRITE_ASCII_CHAR('\014'); continue;
6212 case 't': WRITE_ASCII_CHAR('\t'); continue;
6213 case 'n': WRITE_ASCII_CHAR('\n'); continue;
6214 case 'r': WRITE_ASCII_CHAR('\r'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006215 /* VT */
Victor Stinner62ec3312016-09-06 17:04:34 -07006216 case 'v': WRITE_ASCII_CHAR('\013'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006217 /* BEL, not classic C */
Victor Stinner62ec3312016-09-06 17:04:34 -07006218 case 'a': WRITE_ASCII_CHAR('\007'); continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006219
Benjamin Peterson29060642009-01-31 22:14:21 +00006220 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006221 case '0': case '1': case '2': case '3':
6222 case '4': case '5': case '6': case '7':
Victor Stinner62ec3312016-09-06 17:04:34 -07006223 ch = c - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00006224 if (s < end && '0' <= *s && *s <= '7') {
Victor Stinner62ec3312016-09-06 17:04:34 -07006225 ch = (ch<<3) + *s++ - '0';
6226 if (s < end && '0' <= *s && *s <= '7') {
6227 ch = (ch<<3) + *s++ - '0';
6228 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006229 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006230 WRITE_CHAR(ch);
6231 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006232
Benjamin Peterson29060642009-01-31 22:14:21 +00006233 /* hex escapes */
6234 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006235 case 'x':
Victor Stinner62ec3312016-09-06 17:04:34 -07006236 count = 2;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006237 message = "truncated \\xXX escape";
6238 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006239
Benjamin Peterson29060642009-01-31 22:14:21 +00006240 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006241 case 'u':
Victor Stinner62ec3312016-09-06 17:04:34 -07006242 count = 4;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006243 message = "truncated \\uXXXX escape";
6244 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006245
Benjamin Peterson29060642009-01-31 22:14:21 +00006246 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00006247 case 'U':
Victor Stinner62ec3312016-09-06 17:04:34 -07006248 count = 8;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006249 message = "truncated \\UXXXXXXXX escape";
6250 hexescape:
Victor Stinner62ec3312016-09-06 17:04:34 -07006251 for (ch = 0; count && s < end; ++s, --count) {
Serhiy Storchakad6793772013-01-29 10:20:44 +02006252 c = (unsigned char)*s;
Victor Stinner62ec3312016-09-06 17:04:34 -07006253 ch <<= 4;
6254 if (c >= '0' && c <= '9') {
6255 ch += c - '0';
6256 }
6257 else if (c >= 'a' && c <= 'f') {
6258 ch += c - ('a' - 10);
6259 }
6260 else if (c >= 'A' && c <= 'F') {
6261 ch += c - ('A' - 10);
6262 }
6263 else {
6264 break;
6265 }
Fredrik Lundhdf846752000-09-03 11:29:49 +00006266 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006267 if (count) {
Serhiy Storchakad6793772013-01-29 10:20:44 +02006268 goto error;
Victor Stinner62ec3312016-09-06 17:04:34 -07006269 }
6270
6271 /* when we get here, ch is a 32-bit unicode character */
6272 if (ch > MAX_UNICODE) {
6273 message = "illegal Unicode character";
6274 goto error;
6275 }
6276
6277 WRITE_CHAR(ch);
6278 continue;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006279
Benjamin Peterson29060642009-01-31 22:14:21 +00006280 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00006281 case 'N':
Fredrik Lundhccc74732001-02-18 22:13:49 +00006282 if (ucnhash_CAPI == NULL) {
6283 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006284 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
6285 PyUnicodeData_CAPSULE_NAME, 1);
Victor Stinner62ec3312016-09-06 17:04:34 -07006286 if (ucnhash_CAPI == NULL) {
6287 PyErr_SetString(
6288 PyExc_UnicodeError,
6289 "\\N escapes not supported (can't load unicodedata module)"
6290 );
6291 goto onError;
6292 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00006293 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006294
6295 message = "malformed \\N character escape";
Gregory P. Smith746b2d32018-11-13 13:16:54 -08006296 if (s < end && *s == '{') {
Victor Stinner62ec3312016-09-06 17:04:34 -07006297 const char *start = ++s;
6298 size_t namelen;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006299 /* look for the closing brace */
Victor Stinner62ec3312016-09-06 17:04:34 -07006300 while (s < end && *s != '}')
Fredrik Lundhccc74732001-02-18 22:13:49 +00006301 s++;
Victor Stinner62ec3312016-09-06 17:04:34 -07006302 namelen = s - start;
6303 if (namelen && s < end) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00006304 /* found a name. look it up in the unicode database */
Fredrik Lundhccc74732001-02-18 22:13:49 +00006305 s++;
Victor Stinner62ec3312016-09-06 17:04:34 -07006306 ch = 0xffffffff; /* in case 'getcode' messes up */
6307 if (namelen <= INT_MAX &&
6308 ucnhash_CAPI->getcode(NULL, start, (int)namelen,
6309 &ch, 0)) {
6310 assert(ch <= MAX_UNICODE);
6311 WRITE_CHAR(ch);
6312 continue;
6313 }
6314 message = "unknown Unicode character name";
Fredrik Lundhccc74732001-02-18 22:13:49 +00006315 }
6316 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02006317 goto error;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006318
6319 default:
Eric V. Smith42454af2016-10-31 09:22:08 -04006320 if (*first_invalid_escape == NULL) {
6321 *first_invalid_escape = s-1; /* Back up one char, since we've
6322 already incremented s. */
6323 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006324 WRITE_ASCII_CHAR('\\');
6325 WRITE_CHAR(c);
6326 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006327 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02006328
6329 error:
6330 endinpos = s-starts;
Victor Stinner62ec3312016-09-06 17:04:34 -07006331 writer.min_length = end - s + writer.pos;
Serhiy Storchaka8fe5a9f2013-01-29 10:37:39 +02006332 if (unicode_decode_call_errorhandler_writer(
Serhiy Storchakad6793772013-01-29 10:20:44 +02006333 errors, &errorHandler,
6334 "unicodeescape", message,
6335 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinner62ec3312016-09-06 17:04:34 -07006336 &writer)) {
Serhiy Storchakad6793772013-01-29 10:20:44 +02006337 goto onError;
Victor Stinner62ec3312016-09-06 17:04:34 -07006338 }
Serhiy Storchakab7e2d672018-02-13 08:27:33 +02006339 assert(end - s <= writer.size - writer.pos);
Victor Stinner62ec3312016-09-06 17:04:34 -07006340
6341#undef WRITE_ASCII_CHAR
6342#undef WRITE_CHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00006343 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006344
Walter Dörwaldd4ade082003-08-15 15:00:26 +00006345 Py_XDECREF(errorHandler);
6346 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006347 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald8c077222002-03-25 11:16:18 +00006348
Benjamin Peterson29060642009-01-31 22:14:21 +00006349 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006350 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006351 Py_XDECREF(errorHandler);
6352 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006353 return NULL;
6354}
6355
Eric V. Smith42454af2016-10-31 09:22:08 -04006356PyObject *
6357PyUnicode_DecodeUnicodeEscape(const char *s,
6358 Py_ssize_t size,
6359 const char *errors)
6360{
6361 const char *first_invalid_escape;
6362 PyObject *result = _PyUnicode_DecodeUnicodeEscape(s, size, errors,
6363 &first_invalid_escape);
6364 if (result == NULL)
6365 return NULL;
6366 if (first_invalid_escape != NULL) {
6367 if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
6368 "invalid escape sequence '\\%c'",
Serhiy Storchaka56cb4652017-10-20 17:08:15 +03006369 (unsigned char)*first_invalid_escape) < 0) {
Eric V. Smith42454af2016-10-31 09:22:08 -04006370 Py_DECREF(result);
6371 return NULL;
6372 }
6373 }
6374 return result;
6375}
6376
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006377/* Return a Unicode-Escape string version of the Unicode object. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006378
Alexander Belopolsky40018472011-02-26 01:02:56 +00006379PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006380PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006381{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006382 Py_ssize_t i, len;
Victor Stinner62ec3312016-09-06 17:04:34 -07006383 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006384 char *p;
Victor Stinner62ec3312016-09-06 17:04:34 -07006385 enum PyUnicode_Kind kind;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006386 void *data;
Victor Stinner62ec3312016-09-06 17:04:34 -07006387 Py_ssize_t expandsize;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006388
Ezio Melottie7f90372012-10-05 03:33:31 +03006389 /* Initial allocation is based on the longest-possible character
Thomas Wouters89f507f2006-12-13 04:49:30 +00006390 escape.
6391
Ezio Melottie7f90372012-10-05 03:33:31 +03006392 For UCS1 strings it's '\xxx', 4 bytes per source character.
6393 For UCS2 strings it's '\uxxxx', 6 bytes per source character.
6394 For UCS4 strings it's '\U00xxxxxx', 10 bytes per source character.
Thomas Wouters89f507f2006-12-13 04:49:30 +00006395 */
6396
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006397 if (!PyUnicode_Check(unicode)) {
6398 PyErr_BadArgument();
6399 return NULL;
6400 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006401 if (PyUnicode_READY(unicode) == -1) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006402 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006403 }
Victor Stinner358af132015-10-12 22:36:57 +02006404
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006405 len = PyUnicode_GET_LENGTH(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006406 if (len == 0) {
6407 return PyBytes_FromStringAndSize(NULL, 0);
6408 }
6409
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006410 kind = PyUnicode_KIND(unicode);
6411 data = PyUnicode_DATA(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006412 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6413 bytes, and 1 byte characters 4. */
6414 expandsize = kind * 2 + 2;
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006415 if (len > PY_SSIZE_T_MAX / expandsize) {
Victor Stinner62ec3312016-09-06 17:04:34 -07006416 return PyErr_NoMemory();
6417 }
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006418 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
Victor Stinner62ec3312016-09-06 17:04:34 -07006419 if (repr == NULL) {
6420 return NULL;
6421 }
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006422
Victor Stinner62ec3312016-09-06 17:04:34 -07006423 p = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006424 for (i = 0; i < len; i++) {
Victor Stinner3326cb62011-11-10 20:15:25 +01006425 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006426
Victor Stinner62ec3312016-09-06 17:04:34 -07006427 /* U+0000-U+00ff range */
6428 if (ch < 0x100) {
6429 if (ch >= ' ' && ch < 127) {
6430 if (ch != '\\') {
6431 /* Copy printable US ASCII as-is */
6432 *p++ = (char) ch;
6433 }
6434 /* Escape backslashes */
6435 else {
6436 *p++ = '\\';
6437 *p++ = '\\';
6438 }
6439 }
Victor Stinner358af132015-10-12 22:36:57 +02006440
Victor Stinner62ec3312016-09-06 17:04:34 -07006441 /* Map special whitespace to '\t', \n', '\r' */
6442 else if (ch == '\t') {
6443 *p++ = '\\';
6444 *p++ = 't';
6445 }
6446 else if (ch == '\n') {
6447 *p++ = '\\';
6448 *p++ = 'n';
6449 }
6450 else if (ch == '\r') {
6451 *p++ = '\\';
6452 *p++ = 'r';
6453 }
6454
6455 /* Map non-printable US ASCII and 8-bit characters to '\xHH' */
6456 else {
6457 *p++ = '\\';
6458 *p++ = 'x';
6459 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6460 *p++ = Py_hexdigits[ch & 0x000F];
6461 }
Tim Petersced69f82003-09-16 20:30:58 +00006462 }
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006463 /* U+0100-U+ffff range: Map 16-bit characters to '\uHHHH' */
Victor Stinner62ec3312016-09-06 17:04:34 -07006464 else if (ch < 0x10000) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006465 *p++ = '\\';
6466 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006467 *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
6468 *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
6469 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6470 *p++ = Py_hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006471 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006472 /* U+010000-U+10ffff range: Map 21-bit characters to '\U00HHHHHH' */
6473 else {
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006474
Victor Stinner62ec3312016-09-06 17:04:34 -07006475 /* Make sure that the first two digits are zero */
6476 assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006477 *p++ = '\\';
Victor Stinner62ec3312016-09-06 17:04:34 -07006478 *p++ = 'U';
6479 *p++ = '0';
6480 *p++ = '0';
6481 *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
6482 *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
6483 *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
6484 *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
6485 *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
6486 *p++ = Py_hexdigits[ch & 0x0000000F];
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006487 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006488 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006489
Victor Stinner62ec3312016-09-06 17:04:34 -07006490 assert(p - PyBytes_AS_STRING(repr) > 0);
6491 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) {
6492 return NULL;
6493 }
6494 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006495}
6496
Alexander Belopolsky40018472011-02-26 01:02:56 +00006497PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006498PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
6499 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006500{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006501 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02006502 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Victor Stinner62ec3312016-09-06 17:04:34 -07006503 if (tmp == NULL) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006504 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006505 }
6506
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006507 result = PyUnicode_AsUnicodeEscapeString(tmp);
6508 Py_DECREF(tmp);
6509 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006510}
6511
6512/* --- Raw Unicode Escape Codec ------------------------------------------- */
6513
Alexander Belopolsky40018472011-02-26 01:02:56 +00006514PyObject *
6515PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006516 Py_ssize_t size,
6517 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006518{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006519 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006520 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006521 const char *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006522 PyObject *errorHandler = NULL;
6523 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006524
Victor Stinner62ec3312016-09-06 17:04:34 -07006525 if (size == 0) {
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02006526 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner62ec3312016-09-06 17:04:34 -07006527 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006528
Guido van Rossumd57fd912000-03-10 22:53:23 +00006529 /* Escaped strings will always be longer than the resulting
6530 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006531 length after conversion to the true value. (But decoding error
6532 handler might have to resize the string) */
Victor Stinner8f674cc2013-04-17 23:02:17 +02006533 _PyUnicodeWriter_Init(&writer);
Inada Naoki770847a2019-06-24 12:30:24 +09006534 writer.min_length = size;
Victor Stinner62ec3312016-09-06 17:04:34 -07006535 if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
6536 goto onError;
6537 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006538
Guido van Rossumd57fd912000-03-10 22:53:23 +00006539 end = s + size;
6540 while (s < end) {
Victor Stinner62ec3312016-09-06 17:04:34 -07006541 unsigned char c = (unsigned char) *s++;
6542 Py_UCS4 ch;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006543 int count;
Victor Stinner62ec3312016-09-06 17:04:34 -07006544 Py_ssize_t startinpos;
6545 Py_ssize_t endinpos;
6546 const char *message;
6547
6548#define WRITE_CHAR(ch) \
6549 do { \
6550 if (ch <= writer.maxchar) { \
6551 assert(writer.pos < writer.size); \
6552 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6553 } \
6554 else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
6555 goto onError; \
6556 } \
6557 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006558
Benjamin Peterson29060642009-01-31 22:14:21 +00006559 /* Non-escape characters are interpreted as Unicode ordinals */
Victor Stinner62ec3312016-09-06 17:04:34 -07006560 if (c != '\\' || s >= end) {
6561 WRITE_CHAR(c);
Benjamin Peterson29060642009-01-31 22:14:21 +00006562 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006563 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006564
Victor Stinner62ec3312016-09-06 17:04:34 -07006565 c = (unsigned char) *s++;
6566 if (c == 'u') {
6567 count = 4;
6568 message = "truncated \\uXXXX escape";
Benjamin Peterson29060642009-01-31 22:14:21 +00006569 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006570 else if (c == 'U') {
6571 count = 8;
6572 message = "truncated \\UXXXXXXXX escape";
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006573 }
6574 else {
Victor Stinner62ec3312016-09-06 17:04:34 -07006575 assert(writer.pos < writer.size);
6576 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, '\\');
6577 WRITE_CHAR(c);
6578 continue;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006579 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006580 startinpos = s - starts - 2;
6581
6582 /* \uHHHH with 4 hex digits, \U00HHHHHH with 8 */
6583 for (ch = 0; count && s < end; ++s, --count) {
6584 c = (unsigned char)*s;
6585 ch <<= 4;
6586 if (c >= '0' && c <= '9') {
6587 ch += c - '0';
6588 }
6589 else if (c >= 'a' && c <= 'f') {
6590 ch += c - ('a' - 10);
6591 }
6592 else if (c >= 'A' && c <= 'F') {
6593 ch += c - ('A' - 10);
6594 }
6595 else {
6596 break;
6597 }
6598 }
6599 if (!count) {
6600 if (ch <= MAX_UNICODE) {
6601 WRITE_CHAR(ch);
6602 continue;
6603 }
6604 message = "\\Uxxxxxxxx out of range";
6605 }
6606
6607 endinpos = s-starts;
6608 writer.min_length = end - s + writer.pos;
6609 if (unicode_decode_call_errorhandler_writer(
6610 errors, &errorHandler,
6611 "rawunicodeescape", message,
6612 &starts, &end, &startinpos, &endinpos, &exc, &s,
6613 &writer)) {
6614 goto onError;
6615 }
Serhiy Storchakab7e2d672018-02-13 08:27:33 +02006616 assert(end - s <= writer.size - writer.pos);
Victor Stinner62ec3312016-09-06 17:04:34 -07006617
6618#undef WRITE_CHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00006619 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006620 Py_XDECREF(errorHandler);
6621 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006622 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00006623
Benjamin Peterson29060642009-01-31 22:14:21 +00006624 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006625 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006626 Py_XDECREF(errorHandler);
6627 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006628 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006629
Guido van Rossumd57fd912000-03-10 22:53:23 +00006630}
6631
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006632
Alexander Belopolsky40018472011-02-26 01:02:56 +00006633PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006634PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006635{
Victor Stinner62ec3312016-09-06 17:04:34 -07006636 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006637 char *p;
Victor Stinner62ec3312016-09-06 17:04:34 -07006638 Py_ssize_t expandsize, pos;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006639 int kind;
6640 void *data;
6641 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006642
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006643 if (!PyUnicode_Check(unicode)) {
6644 PyErr_BadArgument();
6645 return NULL;
6646 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006647 if (PyUnicode_READY(unicode) == -1) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006648 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006649 }
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006650 kind = PyUnicode_KIND(unicode);
6651 data = PyUnicode_DATA(unicode);
6652 len = PyUnicode_GET_LENGTH(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006653 if (kind == PyUnicode_1BYTE_KIND) {
6654 return PyBytes_FromStringAndSize(data, len);
6655 }
Victor Stinner0e368262011-11-10 20:12:49 +01006656
Victor Stinner62ec3312016-09-06 17:04:34 -07006657 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6658 bytes, and 1 byte characters 4. */
6659 expandsize = kind * 2 + 2;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006660
Victor Stinner62ec3312016-09-06 17:04:34 -07006661 if (len > PY_SSIZE_T_MAX / expandsize) {
6662 return PyErr_NoMemory();
6663 }
6664 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
6665 if (repr == NULL) {
6666 return NULL;
6667 }
6668 if (len == 0) {
6669 return repr;
6670 }
6671
6672 p = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006673 for (pos = 0; pos < len; pos++) {
6674 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Victor Stinner358af132015-10-12 22:36:57 +02006675
Victor Stinner62ec3312016-09-06 17:04:34 -07006676 /* U+0000-U+00ff range: Copy 8-bit characters as-is */
6677 if (ch < 0x100) {
6678 *p++ = (char) ch;
Tim Petersced69f82003-09-16 20:30:58 +00006679 }
Xiang Zhang2b77a922018-02-13 18:33:32 +08006680 /* U+0100-U+ffff range: Map 16-bit characters to '\uHHHH' */
Victor Stinner62ec3312016-09-06 17:04:34 -07006681 else if (ch < 0x10000) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006682 *p++ = '\\';
6683 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006684 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6685 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6686 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6687 *p++ = Py_hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006688 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006689 /* U+010000-U+10ffff range: Map 32-bit characters to '\U00HHHHHH' */
6690 else {
6691 assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
6692 *p++ = '\\';
6693 *p++ = 'U';
6694 *p++ = '0';
6695 *p++ = '0';
6696 *p++ = Py_hexdigits[(ch >> 20) & 0xf];
6697 *p++ = Py_hexdigits[(ch >> 16) & 0xf];
6698 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6699 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6700 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6701 *p++ = Py_hexdigits[ch & 15];
6702 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006703 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006704
Victor Stinner62ec3312016-09-06 17:04:34 -07006705 assert(p > PyBytes_AS_STRING(repr));
6706 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) {
6707 return NULL;
6708 }
6709 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006710}
6711
Alexander Belopolsky40018472011-02-26 01:02:56 +00006712PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006713PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
6714 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006715{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006716 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02006717 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006718 if (tmp == NULL)
Walter Dörwald711005d2007-05-12 12:03:26 +00006719 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006720 result = PyUnicode_AsRawUnicodeEscapeString(tmp);
6721 Py_DECREF(tmp);
6722 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006723}
6724
6725/* --- Latin-1 Codec ------------------------------------------------------ */
6726
Alexander Belopolsky40018472011-02-26 01:02:56 +00006727PyObject *
6728PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006729 Py_ssize_t size,
6730 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006731{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006732 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Victor Stinnere57b1c02011-09-28 22:20:48 +02006733 return _PyUnicode_FromUCS1((unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006734}
6735
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006736/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006737static void
6738make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006739 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006740 PyObject *unicode,
6741 Py_ssize_t startpos, Py_ssize_t endpos,
6742 const char *reason)
6743{
6744 if (*exceptionObject == NULL) {
6745 *exceptionObject = PyObject_CallFunction(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006746 PyExc_UnicodeEncodeError, "sOnns",
Martin v. Löwis9e816682011-11-02 12:45:42 +01006747 encoding, unicode, startpos, endpos, reason);
6748 }
6749 else {
6750 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6751 goto onError;
6752 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6753 goto onError;
6754 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6755 goto onError;
6756 return;
6757 onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02006758 Py_CLEAR(*exceptionObject);
Martin v. Löwis9e816682011-11-02 12:45:42 +01006759 }
6760}
6761
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006762/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006763static void
6764raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006765 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006766 PyObject *unicode,
6767 Py_ssize_t startpos, Py_ssize_t endpos,
6768 const char *reason)
6769{
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006770 make_encode_exception(exceptionObject,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006771 encoding, unicode, startpos, endpos, reason);
6772 if (*exceptionObject != NULL)
6773 PyCodec_StrictErrors(*exceptionObject);
6774}
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006775
6776/* error handling callback helper:
6777 build arguments, call the callback and check the arguments,
6778 put the result into newpos and return the replacement string, which
6779 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006780static PyObject *
6781unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006782 PyObject **errorHandler,
6783 const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006784 PyObject *unicode, PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006785 Py_ssize_t startpos, Py_ssize_t endpos,
6786 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006787{
Serhiy Storchaka2d06e842015-12-25 19:53:18 +02006788 static const char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006789 Py_ssize_t len;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006790 PyObject *restuple;
6791 PyObject *resunicode;
6792
6793 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006794 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006795 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006796 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006797 }
6798
Benjamin Petersonbac79492012-01-14 13:34:47 -05006799 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006800 return NULL;
6801 len = PyUnicode_GET_LENGTH(unicode);
6802
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006803 make_encode_exception(exceptionObject,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006804 encoding, unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006805 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006806 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006807
Jeroen Demeyer196a5302019-07-04 12:31:34 +02006808 restuple = _PyObject_CallOneArg(*errorHandler, *exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006809 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006810 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006811 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006812 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00006813 Py_DECREF(restuple);
6814 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006815 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006816 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00006817 &resunicode, newpos)) {
6818 Py_DECREF(restuple);
6819 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006820 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006821 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6822 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6823 Py_DECREF(restuple);
6824 return NULL;
6825 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006826 if (*newpos<0)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006827 *newpos = len + *newpos;
6828 if (*newpos<0 || *newpos>len) {
Victor Stinnera33bce02014-07-04 22:47:46 +02006829 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00006830 Py_DECREF(restuple);
6831 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006832 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006833 Py_INCREF(resunicode);
6834 Py_DECREF(restuple);
6835 return resunicode;
6836}
6837
Alexander Belopolsky40018472011-02-26 01:02:56 +00006838static PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006839unicode_encode_ucs1(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006840 const char *errors,
Victor Stinner0030cd52015-09-24 14:45:00 +02006841 const Py_UCS4 limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006842{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006843 /* input state */
6844 Py_ssize_t pos=0, size;
6845 int kind;
6846 void *data;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006847 /* pointer into the output */
6848 char *str;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006849 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6850 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Victor Stinner50149202015-09-22 00:26:54 +02006851 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006852 PyObject *exc = NULL;
Victor Stinner50149202015-09-22 00:26:54 +02006853 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Victor Stinner6bd525b2015-10-09 13:10:05 +02006854 PyObject *rep = NULL;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006855 /* output object */
6856 _PyBytesWriter writer;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006857
Benjamin Petersonbac79492012-01-14 13:34:47 -05006858 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006859 return NULL;
6860 size = PyUnicode_GET_LENGTH(unicode);
6861 kind = PyUnicode_KIND(unicode);
6862 data = PyUnicode_DATA(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006863 /* allocate enough for a simple encoding without
6864 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00006865 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00006866 return PyBytes_FromStringAndSize(NULL, 0);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006867
6868 _PyBytesWriter_Init(&writer);
6869 str = _PyBytesWriter_Alloc(&writer, size);
6870 if (str == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00006871 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006872
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006873 while (pos < size) {
Victor Stinner0030cd52015-09-24 14:45:00 +02006874 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006875
Benjamin Peterson29060642009-01-31 22:14:21 +00006876 /* can we encode this? */
Victor Stinner0030cd52015-09-24 14:45:00 +02006877 if (ch < limit) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006878 /* no overflow check, because we know that the space is enough */
Victor Stinner0030cd52015-09-24 14:45:00 +02006879 *str++ = (char)ch;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006880 ++pos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006881 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006882 else {
Victor Stinner6bd525b2015-10-09 13:10:05 +02006883 Py_ssize_t newpos, i;
Benjamin Peterson29060642009-01-31 22:14:21 +00006884 /* startpos for collecting unencodable chars */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006885 Py_ssize_t collstart = pos;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006886 Py_ssize_t collend = collstart + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006887 /* find all unecodable characters */
Victor Stinner50149202015-09-22 00:26:54 +02006888
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006889 while ((collend < size) && (PyUnicode_READ(kind, data, collend) >= limit))
Benjamin Peterson29060642009-01-31 22:14:21 +00006890 ++collend;
Victor Stinner50149202015-09-22 00:26:54 +02006891
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006892 /* Only overallocate the buffer if it's not the last write */
6893 writer.overallocate = (collend < size);
6894
Benjamin Peterson29060642009-01-31 22:14:21 +00006895 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
Victor Stinner50149202015-09-22 00:26:54 +02006896 if (error_handler == _Py_ERROR_UNKNOWN)
Victor Stinner3d4226a2018-08-29 22:21:32 +02006897 error_handler = _Py_GetErrorHandler(errors);
Victor Stinner50149202015-09-22 00:26:54 +02006898
6899 switch (error_handler) {
6900 case _Py_ERROR_STRICT:
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006901 raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006902 goto onError;
Victor Stinner50149202015-09-22 00:26:54 +02006903
6904 case _Py_ERROR_REPLACE:
Victor Stinner01ada392015-10-01 21:54:51 +02006905 memset(str, '?', collend - collstart);
6906 str += (collend - collstart);
Stefan Krahf432a322017-08-21 13:09:59 +02006907 /* fall through */
Victor Stinner50149202015-09-22 00:26:54 +02006908 case _Py_ERROR_IGNORE:
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006909 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006910 break;
Victor Stinner50149202015-09-22 00:26:54 +02006911
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006912 case _Py_ERROR_BACKSLASHREPLACE:
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07006913 /* subtract preallocated bytes */
Victor Stinnerad771582015-10-09 12:38:53 +02006914 writer.min_size -= (collend - collstart);
6915 str = backslashreplace(&writer, str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006916 unicode, collstart, collend);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006917 if (str == NULL)
6918 goto onError;
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006919 pos = collend;
6920 break;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006921
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006922 case _Py_ERROR_XMLCHARREFREPLACE:
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07006923 /* subtract preallocated bytes */
Victor Stinnerad771582015-10-09 12:38:53 +02006924 writer.min_size -= (collend - collstart);
6925 str = xmlcharrefreplace(&writer, str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006926 unicode, collstart, collend);
6927 if (str == NULL)
6928 goto onError;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006929 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006930 break;
Victor Stinner50149202015-09-22 00:26:54 +02006931
Victor Stinnerc3713e92015-09-29 12:32:13 +02006932 case _Py_ERROR_SURROGATEESCAPE:
6933 for (i = collstart; i < collend; ++i) {
6934 ch = PyUnicode_READ(kind, data, i);
6935 if (ch < 0xdc80 || 0xdcff < ch) {
6936 /* Not a UTF-8b surrogate */
6937 break;
6938 }
6939 *str++ = (char)(ch - 0xdc00);
6940 ++pos;
6941 }
6942 if (i >= collend)
6943 break;
6944 collstart = pos;
6945 assert(collstart != collend);
Stefan Krahf432a322017-08-21 13:09:59 +02006946 /* fall through */
Victor Stinnerc3713e92015-09-29 12:32:13 +02006947
Benjamin Peterson29060642009-01-31 22:14:21 +00006948 default:
Victor Stinner6bd525b2015-10-09 13:10:05 +02006949 rep = unicode_encode_call_errorhandler(errors, &error_handler_obj,
6950 encoding, reason, unicode, &exc,
6951 collstart, collend, &newpos);
6952 if (rep == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006953 goto onError;
Victor Stinner0030cd52015-09-24 14:45:00 +02006954
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07006955 /* subtract preallocated bytes */
Xiang Zhangd04d8472016-11-23 19:34:01 +08006956 writer.min_size -= newpos - collstart;
Victor Stinnerad771582015-10-09 12:38:53 +02006957
Victor Stinner6bd525b2015-10-09 13:10:05 +02006958 if (PyBytes_Check(rep)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00006959 /* Directly copy bytes result to output. */
Victor Stinnerce179bf2015-10-09 12:57:22 +02006960 str = _PyBytesWriter_WriteBytes(&writer, str,
Victor Stinner6bd525b2015-10-09 13:10:05 +02006961 PyBytes_AS_STRING(rep),
6962 PyBytes_GET_SIZE(rep));
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006963 }
Victor Stinner6bd525b2015-10-09 13:10:05 +02006964 else {
6965 assert(PyUnicode_Check(rep));
Victor Stinner0030cd52015-09-24 14:45:00 +02006966
Victor Stinner6bd525b2015-10-09 13:10:05 +02006967 if (PyUnicode_READY(rep) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006968 goto onError;
Victor Stinner6bd525b2015-10-09 13:10:05 +02006969
Serhiy Storchaka99250d52016-11-23 15:13:00 +02006970 if (limit == 256 ?
6971 PyUnicode_KIND(rep) != PyUnicode_1BYTE_KIND :
6972 !PyUnicode_IS_ASCII(rep))
6973 {
6974 /* Not all characters are smaller than limit */
6975 raise_encode_exception(&exc, encoding, unicode,
6976 collstart, collend, reason);
6977 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00006978 }
Serhiy Storchaka99250d52016-11-23 15:13:00 +02006979 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
6980 str = _PyBytesWriter_WriteBytes(&writer, str,
6981 PyUnicode_DATA(rep),
6982 PyUnicode_GET_LENGTH(rep));
Benjamin Peterson29060642009-01-31 22:14:21 +00006983 }
Alexey Izbyshev74a307d2018-08-19 21:52:04 +03006984 if (str == NULL)
6985 goto onError;
6986
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006987 pos = newpos;
Victor Stinner6bd525b2015-10-09 13:10:05 +02006988 Py_CLEAR(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006989 }
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006990
6991 /* If overallocation was disabled, ensure that it was the last
6992 write. Otherwise, we missed an optimization */
6993 assert(writer.overallocate || pos == size);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006994 }
6995 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006996
Victor Stinner50149202015-09-22 00:26:54 +02006997 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006998 Py_XDECREF(exc);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006999 return _PyBytesWriter_Finish(&writer, str);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00007000
7001 onError:
Victor Stinner6bd525b2015-10-09 13:10:05 +02007002 Py_XDECREF(rep);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02007003 _PyBytesWriter_Dealloc(&writer);
Victor Stinner50149202015-09-22 00:26:54 +02007004 Py_XDECREF(error_handler_obj);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00007005 Py_XDECREF(exc);
7006 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007007}
7008
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007009/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007010PyObject *
7011PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03007012 Py_ssize_t size,
7013 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007014{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007015 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02007016 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007017 if (unicode == NULL)
7018 return NULL;
7019 result = unicode_encode_ucs1(unicode, errors, 256);
7020 Py_DECREF(unicode);
7021 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007022}
7023
Alexander Belopolsky40018472011-02-26 01:02:56 +00007024PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007025_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007026{
7027 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007028 PyErr_BadArgument();
7029 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007030 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007031 if (PyUnicode_READY(unicode) == -1)
7032 return NULL;
7033 /* Fast path: if it is a one-byte string, construct
7034 bytes object directly. */
7035 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
7036 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
7037 PyUnicode_GET_LENGTH(unicode));
7038 /* Non-Latin-1 characters present. Defer to above function to
7039 raise the exception. */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007040 return unicode_encode_ucs1(unicode, errors, 256);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007041}
7042
7043PyObject*
7044PyUnicode_AsLatin1String(PyObject *unicode)
7045{
7046 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007047}
7048
7049/* --- 7-bit ASCII Codec -------------------------------------------------- */
7050
Alexander Belopolsky40018472011-02-26 01:02:56 +00007051PyObject *
7052PyUnicode_DecodeASCII(const char *s,
7053 Py_ssize_t size,
7054 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007055{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007056 const char *starts = s;
Inada Naoki770847a2019-06-24 12:30:24 +09007057 const char *e = s + size;
Victor Stinnerf96418d2015-09-21 23:06:27 +02007058 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007059 PyObject *exc = NULL;
Victor Stinnerf96418d2015-09-21 23:06:27 +02007060 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Tim Petersced69f82003-09-16 20:30:58 +00007061
Guido van Rossumd57fd912000-03-10 22:53:23 +00007062 if (size == 0)
Serhiy Storchaka678db842013-01-26 12:16:36 +02007063 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01007064
Guido van Rossumd57fd912000-03-10 22:53:23 +00007065 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Victor Stinner702c7342011-10-05 13:50:52 +02007066 if (size == 1 && (unsigned char)s[0] < 128)
7067 return get_latin1_char((unsigned char)s[0]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007068
Inada Naoki770847a2019-06-24 12:30:24 +09007069 // Shortcut for simple case
7070 PyObject *u = PyUnicode_New(size, 127);
7071 if (u == NULL) {
Victor Stinner8f674cc2013-04-17 23:02:17 +02007072 return NULL;
Inada Naoki770847a2019-06-24 12:30:24 +09007073 }
7074 Py_ssize_t outpos = ascii_decode(s, e, PyUnicode_DATA(u));
7075 if (outpos == size) {
7076 return u;
7077 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02007078
Inada Naoki770847a2019-06-24 12:30:24 +09007079 _PyUnicodeWriter writer;
7080 _PyUnicodeWriter_InitWithBuffer(&writer, u);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007081 writer.pos = outpos;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02007082
Inada Naoki770847a2019-06-24 12:30:24 +09007083 s += outpos;
7084 int kind = writer.kind;
7085 void *data = writer.data;
7086 Py_ssize_t startinpos, endinpos;
7087
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007088 while (s < e) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02007089 unsigned char c = (unsigned char)*s;
Benjamin Peterson29060642009-01-31 22:14:21 +00007090 if (c < 128) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007091 PyUnicode_WRITE(kind, data, writer.pos, c);
7092 writer.pos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00007093 ++s;
Victor Stinnerf96418d2015-09-21 23:06:27 +02007094 continue;
Benjamin Peterson29060642009-01-31 22:14:21 +00007095 }
Victor Stinnerf96418d2015-09-21 23:06:27 +02007096
7097 /* byte outsize range 0x00..0x7f: call the error handler */
7098
7099 if (error_handler == _Py_ERROR_UNKNOWN)
Victor Stinner3d4226a2018-08-29 22:21:32 +02007100 error_handler = _Py_GetErrorHandler(errors);
Victor Stinnerf96418d2015-09-21 23:06:27 +02007101
7102 switch (error_handler)
7103 {
7104 case _Py_ERROR_REPLACE:
7105 case _Py_ERROR_SURROGATEESCAPE:
7106 /* Fast-path: the error handler only writes one character,
Victor Stinnerca9381e2015-09-22 00:58:32 +02007107 but we may switch to UCS2 at the first write */
7108 if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
7109 goto onError;
7110 kind = writer.kind;
7111 data = writer.data;
Victor Stinnerf96418d2015-09-21 23:06:27 +02007112
7113 if (error_handler == _Py_ERROR_REPLACE)
7114 PyUnicode_WRITE(kind, data, writer.pos, 0xfffd);
7115 else
7116 PyUnicode_WRITE(kind, data, writer.pos, c + 0xdc00);
7117 writer.pos++;
7118 ++s;
7119 break;
7120
7121 case _Py_ERROR_IGNORE:
7122 ++s;
7123 break;
7124
7125 default:
Benjamin Peterson29060642009-01-31 22:14:21 +00007126 startinpos = s-starts;
7127 endinpos = startinpos + 1;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007128 if (unicode_decode_call_errorhandler_writer(
Victor Stinnerf96418d2015-09-21 23:06:27 +02007129 errors, &error_handler_obj,
Benjamin Peterson29060642009-01-31 22:14:21 +00007130 "ascii", "ordinal not in range(128)",
7131 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007132 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00007133 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007134 kind = writer.kind;
7135 data = writer.data;
Benjamin Peterson29060642009-01-31 22:14:21 +00007136 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007137 }
Victor Stinnerf96418d2015-09-21 23:06:27 +02007138 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007139 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007140 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00007141
Benjamin Peterson29060642009-01-31 22:14:21 +00007142 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007143 _PyUnicodeWriter_Dealloc(&writer);
Victor Stinnerf96418d2015-09-21 23:06:27 +02007144 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007145 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007146 return NULL;
7147}
7148
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007149/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007150PyObject *
7151PyUnicode_EncodeASCII(const Py_UNICODE *p,
7152 Py_ssize_t size,
7153 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007154{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007155 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02007156 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007157 if (unicode == NULL)
7158 return NULL;
7159 result = unicode_encode_ucs1(unicode, errors, 128);
7160 Py_DECREF(unicode);
7161 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007162}
7163
Alexander Belopolsky40018472011-02-26 01:02:56 +00007164PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007165_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007166{
7167 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007168 PyErr_BadArgument();
7169 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007170 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007171 if (PyUnicode_READY(unicode) == -1)
7172 return NULL;
7173 /* Fast path: if it is an ASCII-only string, construct bytes object
7174 directly. Else defer to above function to raise the exception. */
Victor Stinneraf037572013-04-14 18:44:10 +02007175 if (PyUnicode_IS_ASCII(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007176 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
7177 PyUnicode_GET_LENGTH(unicode));
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007178 return unicode_encode_ucs1(unicode, errors, 128);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007179}
7180
7181PyObject *
7182PyUnicode_AsASCIIString(PyObject *unicode)
7183{
7184 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007185}
7186
Steve Dowercc16be82016-09-08 10:35:16 -07007187#ifdef MS_WINDOWS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007188
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007189/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007190
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00007191#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007192#define NEED_RETRY
7193#endif
7194
/* INT_MAX is the theoretical largest chunk (or INT_MAX / 2 when
   transcoding from UTF-16), but INT_MAX / 4 performs better in
   both cases and also avoids partial characters overrunning the
   length limit in MultiByteToWideChar on Windows */
7199#define DECODING_CHUNK_SIZE (INT_MAX/4)
7200
Victor Stinner3a50e702011-10-18 21:21:00 +02007201#ifndef WC_ERR_INVALID_CHARS
7202# define WC_ERR_INVALID_CHARS 0x0080
7203#endif
7204
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007205static const char*
Victor Stinner3a50e702011-10-18 21:21:00 +02007206code_page_name(UINT code_page, PyObject **obj)
7207{
7208 *obj = NULL;
7209 if (code_page == CP_ACP)
7210 return "mbcs";
7211 if (code_page == CP_UTF7)
7212 return "CP_UTF7";
7213 if (code_page == CP_UTF8)
7214 return "CP_UTF8";
7215
7216 *obj = PyBytes_FromFormat("cp%u", code_page);
7217 if (*obj == NULL)
7218 return NULL;
7219 return PyBytes_AS_STRING(*obj);
7220}
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007221
Victor Stinner3a50e702011-10-18 21:21:00 +02007222static DWORD
7223decode_code_page_flags(UINT code_page)
7224{
7225 if (code_page == CP_UTF7) {
7226 /* The CP_UTF7 decoder only supports flags=0 */
7227 return 0;
7228 }
7229 else
7230 return MB_ERR_INVALID_CHARS;
7231}
7232
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007233/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007234 * Decode a byte string from a Windows code page into unicode object in strict
7235 * mode.
7236 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007237 * Returns consumed size if succeed, returns -2 on decode error, or raise an
7238 * OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007239 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007240static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007241decode_code_page_strict(UINT code_page,
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007242 wchar_t **buf,
7243 Py_ssize_t *bufsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02007244 const char *in,
7245 int insize)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007246{
Serhiy Storchakac1e2c282019-03-20 21:45:18 +02007247 DWORD flags = MB_ERR_INVALID_CHARS;
Victor Stinner24729f32011-11-10 20:31:37 +01007248 wchar_t *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007249 DWORD outsize;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007250
7251 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007252 assert(insize > 0);
Serhiy Storchakac1e2c282019-03-20 21:45:18 +02007253 while ((outsize = MultiByteToWideChar(code_page, flags,
7254 in, insize, NULL, 0)) <= 0)
7255 {
7256 if (!flags || GetLastError() != ERROR_INVALID_FLAGS) {
7257 goto error;
7258 }
7259 /* For some code pages (e.g. UTF-7) flags must be set to 0. */
7260 flags = 0;
7261 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007262
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007263 /* Extend a wchar_t* buffer */
7264 Py_ssize_t n = *bufsize; /* Get the current length */
7265 if (widechar_resize(buf, bufsize, n + outsize) < 0) {
7266 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007267 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007268 out = *buf + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007269
7270 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007271 outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
7272 if (outsize <= 0)
7273 goto error;
7274 return insize;
Victor Stinner554f3f02010-06-16 23:33:54 +00007275
Victor Stinner3a50e702011-10-18 21:21:00 +02007276error:
7277 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7278 return -2;
7279 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007280 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007281}
7282
Victor Stinner3a50e702011-10-18 21:21:00 +02007283/*
7284 * Decode a byte string from a code page into unicode object with an error
7285 * handler.
7286 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007287 * Returns consumed size if succeed, or raise an OSError or
Victor Stinner3a50e702011-10-18 21:21:00 +02007288 * UnicodeDecodeError exception and returns -1 on error.
7289 */
7290static int
7291decode_code_page_errors(UINT code_page,
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007292 wchar_t **buf,
7293 Py_ssize_t *bufsize,
Victor Stinner76a31a62011-11-04 00:05:13 +01007294 const char *in, const int size,
Victor Stinner7d00cc12014-03-17 23:08:06 +01007295 const char *errors, int final)
Victor Stinner3a50e702011-10-18 21:21:00 +02007296{
7297 const char *startin = in;
7298 const char *endin = in + size;
Serhiy Storchakac1e2c282019-03-20 21:45:18 +02007299 DWORD flags = MB_ERR_INVALID_CHARS;
Victor Stinner3a50e702011-10-18 21:21:00 +02007300 /* Ideally, we should get reason from FormatMessage. This is the Windows
7301 2000 English version of the message. */
7302 const char *reason = "No mapping for the Unicode character exists "
7303 "in the target code page.";
7304 /* each step cannot decode more than 1 character, but a character can be
7305 represented as a surrogate pair */
Serhiy Storchaka4013c172018-12-03 10:36:45 +02007306 wchar_t buffer[2], *out;
Victor Stinner9f067f42013-06-05 00:21:31 +02007307 int insize;
7308 Py_ssize_t outsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007309 PyObject *errorHandler = NULL;
7310 PyObject *exc = NULL;
7311 PyObject *encoding_obj = NULL;
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007312 const char *encoding;
Victor Stinner3a50e702011-10-18 21:21:00 +02007313 DWORD err;
7314 int ret = -1;
7315
7316 assert(size > 0);
7317
7318 encoding = code_page_name(code_page, &encoding_obj);
7319 if (encoding == NULL)
7320 return -1;
7321
Victor Stinner7d00cc12014-03-17 23:08:06 +01007322 if ((errors == NULL || strcmp(errors, "strict") == 0) && final) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007323 /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
7324 UnicodeDecodeError. */
7325 make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
7326 if (exc != NULL) {
7327 PyCodec_StrictErrors(exc);
7328 Py_CLEAR(exc);
7329 }
7330 goto error;
7331 }
7332
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007333 /* Extend a wchar_t* buffer */
7334 Py_ssize_t n = *bufsize; /* Get the current length */
7335 if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7336 PyErr_NoMemory();
7337 goto error;
Victor Stinner3a50e702011-10-18 21:21:00 +02007338 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007339 if (widechar_resize(buf, bufsize, n + size * Py_ARRAY_LENGTH(buffer)) < 0) {
7340 goto error;
Victor Stinner3a50e702011-10-18 21:21:00 +02007341 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007342 out = *buf + n;
Victor Stinner3a50e702011-10-18 21:21:00 +02007343
7344 /* Decode the byte string character per character */
Victor Stinner3a50e702011-10-18 21:21:00 +02007345 while (in < endin)
7346 {
7347 /* Decode a character */
7348 insize = 1;
7349 do
7350 {
7351 outsize = MultiByteToWideChar(code_page, flags,
7352 in, insize,
7353 buffer, Py_ARRAY_LENGTH(buffer));
7354 if (outsize > 0)
7355 break;
7356 err = GetLastError();
Serhiy Storchakac1e2c282019-03-20 21:45:18 +02007357 if (err == ERROR_INVALID_FLAGS && flags) {
7358 /* For some code pages (e.g. UTF-7) flags must be set to 0. */
7359 flags = 0;
7360 continue;
7361 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007362 if (err != ERROR_NO_UNICODE_TRANSLATION
7363 && err != ERROR_INSUFFICIENT_BUFFER)
7364 {
7365 PyErr_SetFromWindowsErr(0);
7366 goto error;
7367 }
7368 insize++;
7369 }
7370 /* 4=maximum length of a UTF-8 sequence */
7371 while (insize <= 4 && (in + insize) <= endin);
7372
7373 if (outsize <= 0) {
7374 Py_ssize_t startinpos, endinpos, outpos;
7375
Victor Stinner7d00cc12014-03-17 23:08:06 +01007376 /* last character in partial decode? */
7377 if (in + insize >= endin && !final)
7378 break;
7379
Victor Stinner3a50e702011-10-18 21:21:00 +02007380 startinpos = in - startin;
7381 endinpos = startinpos + 1;
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007382 outpos = out - *buf;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007383 if (unicode_decode_call_errorhandler_wchar(
Victor Stinner3a50e702011-10-18 21:21:00 +02007384 errors, &errorHandler,
7385 encoding, reason,
7386 &startin, &endin, &startinpos, &endinpos, &exc, &in,
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007387 buf, bufsize, &outpos))
Victor Stinner3a50e702011-10-18 21:21:00 +02007388 {
7389 goto error;
7390 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007391 out = *buf + outpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007392 }
7393 else {
7394 in += insize;
7395 memcpy(out, buffer, outsize * sizeof(wchar_t));
7396 out += outsize;
7397 }
7398 }
7399
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007400 /* Shrink the buffer */
7401 assert(out - *buf <= *bufsize);
7402 *bufsize = out - *buf;
Victor Stinnere1f17c62014-07-25 14:03:03 +02007403 /* (in - startin) <= size and size is an int */
7404 ret = Py_SAFE_DOWNCAST(in - startin, Py_ssize_t, int);
Victor Stinner3a50e702011-10-18 21:21:00 +02007405
7406error:
7407 Py_XDECREF(encoding_obj);
7408 Py_XDECREF(errorHandler);
7409 Py_XDECREF(exc);
7410 return ret;
7411}
7412
Victor Stinner3a50e702011-10-18 21:21:00 +02007413static PyObject *
7414decode_code_page_stateful(int code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007415 const char *s, Py_ssize_t size,
7416 const char *errors, Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007417{
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007418 wchar_t *buf = NULL;
7419 Py_ssize_t bufsize = 0;
Victor Stinner76a31a62011-11-04 00:05:13 +01007420 int chunk_size, final, converted, done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007421
Victor Stinner3a50e702011-10-18 21:21:00 +02007422 if (code_page < 0) {
7423 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7424 return NULL;
7425 }
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03007426 if (size < 0) {
7427 PyErr_BadInternalCall();
7428 return NULL;
7429 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007430
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007431 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00007432 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007433
Victor Stinner76a31a62011-11-04 00:05:13 +01007434 do
7435 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007436#ifdef NEED_RETRY
Steve Dower7ebdda02019-08-21 16:22:33 -07007437 if (size > DECODING_CHUNK_SIZE) {
7438 chunk_size = DECODING_CHUNK_SIZE;
Victor Stinner76a31a62011-11-04 00:05:13 +01007439 final = 0;
7440 done = 0;
7441 }
7442 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007443#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007444 {
7445 chunk_size = (int)size;
7446 final = (consumed == NULL);
7447 done = 1;
7448 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007449
Victor Stinner76a31a62011-11-04 00:05:13 +01007450 if (chunk_size == 0 && done) {
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007451 if (buf != NULL)
Victor Stinner76a31a62011-11-04 00:05:13 +01007452 break;
Serhiy Storchaka678db842013-01-26 12:16:36 +02007453 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner76a31a62011-11-04 00:05:13 +01007454 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007455
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007456 converted = decode_code_page_strict(code_page, &buf, &bufsize,
Victor Stinner76a31a62011-11-04 00:05:13 +01007457 s, chunk_size);
7458 if (converted == -2)
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007459 converted = decode_code_page_errors(code_page, &buf, &bufsize,
Victor Stinner76a31a62011-11-04 00:05:13 +01007460 s, chunk_size,
Victor Stinner7d00cc12014-03-17 23:08:06 +01007461 errors, final);
7462 assert(converted != 0 || done);
Victor Stinner76a31a62011-11-04 00:05:13 +01007463
7464 if (converted < 0) {
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007465 PyMem_Free(buf);
Victor Stinner76a31a62011-11-04 00:05:13 +01007466 return NULL;
7467 }
7468
7469 if (consumed)
7470 *consumed += converted;
7471
7472 s += converted;
7473 size -= converted;
7474 } while (!done);
Victor Stinner3a50e702011-10-18 21:21:00 +02007475
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007476 PyObject *v = PyUnicode_FromWideChar(buf, bufsize);
7477 PyMem_Free(buf);
7478 return v;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007479}
7480
Alexander Belopolsky40018472011-02-26 01:02:56 +00007481PyObject *
Victor Stinner3a50e702011-10-18 21:21:00 +02007482PyUnicode_DecodeCodePageStateful(int code_page,
7483 const char *s,
7484 Py_ssize_t size,
7485 const char *errors,
7486 Py_ssize_t *consumed)
7487{
7488 return decode_code_page_stateful(code_page, s, size, errors, consumed);
7489}
7490
7491PyObject *
7492PyUnicode_DecodeMBCSStateful(const char *s,
7493 Py_ssize_t size,
7494 const char *errors,
7495 Py_ssize_t *consumed)
7496{
7497 return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
7498}
7499
7500PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00007501PyUnicode_DecodeMBCS(const char *s,
7502 Py_ssize_t size,
7503 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007504{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007505 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
7506}
7507
Victor Stinner3a50e702011-10-18 21:21:00 +02007508static DWORD
7509encode_code_page_flags(UINT code_page, const char *errors)
7510{
7511 if (code_page == CP_UTF8) {
Steve Dower3e96f322015-03-02 08:01:10 -08007512 return WC_ERR_INVALID_CHARS;
Victor Stinner3a50e702011-10-18 21:21:00 +02007513 }
7514 else if (code_page == CP_UTF7) {
7515 /* CP_UTF7 only supports flags=0 */
7516 return 0;
7517 }
7518 else {
7519 if (errors != NULL && strcmp(errors, "replace") == 0)
7520 return 0;
7521 else
7522 return WC_NO_BEST_FIT_CHARS;
7523 }
7524}
7525
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007526/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007527 * Encode a Unicode string to a Windows code page into a byte string in strict
7528 * mode.
7529 *
7530 * Returns consumed characters if succeed, returns -2 on encode error, or raise
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007531 * an OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007532 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007533static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007534encode_code_page_strict(UINT code_page, PyObject **outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007535 PyObject *unicode, Py_ssize_t offset, int len,
Victor Stinner3a50e702011-10-18 21:21:00 +02007536 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007537{
Victor Stinner554f3f02010-06-16 23:33:54 +00007538 BOOL usedDefaultChar = FALSE;
Victor Stinner3a50e702011-10-18 21:21:00 +02007539 BOOL *pusedDefaultChar = &usedDefaultChar;
7540 int outsize;
Victor Stinner24729f32011-11-10 20:31:37 +01007541 wchar_t *p;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007542 Py_ssize_t size;
Victor Stinner3a50e702011-10-18 21:21:00 +02007543 const DWORD flags = encode_code_page_flags(code_page, NULL);
7544 char *out;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007545 /* Create a substring so that we can get the UTF-16 representation
7546 of just the slice under consideration. */
7547 PyObject *substring;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007548
Martin v. Löwis3d325192011-11-04 18:23:06 +01007549 assert(len > 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007550
Victor Stinner3a50e702011-10-18 21:21:00 +02007551 if (code_page != CP_UTF8 && code_page != CP_UTF7)
Victor Stinner554f3f02010-06-16 23:33:54 +00007552 pusedDefaultChar = &usedDefaultChar;
Victor Stinner3a50e702011-10-18 21:21:00 +02007553 else
Victor Stinner554f3f02010-06-16 23:33:54 +00007554 pusedDefaultChar = NULL;
Victor Stinner554f3f02010-06-16 23:33:54 +00007555
Victor Stinner2fc507f2011-11-04 20:06:39 +01007556 substring = PyUnicode_Substring(unicode, offset, offset+len);
7557 if (substring == NULL)
7558 return -1;
7559 p = PyUnicode_AsUnicodeAndSize(substring, &size);
7560 if (p == NULL) {
7561 Py_DECREF(substring);
7562 return -1;
7563 }
Victor Stinner9f067f42013-06-05 00:21:31 +02007564 assert(size <= INT_MAX);
Martin v. Löwis3d325192011-11-04 18:23:06 +01007565
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007566 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007567 outsize = WideCharToMultiByte(code_page, flags,
Victor Stinner9f067f42013-06-05 00:21:31 +02007568 p, (int)size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007569 NULL, 0,
7570 NULL, pusedDefaultChar);
7571 if (outsize <= 0)
7572 goto error;
7573 /* If we used a default char, then we failed! */
Victor Stinner2fc507f2011-11-04 20:06:39 +01007574 if (pusedDefaultChar && *pusedDefaultChar) {
7575 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007576 return -2;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007577 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007578
Victor Stinner3a50e702011-10-18 21:21:00 +02007579 if (*outbytes == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007580 /* Create string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007581 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007582 if (*outbytes == NULL) {
7583 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007584 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007585 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007586 out = PyBytes_AS_STRING(*outbytes);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007587 }
7588 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007589 /* Extend string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007590 const Py_ssize_t n = PyBytes_Size(*outbytes);
7591 if (outsize > PY_SSIZE_T_MAX - n) {
7592 PyErr_NoMemory();
Victor Stinner2fc507f2011-11-04 20:06:39 +01007593 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007594 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007595 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007596 if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
7597 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007598 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007599 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007600 out = PyBytes_AS_STRING(*outbytes) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007601 }
7602
7603 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007604 outsize = WideCharToMultiByte(code_page, flags,
Victor Stinner9f067f42013-06-05 00:21:31 +02007605 p, (int)size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007606 out, outsize,
7607 NULL, pusedDefaultChar);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007608 Py_CLEAR(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007609 if (outsize <= 0)
7610 goto error;
7611 if (pusedDefaultChar && *pusedDefaultChar)
7612 return -2;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007613 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00007614
Victor Stinner3a50e702011-10-18 21:21:00 +02007615error:
Victor Stinner2fc507f2011-11-04 20:06:39 +01007616 Py_XDECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007617 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7618 return -2;
7619 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007620 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007621}
7622
Victor Stinner3a50e702011-10-18 21:21:00 +02007623/*
Serhiy Storchakad65c9492015-11-02 14:10:23 +02007624 * Encode a Unicode string to a Windows code page into a byte string using an
Victor Stinner3a50e702011-10-18 21:21:00 +02007625 * error handler.
7626 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007627 * Returns consumed characters if succeed, or raise an OSError and returns
Victor Stinner3a50e702011-10-18 21:21:00 +02007628 * -1 on other error.
7629 */
7630static int
7631encode_code_page_errors(UINT code_page, PyObject **outbytes,
Victor Stinner7581cef2011-11-03 22:32:33 +01007632 PyObject *unicode, Py_ssize_t unicode_offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007633 Py_ssize_t insize, const char* errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007634{
Victor Stinner3a50e702011-10-18 21:21:00 +02007635 const DWORD flags = encode_code_page_flags(code_page, errors);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007636 Py_ssize_t pos = unicode_offset;
7637 Py_ssize_t endin = unicode_offset + insize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007638 /* Ideally, we should get reason from FormatMessage. This is the Windows
7639 2000 English version of the message. */
7640 const char *reason = "invalid character";
7641 /* 4=maximum length of a UTF-8 sequence */
7642 char buffer[4];
7643 BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
7644 Py_ssize_t outsize;
7645 char *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007646 PyObject *errorHandler = NULL;
7647 PyObject *exc = NULL;
7648 PyObject *encoding_obj = NULL;
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007649 const char *encoding;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007650 Py_ssize_t newpos, newoutsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007651 PyObject *rep;
7652 int ret = -1;
7653
7654 assert(insize > 0);
7655
7656 encoding = code_page_name(code_page, &encoding_obj);
7657 if (encoding == NULL)
7658 return -1;
7659
7660 if (errors == NULL || strcmp(errors, "strict") == 0) {
7661 /* The last error was ERROR_NO_UNICODE_TRANSLATION,
7662 then we raise a UnicodeEncodeError. */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007663 make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
Victor Stinner3a50e702011-10-18 21:21:00 +02007664 if (exc != NULL) {
7665 PyCodec_StrictErrors(exc);
7666 Py_DECREF(exc);
7667 }
7668 Py_XDECREF(encoding_obj);
7669 return -1;
7670 }
7671
7672 if (code_page != CP_UTF8 && code_page != CP_UTF7)
7673 pusedDefaultChar = &usedDefaultChar;
7674 else
7675 pusedDefaultChar = NULL;
7676
7677 if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
7678 PyErr_NoMemory();
7679 goto error;
7680 }
7681 outsize = insize * Py_ARRAY_LENGTH(buffer);
7682
7683 if (*outbytes == NULL) {
7684 /* Create string object */
7685 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7686 if (*outbytes == NULL)
7687 goto error;
7688 out = PyBytes_AS_STRING(*outbytes);
7689 }
7690 else {
7691 /* Extend string object */
7692 Py_ssize_t n = PyBytes_Size(*outbytes);
7693 if (n > PY_SSIZE_T_MAX - outsize) {
7694 PyErr_NoMemory();
7695 goto error;
7696 }
7697 if (_PyBytes_Resize(outbytes, n + outsize) < 0)
7698 goto error;
7699 out = PyBytes_AS_STRING(*outbytes) + n;
7700 }
7701
7702 /* Encode the string character per character */
Martin v. Löwis3d325192011-11-04 18:23:06 +01007703 while (pos < endin)
Victor Stinner3a50e702011-10-18 21:21:00 +02007704 {
Victor Stinner2fc507f2011-11-04 20:06:39 +01007705 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
7706 wchar_t chars[2];
7707 int charsize;
7708 if (ch < 0x10000) {
7709 chars[0] = (wchar_t)ch;
7710 charsize = 1;
7711 }
7712 else {
Victor Stinner76df43d2012-10-30 01:42:39 +01007713 chars[0] = Py_UNICODE_HIGH_SURROGATE(ch);
7714 chars[1] = Py_UNICODE_LOW_SURROGATE(ch);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007715 charsize = 2;
7716 }
7717
Victor Stinner3a50e702011-10-18 21:21:00 +02007718 outsize = WideCharToMultiByte(code_page, flags,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007719 chars, charsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02007720 buffer, Py_ARRAY_LENGTH(buffer),
7721 NULL, pusedDefaultChar);
7722 if (outsize > 0) {
7723 if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
7724 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007725 pos++;
Victor Stinner3a50e702011-10-18 21:21:00 +02007726 memcpy(out, buffer, outsize);
7727 out += outsize;
7728 continue;
7729 }
7730 }
7731 else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
7732 PyErr_SetFromWindowsErr(0);
7733 goto error;
7734 }
7735
Victor Stinner3a50e702011-10-18 21:21:00 +02007736 rep = unicode_encode_call_errorhandler(
7737 errors, &errorHandler, encoding, reason,
Victor Stinner7581cef2011-11-03 22:32:33 +01007738 unicode, &exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007739 pos, pos + 1, &newpos);
Victor Stinner3a50e702011-10-18 21:21:00 +02007740 if (rep == NULL)
7741 goto error;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007742 pos = newpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007743
7744 if (PyBytes_Check(rep)) {
7745 outsize = PyBytes_GET_SIZE(rep);
7746 if (outsize != 1) {
7747 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7748 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7749 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7750 Py_DECREF(rep);
7751 goto error;
7752 }
7753 out = PyBytes_AS_STRING(*outbytes) + offset;
7754 }
7755 memcpy(out, PyBytes_AS_STRING(rep), outsize);
7756 out += outsize;
7757 }
7758 else {
7759 Py_ssize_t i;
7760 enum PyUnicode_Kind kind;
7761 void *data;
7762
Benjamin Petersonbac79492012-01-14 13:34:47 -05007763 if (PyUnicode_READY(rep) == -1) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007764 Py_DECREF(rep);
7765 goto error;
7766 }
7767
7768 outsize = PyUnicode_GET_LENGTH(rep);
7769 if (outsize != 1) {
7770 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7771 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7772 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7773 Py_DECREF(rep);
7774 goto error;
7775 }
7776 out = PyBytes_AS_STRING(*outbytes) + offset;
7777 }
7778 kind = PyUnicode_KIND(rep);
7779 data = PyUnicode_DATA(rep);
7780 for (i=0; i < outsize; i++) {
7781 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7782 if (ch > 127) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007783 raise_encode_exception(&exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007784 encoding, unicode,
7785 pos, pos + 1,
Victor Stinner3a50e702011-10-18 21:21:00 +02007786 "unable to encode error handler result to ASCII");
7787 Py_DECREF(rep);
7788 goto error;
7789 }
7790 *out = (unsigned char)ch;
7791 out++;
7792 }
7793 }
7794 Py_DECREF(rep);
7795 }
7796 /* write a NUL byte */
7797 *out = 0;
7798 outsize = out - PyBytes_AS_STRING(*outbytes);
7799 assert(outsize <= PyBytes_GET_SIZE(*outbytes));
7800 if (_PyBytes_Resize(outbytes, outsize) < 0)
7801 goto error;
7802 ret = 0;
7803
7804error:
7805 Py_XDECREF(encoding_obj);
7806 Py_XDECREF(errorHandler);
7807 Py_XDECREF(exc);
7808 return ret;
7809}
7810
Victor Stinner3a50e702011-10-18 21:21:00 +02007811static PyObject *
7812encode_code_page(int code_page,
Victor Stinner7581cef2011-11-03 22:32:33 +01007813 PyObject *unicode,
Victor Stinner3a50e702011-10-18 21:21:00 +02007814 const char *errors)
7815{
Martin v. Löwis3d325192011-11-04 18:23:06 +01007816 Py_ssize_t len;
Victor Stinner3a50e702011-10-18 21:21:00 +02007817 PyObject *outbytes = NULL;
Victor Stinner7581cef2011-11-03 22:32:33 +01007818 Py_ssize_t offset;
Victor Stinner76a31a62011-11-04 00:05:13 +01007819 int chunk_len, ret, done;
Victor Stinner7581cef2011-11-03 22:32:33 +01007820
Victor Stinner29dacf22015-01-26 16:41:32 +01007821 if (!PyUnicode_Check(unicode)) {
7822 PyErr_BadArgument();
7823 return NULL;
7824 }
7825
Benjamin Petersonbac79492012-01-14 13:34:47 -05007826 if (PyUnicode_READY(unicode) == -1)
Victor Stinner2fc507f2011-11-04 20:06:39 +01007827 return NULL;
7828 len = PyUnicode_GET_LENGTH(unicode);
Guido van Rossum03e29f12000-05-04 15:52:20 +00007829
Victor Stinner3a50e702011-10-18 21:21:00 +02007830 if (code_page < 0) {
7831 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7832 return NULL;
7833 }
7834
Martin v. Löwis3d325192011-11-04 18:23:06 +01007835 if (len == 0)
Victor Stinner76a31a62011-11-04 00:05:13 +01007836 return PyBytes_FromStringAndSize(NULL, 0);
7837
Victor Stinner7581cef2011-11-03 22:32:33 +01007838 offset = 0;
7839 do
7840 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007841#ifdef NEED_RETRY
Steve Dower7ebdda02019-08-21 16:22:33 -07007842 if (len > DECODING_CHUNK_SIZE) {
7843 chunk_len = DECODING_CHUNK_SIZE;
Victor Stinner76a31a62011-11-04 00:05:13 +01007844 done = 0;
7845 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007846 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007847#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007848 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007849 chunk_len = (int)len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007850 done = 1;
7851 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007852
Victor Stinner76a31a62011-11-04 00:05:13 +01007853 ret = encode_code_page_strict(code_page, &outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007854 unicode, offset, chunk_len,
Victor Stinner76a31a62011-11-04 00:05:13 +01007855 errors);
7856 if (ret == -2)
7857 ret = encode_code_page_errors(code_page, &outbytes,
7858 unicode, offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007859 chunk_len, errors);
Victor Stinner7581cef2011-11-03 22:32:33 +01007860 if (ret < 0) {
7861 Py_XDECREF(outbytes);
7862 return NULL;
7863 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007864
Victor Stinner7581cef2011-11-03 22:32:33 +01007865 offset += chunk_len;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007866 len -= chunk_len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007867 } while (!done);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007868
Victor Stinner3a50e702011-10-18 21:21:00 +02007869 return outbytes;
7870}
7871
7872PyObject *
7873PyUnicode_EncodeMBCS(const Py_UNICODE *p,
7874 Py_ssize_t size,
7875 const char *errors)
7876{
Victor Stinner7581cef2011-11-03 22:32:33 +01007877 PyObject *unicode, *res;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02007878 unicode = PyUnicode_FromWideChar(p, size);
Victor Stinner7581cef2011-11-03 22:32:33 +01007879 if (unicode == NULL)
7880 return NULL;
7881 res = encode_code_page(CP_ACP, unicode, errors);
7882 Py_DECREF(unicode);
7883 return res;
Victor Stinner3a50e702011-10-18 21:21:00 +02007884}
7885
7886PyObject *
7887PyUnicode_EncodeCodePage(int code_page,
7888 PyObject *unicode,
7889 const char *errors)
7890{
Victor Stinner7581cef2011-11-03 22:32:33 +01007891 return encode_code_page(code_page, unicode, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007892}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007893
Alexander Belopolsky40018472011-02-26 01:02:56 +00007894PyObject *
7895PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007896{
Victor Stinner7581cef2011-11-03 22:32:33 +01007897 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007898}
7899
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007900#undef NEED_RETRY
7901
Steve Dowercc16be82016-09-08 10:35:16 -07007902#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007903
Guido van Rossumd57fd912000-03-10 22:53:23 +00007904/* --- Character Mapping Codec -------------------------------------------- */
7905
Victor Stinnerfb161b12013-04-18 01:44:27 +02007906static int
7907charmap_decode_string(const char *s,
7908 Py_ssize_t size,
7909 PyObject *mapping,
7910 const char *errors,
7911 _PyUnicodeWriter *writer)
7912{
7913 const char *starts = s;
7914 const char *e;
7915 Py_ssize_t startinpos, endinpos;
7916 PyObject *errorHandler = NULL, *exc = NULL;
7917 Py_ssize_t maplen;
7918 enum PyUnicode_Kind mapkind;
7919 void *mapdata;
7920 Py_UCS4 x;
7921 unsigned char ch;
7922
7923 if (PyUnicode_READY(mapping) == -1)
7924 return -1;
7925
7926 maplen = PyUnicode_GET_LENGTH(mapping);
7927 mapdata = PyUnicode_DATA(mapping);
7928 mapkind = PyUnicode_KIND(mapping);
7929
7930 e = s + size;
7931
7932 if (mapkind == PyUnicode_1BYTE_KIND && maplen >= 256) {
7933 /* fast-path for cp037, cp500 and iso8859_1 encodings. iso8859_1
7934 * is disabled in encoding aliases, latin1 is preferred because
7935 * its implementation is faster. */
7936 Py_UCS1 *mapdata_ucs1 = (Py_UCS1 *)mapdata;
7937 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
7938 Py_UCS4 maxchar = writer->maxchar;
7939
7940 assert (writer->kind == PyUnicode_1BYTE_KIND);
7941 while (s < e) {
7942 ch = *s;
7943 x = mapdata_ucs1[ch];
7944 if (x > maxchar) {
7945 if (_PyUnicodeWriter_Prepare(writer, 1, 0xff) == -1)
7946 goto onError;
7947 maxchar = writer->maxchar;
7948 outdata = (Py_UCS1 *)writer->data;
7949 }
7950 outdata[writer->pos] = x;
7951 writer->pos++;
7952 ++s;
7953 }
7954 return 0;
7955 }
7956
7957 while (s < e) {
7958 if (mapkind == PyUnicode_2BYTE_KIND && maplen >= 256) {
7959 enum PyUnicode_Kind outkind = writer->kind;
7960 Py_UCS2 *mapdata_ucs2 = (Py_UCS2 *)mapdata;
7961 if (outkind == PyUnicode_1BYTE_KIND) {
7962 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
7963 Py_UCS4 maxchar = writer->maxchar;
7964 while (s < e) {
7965 ch = *s;
7966 x = mapdata_ucs2[ch];
7967 if (x > maxchar)
7968 goto Error;
7969 outdata[writer->pos] = x;
7970 writer->pos++;
7971 ++s;
7972 }
7973 break;
7974 }
7975 else if (outkind == PyUnicode_2BYTE_KIND) {
7976 Py_UCS2 *outdata = (Py_UCS2 *)writer->data;
7977 while (s < e) {
7978 ch = *s;
7979 x = mapdata_ucs2[ch];
7980 if (x == 0xFFFE)
7981 goto Error;
7982 outdata[writer->pos] = x;
7983 writer->pos++;
7984 ++s;
7985 }
7986 break;
7987 }
7988 }
7989 ch = *s;
7990
7991 if (ch < maplen)
7992 x = PyUnicode_READ(mapkind, mapdata, ch);
7993 else
7994 x = 0xfffe; /* invalid value */
7995Error:
7996 if (x == 0xfffe)
7997 {
7998 /* undefined mapping */
7999 startinpos = s-starts;
8000 endinpos = startinpos+1;
8001 if (unicode_decode_call_errorhandler_writer(
8002 errors, &errorHandler,
8003 "charmap", "character maps to <undefined>",
8004 &starts, &e, &startinpos, &endinpos, &exc, &s,
8005 writer)) {
8006 goto onError;
8007 }
8008 continue;
8009 }
8010
8011 if (_PyUnicodeWriter_WriteCharInline(writer, x) < 0)
8012 goto onError;
8013 ++s;
8014 }
8015 Py_XDECREF(errorHandler);
8016 Py_XDECREF(exc);
8017 return 0;
8018
8019onError:
8020 Py_XDECREF(errorHandler);
8021 Py_XDECREF(exc);
8022 return -1;
8023}
8024
8025static int
8026charmap_decode_mapping(const char *s,
8027 Py_ssize_t size,
8028 PyObject *mapping,
8029 const char *errors,
8030 _PyUnicodeWriter *writer)
8031{
8032 const char *starts = s;
8033 const char *e;
8034 Py_ssize_t startinpos, endinpos;
8035 PyObject *errorHandler = NULL, *exc = NULL;
8036 unsigned char ch;
Victor Stinnerf4f24242013-05-07 01:01:31 +02008037 PyObject *key, *item = NULL;
Victor Stinnerfb161b12013-04-18 01:44:27 +02008038
8039 e = s + size;
8040
8041 while (s < e) {
8042 ch = *s;
8043
8044 /* Get mapping (char ordinal -> integer, Unicode char or None) */
8045 key = PyLong_FromLong((long)ch);
8046 if (key == NULL)
8047 goto onError;
8048
8049 item = PyObject_GetItem(mapping, key);
8050 Py_DECREF(key);
8051 if (item == NULL) {
8052 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8053 /* No mapping found means: mapping is undefined. */
8054 PyErr_Clear();
8055 goto Undefined;
8056 } else
8057 goto onError;
8058 }
8059
8060 /* Apply mapping */
8061 if (item == Py_None)
8062 goto Undefined;
8063 if (PyLong_Check(item)) {
8064 long value = PyLong_AS_LONG(item);
8065 if (value == 0xFFFE)
8066 goto Undefined;
8067 if (value < 0 || value > MAX_UNICODE) {
8068 PyErr_Format(PyExc_TypeError,
8069 "character mapping must be in range(0x%lx)",
8070 (unsigned long)MAX_UNICODE + 1);
8071 goto onError;
8072 }
8073
8074 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
8075 goto onError;
8076 }
8077 else if (PyUnicode_Check(item)) {
8078 if (PyUnicode_READY(item) == -1)
8079 goto onError;
8080 if (PyUnicode_GET_LENGTH(item) == 1) {
8081 Py_UCS4 value = PyUnicode_READ_CHAR(item, 0);
8082 if (value == 0xFFFE)
8083 goto Undefined;
8084 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
8085 goto onError;
8086 }
8087 else {
8088 writer->overallocate = 1;
8089 if (_PyUnicodeWriter_WriteStr(writer, item) == -1)
8090 goto onError;
8091 }
8092 }
8093 else {
8094 /* wrong return value */
8095 PyErr_SetString(PyExc_TypeError,
8096 "character mapping must return integer, None or str");
8097 goto onError;
8098 }
8099 Py_CLEAR(item);
8100 ++s;
8101 continue;
8102
8103Undefined:
8104 /* undefined mapping */
8105 Py_CLEAR(item);
8106 startinpos = s-starts;
8107 endinpos = startinpos+1;
8108 if (unicode_decode_call_errorhandler_writer(
8109 errors, &errorHandler,
8110 "charmap", "character maps to <undefined>",
8111 &starts, &e, &startinpos, &endinpos, &exc, &s,
8112 writer)) {
8113 goto onError;
8114 }
8115 }
8116 Py_XDECREF(errorHandler);
8117 Py_XDECREF(exc);
8118 return 0;
8119
8120onError:
8121 Py_XDECREF(item);
8122 Py_XDECREF(errorHandler);
8123 Py_XDECREF(exc);
8124 return -1;
8125}
8126
Alexander Belopolsky40018472011-02-26 01:02:56 +00008127PyObject *
8128PyUnicode_DecodeCharmap(const char *s,
8129 Py_ssize_t size,
8130 PyObject *mapping,
8131 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008132{
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008133 _PyUnicodeWriter writer;
Tim Petersced69f82003-09-16 20:30:58 +00008134
Guido van Rossumd57fd912000-03-10 22:53:23 +00008135 /* Default to Latin-1 */
8136 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008137 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008138
Guido van Rossumd57fd912000-03-10 22:53:23 +00008139 if (size == 0)
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02008140 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner8f674cc2013-04-17 23:02:17 +02008141 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02008142 writer.min_length = size;
8143 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008144 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008145
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00008146 if (PyUnicode_CheckExact(mapping)) {
Victor Stinnerfb161b12013-04-18 01:44:27 +02008147 if (charmap_decode_string(s, size, mapping, errors, &writer) < 0)
8148 goto onError;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00008149 }
8150 else {
Victor Stinnerfb161b12013-04-18 01:44:27 +02008151 if (charmap_decode_mapping(s, size, mapping, errors, &writer) < 0)
8152 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008153 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008154 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00008155
Benjamin Peterson29060642009-01-31 22:14:21 +00008156 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008157 _PyUnicodeWriter_Dealloc(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008158 return NULL;
8159}
8160
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008161/* Charmap encoding: the lookup table */
8162
Alexander Belopolsky40018472011-02-26 01:02:56 +00008163struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00008164 PyObject_HEAD
8165 unsigned char level1[32];
8166 int count2, count3;
8167 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008168};
8169
8170static PyObject*
8171encoding_map_size(PyObject *obj, PyObject* args)
8172{
8173 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008174 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00008175 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008176}
8177
8178static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008179 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00008180 PyDoc_STR("Return the size (in bytes) of this object") },
8181 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008182};
8183
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008184static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008185 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008186 "EncodingMap", /*tp_name*/
8187 sizeof(struct encoding_map), /*tp_basicsize*/
8188 0, /*tp_itemsize*/
8189 /* methods */
Inada Naoki7d408692019-05-29 17:23:27 +09008190 0, /*tp_dealloc*/
Jeroen Demeyer530f5062019-05-31 04:13:39 +02008191 0, /*tp_vectorcall_offset*/
Benjamin Peterson29060642009-01-31 22:14:21 +00008192 0, /*tp_getattr*/
8193 0, /*tp_setattr*/
Jeroen Demeyer530f5062019-05-31 04:13:39 +02008194 0, /*tp_as_async*/
Benjamin Peterson29060642009-01-31 22:14:21 +00008195 0, /*tp_repr*/
8196 0, /*tp_as_number*/
8197 0, /*tp_as_sequence*/
8198 0, /*tp_as_mapping*/
8199 0, /*tp_hash*/
8200 0, /*tp_call*/
8201 0, /*tp_str*/
8202 0, /*tp_getattro*/
8203 0, /*tp_setattro*/
8204 0, /*tp_as_buffer*/
8205 Py_TPFLAGS_DEFAULT, /*tp_flags*/
8206 0, /*tp_doc*/
8207 0, /*tp_traverse*/
8208 0, /*tp_clear*/
8209 0, /*tp_richcompare*/
8210 0, /*tp_weaklistoffset*/
8211 0, /*tp_iter*/
8212 0, /*tp_iternext*/
8213 encoding_map_methods, /*tp_methods*/
8214 0, /*tp_members*/
8215 0, /*tp_getset*/
8216 0, /*tp_base*/
8217 0, /*tp_dict*/
8218 0, /*tp_descr_get*/
8219 0, /*tp_descr_set*/
8220 0, /*tp_dictoffset*/
8221 0, /*tp_init*/
8222 0, /*tp_alloc*/
8223 0, /*tp_new*/
8224 0, /*tp_free*/
8225 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008226};
8227
8228PyObject*
8229PyUnicode_BuildEncodingMap(PyObject* string)
8230{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008231 PyObject *result;
8232 struct encoding_map *mresult;
8233 int i;
8234 int need_dict = 0;
8235 unsigned char level1[32];
8236 unsigned char level2[512];
8237 unsigned char *mlevel1, *mlevel2, *mlevel3;
8238 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008239 int kind;
8240 void *data;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008241 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008242 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008243
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008244 if (!PyUnicode_Check(string) || !PyUnicode_GET_LENGTH(string)) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008245 PyErr_BadArgument();
8246 return NULL;
8247 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008248 kind = PyUnicode_KIND(string);
8249 data = PyUnicode_DATA(string);
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008250 length = PyUnicode_GET_LENGTH(string);
8251 length = Py_MIN(length, 256);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008252 memset(level1, 0xFF, sizeof level1);
8253 memset(level2, 0xFF, sizeof level2);
8254
8255 /* If there isn't a one-to-one mapping of NULL to \0,
8256 or if there are non-BMP characters, we need to use
8257 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008258 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008259 need_dict = 1;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008260 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008261 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008262 ch = PyUnicode_READ(kind, data, i);
8263 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008264 need_dict = 1;
8265 break;
8266 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008267 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008268 /* unmapped character */
8269 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008270 l1 = ch >> 11;
8271 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008272 if (level1[l1] == 0xFF)
8273 level1[l1] = count2++;
8274 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00008275 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008276 }
8277
8278 if (count2 >= 0xFF || count3 >= 0xFF)
8279 need_dict = 1;
8280
8281 if (need_dict) {
8282 PyObject *result = PyDict_New();
8283 PyObject *key, *value;
8284 if (!result)
8285 return NULL;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008286 for (i = 0; i < length; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008287 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00008288 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008289 if (!key || !value)
8290 goto failed1;
8291 if (PyDict_SetItem(result, key, value) == -1)
8292 goto failed1;
8293 Py_DECREF(key);
8294 Py_DECREF(value);
8295 }
8296 return result;
8297 failed1:
8298 Py_XDECREF(key);
8299 Py_XDECREF(value);
8300 Py_DECREF(result);
8301 return NULL;
8302 }
8303
8304 /* Create a three-level trie */
8305 result = PyObject_MALLOC(sizeof(struct encoding_map) +
8306 16*count2 + 128*count3 - 1);
8307 if (!result)
8308 return PyErr_NoMemory();
8309 PyObject_Init(result, &EncodingMapType);
8310 mresult = (struct encoding_map*)result;
8311 mresult->count2 = count2;
8312 mresult->count3 = count3;
8313 mlevel1 = mresult->level1;
8314 mlevel2 = mresult->level23;
8315 mlevel3 = mresult->level23 + 16*count2;
8316 memcpy(mlevel1, level1, 32);
8317 memset(mlevel2, 0xFF, 16*count2);
8318 memset(mlevel3, 0, 128*count3);
8319 count3 = 0;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008320 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008321 int o1, o2, o3, i2, i3;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008322 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8323 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008324 /* unmapped character */
8325 continue;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008326 o1 = ch>>11;
8327 o2 = (ch>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008328 i2 = 16*mlevel1[o1] + o2;
8329 if (mlevel2[i2] == 0xFF)
8330 mlevel2[i2] = count3++;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008331 o3 = ch & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008332 i3 = 128*mlevel2[i2] + o3;
8333 mlevel3[i3] = i;
8334 }
8335 return result;
8336}
8337
8338static int
Victor Stinner22168992011-11-20 17:09:18 +01008339encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008340{
8341 struct encoding_map *map = (struct encoding_map*)mapping;
8342 int l1 = c>>11;
8343 int l2 = (c>>7) & 0xF;
8344 int l3 = c & 0x7F;
8345 int i;
8346
Victor Stinner22168992011-11-20 17:09:18 +01008347 if (c > 0xFFFF)
Benjamin Peterson29060642009-01-31 22:14:21 +00008348 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008349 if (c == 0)
8350 return 0;
8351 /* level 1*/
8352 i = map->level1[l1];
8353 if (i == 0xFF) {
8354 return -1;
8355 }
8356 /* level 2*/
8357 i = map->level23[16*i+l2];
8358 if (i == 0xFF) {
8359 return -1;
8360 }
8361 /* level 3 */
8362 i = map->level23[16*map->count2 + 128*i + l3];
8363 if (i == 0) {
8364 return -1;
8365 }
8366 return i;
8367}
8368
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008369/* Lookup the character ch in the mapping. If the character
8370 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00008371 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008372static PyObject *
Victor Stinner22168992011-11-20 17:09:18 +01008373charmapencode_lookup(Py_UCS4 c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008374{
Christian Heimes217cfd12007-12-02 14:31:20 +00008375 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008376 PyObject *x;
8377
8378 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008379 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008380 x = PyObject_GetItem(mapping, w);
8381 Py_DECREF(w);
8382 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008383 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8384 /* No mapping found means: mapping is undefined. */
8385 PyErr_Clear();
Serhiy Storchaka228b12e2017-01-23 09:47:21 +02008386 Py_RETURN_NONE;
Benjamin Peterson29060642009-01-31 22:14:21 +00008387 } else
8388 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008389 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00008390 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008391 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00008392 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008393 long value = PyLong_AS_LONG(x);
8394 if (value < 0 || value > 255) {
8395 PyErr_SetString(PyExc_TypeError,
8396 "character mapping must be in range(256)");
8397 Py_DECREF(x);
8398 return NULL;
8399 }
8400 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008401 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008402 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008403 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008404 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008405 /* wrong return value */
8406 PyErr_Format(PyExc_TypeError,
8407 "character mapping must return integer, bytes or None, not %.400s",
8408 x->ob_type->tp_name);
8409 Py_DECREF(x);
8410 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008411 }
8412}
8413
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008414static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00008415charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008416{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008417 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
8418 /* exponentially overallocate to minimize reallocations */
8419 if (requiredsize < 2*outsize)
8420 requiredsize = 2*outsize;
8421 if (_PyBytes_Resize(outobj, requiredsize))
8422 return -1;
8423 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008424}
8425
/* Outcome of charmapencode_output() */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was encoded and written */
    enc_FAILED,     /* the character has no mapping; nothing was written */
    enc_EXCEPTION   /* the lookup or the output resize failed */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008429/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00008430 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008431 space is available. Return a new reference to the object that
8432 was put in the output buffer, or Py_None, if the mapping was undefined
8433 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00008434 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008435static charmapencode_result
Victor Stinner22168992011-11-20 17:09:18 +01008436charmapencode_output(Py_UCS4 c, PyObject *mapping,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008437 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008438{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008439 PyObject *rep;
8440 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00008441 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008442
Christian Heimes90aa7642007-12-19 02:45:37 +00008443 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008444 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008445 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008446 if (res == -1)
8447 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00008448 if (outsize<requiredsize)
8449 if (charmapencode_resize(outobj, outpos, requiredsize))
8450 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00008451 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008452 outstart[(*outpos)++] = (char)res;
8453 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008454 }
8455
8456 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008457 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008458 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008459 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008460 Py_DECREF(rep);
8461 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008462 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008463 if (PyLong_Check(rep)) {
8464 Py_ssize_t requiredsize = *outpos+1;
8465 if (outsize<requiredsize)
8466 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8467 Py_DECREF(rep);
8468 return enc_EXCEPTION;
8469 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008470 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008471 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008472 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008473 else {
8474 const char *repchars = PyBytes_AS_STRING(rep);
8475 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
8476 Py_ssize_t requiredsize = *outpos+repsize;
8477 if (outsize<requiredsize)
8478 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8479 Py_DECREF(rep);
8480 return enc_EXCEPTION;
8481 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008482 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008483 memcpy(outstart + *outpos, repchars, repsize);
8484 *outpos += repsize;
8485 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008486 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008487 Py_DECREF(rep);
8488 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008489}
8490
8491/* handle an error in PyUnicode_EncodeCharmap
8492 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008493static int
8494charmap_encoding_error(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008495 PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008496 PyObject **exceptionObject,
Victor Stinner50149202015-09-22 00:26:54 +02008497 _Py_error_handler *error_handler, PyObject **error_handler_obj, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00008498 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008499{
8500 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008501 Py_ssize_t size, repsize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008502 Py_ssize_t newpos;
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008503 enum PyUnicode_Kind kind;
8504 void *data;
8505 Py_ssize_t index;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008506 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008507 Py_ssize_t collstartpos = *inpos;
8508 Py_ssize_t collendpos = *inpos+1;
8509 Py_ssize_t collpos;
Serhiy Storchakae2f92de2017-11-11 13:06:26 +02008510 const char *encoding = "charmap";
8511 const char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008512 charmapencode_result x;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008513 Py_UCS4 ch;
Brian Curtin2787ea42011-11-02 15:09:37 -05008514 int val;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008515
Benjamin Petersonbac79492012-01-14 13:34:47 -05008516 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008517 return -1;
8518 size = PyUnicode_GET_LENGTH(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008519 /* find all unencodable characters */
8520 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008521 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00008522 if (Py_TYPE(mapping) == &EncodingMapType) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008523 ch = PyUnicode_READ_CHAR(unicode, collendpos);
Brian Curtin2787ea42011-11-02 15:09:37 -05008524 val = encoding_map_lookup(ch, mapping);
8525 if (val != -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008526 break;
8527 ++collendpos;
8528 continue;
8529 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008530
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008531 ch = PyUnicode_READ_CHAR(unicode, collendpos);
8532 rep = charmapencode_lookup(ch, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008533 if (rep==NULL)
8534 return -1;
8535 else if (rep!=Py_None) {
8536 Py_DECREF(rep);
8537 break;
8538 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008539 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00008540 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008541 }
8542 /* cache callback name lookup
8543 * (if not done yet, i.e. it's the first error) */
Victor Stinner50149202015-09-22 00:26:54 +02008544 if (*error_handler == _Py_ERROR_UNKNOWN)
Victor Stinner3d4226a2018-08-29 22:21:32 +02008545 *error_handler = _Py_GetErrorHandler(errors);
Victor Stinner50149202015-09-22 00:26:54 +02008546
8547 switch (*error_handler) {
8548 case _Py_ERROR_STRICT:
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008549 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008550 return -1;
Victor Stinner50149202015-09-22 00:26:54 +02008551
8552 case _Py_ERROR_REPLACE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008553 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008554 x = charmapencode_output('?', mapping, res, respos);
8555 if (x==enc_EXCEPTION) {
8556 return -1;
8557 }
8558 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008559 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008560 return -1;
8561 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008562 }
8563 /* fall through */
Victor Stinner50149202015-09-22 00:26:54 +02008564 case _Py_ERROR_IGNORE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008565 *inpos = collendpos;
8566 break;
Victor Stinner50149202015-09-22 00:26:54 +02008567
8568 case _Py_ERROR_XMLCHARREFREPLACE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008569 /* generate replacement (temporarily (mis)uses p) */
8570 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008571 char buffer[2+29+1+1];
8572 char *cp;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008573 sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
Benjamin Peterson29060642009-01-31 22:14:21 +00008574 for (cp = buffer; *cp; ++cp) {
8575 x = charmapencode_output(*cp, mapping, res, respos);
8576 if (x==enc_EXCEPTION)
8577 return -1;
8578 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008579 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008580 return -1;
8581 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008582 }
8583 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008584 *inpos = collendpos;
8585 break;
Victor Stinner50149202015-09-22 00:26:54 +02008586
Benjamin Peterson14339b62009-01-31 16:36:08 +00008587 default:
Victor Stinner50149202015-09-22 00:26:54 +02008588 repunicode = unicode_encode_call_errorhandler(errors, error_handler_obj,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008589 encoding, reason, unicode, exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00008590 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008591 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008592 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00008593 if (PyBytes_Check(repunicode)) {
8594 /* Directly copy bytes result to output. */
8595 Py_ssize_t outsize = PyBytes_Size(*res);
8596 Py_ssize_t requiredsize;
8597 repsize = PyBytes_Size(repunicode);
8598 requiredsize = *respos + repsize;
8599 if (requiredsize > outsize)
8600 /* Make room for all additional bytes. */
8601 if (charmapencode_resize(res, respos, requiredsize)) {
8602 Py_DECREF(repunicode);
8603 return -1;
8604 }
8605 memcpy(PyBytes_AsString(*res) + *respos,
8606 PyBytes_AsString(repunicode), repsize);
8607 *respos += repsize;
8608 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008609 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00008610 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008611 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008612 /* generate replacement */
Benjamin Petersonbac79492012-01-14 13:34:47 -05008613 if (PyUnicode_READY(repunicode) == -1) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008614 Py_DECREF(repunicode);
8615 return -1;
8616 }
Victor Stinner9e30aa52011-11-21 02:49:52 +01008617 repsize = PyUnicode_GET_LENGTH(repunicode);
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008618 data = PyUnicode_DATA(repunicode);
8619 kind = PyUnicode_KIND(repunicode);
8620 for (index = 0; index < repsize; index++) {
8621 Py_UCS4 repch = PyUnicode_READ(kind, data, index);
8622 x = charmapencode_output(repch, mapping, res, respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008623 if (x==enc_EXCEPTION) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008624 Py_DECREF(repunicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00008625 return -1;
8626 }
8627 else if (x==enc_FAILED) {
8628 Py_DECREF(repunicode);
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008629 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008630 return -1;
8631 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008632 }
8633 *inpos = newpos;
8634 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008635 }
8636 return 0;
8637}
8638
Alexander Belopolsky40018472011-02-26 01:02:56 +00008639PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008640_PyUnicode_EncodeCharmap(PyObject *unicode,
8641 PyObject *mapping,
8642 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008643{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008644 /* output object */
8645 PyObject *res = NULL;
8646 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008647 Py_ssize_t inpos = 0;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008648 Py_ssize_t size;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008649 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008650 Py_ssize_t respos = 0;
Victor Stinner50149202015-09-22 00:26:54 +02008651 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008652 PyObject *exc = NULL;
Victor Stinner50149202015-09-22 00:26:54 +02008653 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Victor Stinner69ed0f42013-04-09 21:48:24 +02008654 void *data;
8655 int kind;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008656
Benjamin Petersonbac79492012-01-14 13:34:47 -05008657 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008658 return NULL;
8659 size = PyUnicode_GET_LENGTH(unicode);
Victor Stinner69ed0f42013-04-09 21:48:24 +02008660 data = PyUnicode_DATA(unicode);
8661 kind = PyUnicode_KIND(unicode);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008662
Guido van Rossumd57fd912000-03-10 22:53:23 +00008663 /* Default to Latin-1 */
8664 if (mapping == NULL)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008665 return unicode_encode_ucs1(unicode, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008666
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008667 /* allocate enough for a simple encoding without
8668 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00008669 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008670 if (res == NULL)
8671 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00008672 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008673 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008674
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008675 while (inpos<size) {
Victor Stinner69ed0f42013-04-09 21:48:24 +02008676 Py_UCS4 ch = PyUnicode_READ(kind, data, inpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008677 /* try to encode it */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008678 charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008679 if (x==enc_EXCEPTION) /* error */
8680 goto onError;
8681 if (x==enc_FAILED) { /* unencodable character */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008682 if (charmap_encoding_error(unicode, &inpos, mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00008683 &exc,
Victor Stinner50149202015-09-22 00:26:54 +02008684 &error_handler, &error_handler_obj, errors,
Benjamin Peterson29060642009-01-31 22:14:21 +00008685 &res, &respos)) {
8686 goto onError;
8687 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008688 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008689 else
8690 /* done with this character => adjust input position */
8691 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008692 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008693
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008694 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00008695 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00008696 if (_PyBytes_Resize(&res, respos) < 0)
8697 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00008698
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008699 Py_XDECREF(exc);
Victor Stinner50149202015-09-22 00:26:54 +02008700 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008701 return res;
8702
Benjamin Peterson29060642009-01-31 22:14:21 +00008703 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008704 Py_XDECREF(res);
8705 Py_XDECREF(exc);
Victor Stinner50149202015-09-22 00:26:54 +02008706 Py_XDECREF(error_handler_obj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008707 return NULL;
8708}
8709
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008710/* Deprecated */
8711PyObject *
8712PyUnicode_EncodeCharmap(const Py_UNICODE *p,
8713 Py_ssize_t size,
8714 PyObject *mapping,
8715 const char *errors)
8716{
8717 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02008718 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008719 if (unicode == NULL)
8720 return NULL;
8721 result = _PyUnicode_EncodeCharmap(unicode, mapping, errors);
8722 Py_DECREF(unicode);
Victor Stinnerfc026c92011-11-04 00:24:51 +01008723 return result;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008724}
8725
Alexander Belopolsky40018472011-02-26 01:02:56 +00008726PyObject *
8727PyUnicode_AsCharmapString(PyObject *unicode,
8728 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008729{
8730 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008731 PyErr_BadArgument();
8732 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008733 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008734 return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008735}
8736
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008737/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008738static void
8739make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008740 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008741 Py_ssize_t startpos, Py_ssize_t endpos,
8742 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008743{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008744 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008745 *exceptionObject = _PyUnicodeTranslateError_Create(
8746 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008747 }
8748 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008749 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
8750 goto onError;
8751 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
8752 goto onError;
8753 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
8754 goto onError;
8755 return;
8756 onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02008757 Py_CLEAR(*exceptionObject);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008758 }
8759}
8760
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008761/* error handling callback helper:
8762 build arguments, call the callback and check the arguments,
8763 put the result into newpos and return the replacement string, which
8764 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008765static PyObject *
8766unicode_translate_call_errorhandler(const char *errors,
8767 PyObject **errorHandler,
8768 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008769 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008770 Py_ssize_t startpos, Py_ssize_t endpos,
8771 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008772{
Serhiy Storchakaf8d7d412016-10-23 15:12:25 +03008773 static const char *argparse = "Un;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008774
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008775 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008776 PyObject *restuple;
8777 PyObject *resunicode;
8778
8779 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008780 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008781 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008782 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008783 }
8784
8785 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008786 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008787 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008788 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008789
Jeroen Demeyer196a5302019-07-04 12:31:34 +02008790 restuple = _PyObject_CallOneArg(*errorHandler, *exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008791 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008792 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008793 if (!PyTuple_Check(restuple)) {
Serhiy Storchakaf8d7d412016-10-23 15:12:25 +03008794 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00008795 Py_DECREF(restuple);
8796 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008797 }
Serhiy Storchakaf8d7d412016-10-23 15:12:25 +03008798 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00008799 &resunicode, &i_newpos)) {
8800 Py_DECREF(restuple);
8801 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008802 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00008803 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008804 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008805 else
8806 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008807 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnera33bce02014-07-04 22:47:46 +02008808 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008809 Py_DECREF(restuple);
8810 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00008811 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008812 Py_INCREF(resunicode);
8813 Py_DECREF(restuple);
8814 return resunicode;
8815}
8816
8817/* Lookup the character ch in the mapping and put the result in result,
8818 which must be decrefed by the caller.
8819 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008820static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008821charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008822{
Christian Heimes217cfd12007-12-02 14:31:20 +00008823 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008824 PyObject *x;
8825
8826 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008827 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008828 x = PyObject_GetItem(mapping, w);
8829 Py_DECREF(w);
8830 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008831 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8832 /* No mapping found means: use 1:1 mapping. */
8833 PyErr_Clear();
8834 *result = NULL;
8835 return 0;
8836 } else
8837 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008838 }
8839 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008840 *result = x;
8841 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008842 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008843 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008844 long value = PyLong_AS_LONG(x);
Victor Stinner4ff33af2014-04-05 11:56:37 +02008845 if (value < 0 || value > MAX_UNICODE) {
8846 PyErr_Format(PyExc_ValueError,
8847 "character mapping must be in range(0x%x)",
8848 MAX_UNICODE+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00008849 Py_DECREF(x);
8850 return -1;
8851 }
8852 *result = x;
8853 return 0;
8854 }
8855 else if (PyUnicode_Check(x)) {
8856 *result = x;
8857 return 0;
8858 }
8859 else {
8860 /* wrong return value */
8861 PyErr_SetString(PyExc_TypeError,
8862 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00008863 Py_DECREF(x);
8864 return -1;
8865 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008866}
Victor Stinner1194ea02014-04-04 19:37:40 +02008867
8868/* lookup the character, write the result into the writer.
8869 Return 1 if the result was written into the writer, return 0 if the mapping
8870 was undefined, raise an exception return -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008871static int
Victor Stinner1194ea02014-04-04 19:37:40 +02008872charmaptranslate_output(Py_UCS4 ch, PyObject *mapping,
8873 _PyUnicodeWriter *writer)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008874{
Victor Stinner1194ea02014-04-04 19:37:40 +02008875 PyObject *item;
8876
8877 if (charmaptranslate_lookup(ch, mapping, &item))
Benjamin Peterson29060642009-01-31 22:14:21 +00008878 return -1;
Victor Stinner1194ea02014-04-04 19:37:40 +02008879
8880 if (item == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008881 /* not found => default to 1:1 mapping */
Victor Stinner1194ea02014-04-04 19:37:40 +02008882 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008883 return -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00008884 }
Victor Stinner1194ea02014-04-04 19:37:40 +02008885 return 1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008886 }
Victor Stinner1194ea02014-04-04 19:37:40 +02008887
8888 if (item == Py_None) {
8889 Py_DECREF(item);
8890 return 0;
8891 }
8892
8893 if (PyLong_Check(item)) {
Victor Stinner4ff33af2014-04-05 11:56:37 +02008894 long ch = (Py_UCS4)PyLong_AS_LONG(item);
8895 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
8896 used it */
Victor Stinner1194ea02014-04-04 19:37:40 +02008897 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
8898 Py_DECREF(item);
8899 return -1;
8900 }
8901 Py_DECREF(item);
8902 return 1;
8903 }
8904
8905 if (!PyUnicode_Check(item)) {
8906 Py_DECREF(item);
Benjamin Peterson29060642009-01-31 22:14:21 +00008907 return -1;
Victor Stinner1194ea02014-04-04 19:37:40 +02008908 }
8909
8910 if (_PyUnicodeWriter_WriteStr(writer, item) < 0) {
8911 Py_DECREF(item);
8912 return -1;
8913 }
8914
8915 Py_DECREF(item);
8916 return 1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008917}
8918
Victor Stinner89a76ab2014-04-05 11:44:04 +02008919static int
8920unicode_fast_translate_lookup(PyObject *mapping, Py_UCS1 ch,
8921 Py_UCS1 *translate)
8922{
Benjamin Peterson1365de72014-04-07 20:15:41 -04008923 PyObject *item = NULL;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008924 int ret = 0;
8925
Victor Stinner89a76ab2014-04-05 11:44:04 +02008926 if (charmaptranslate_lookup(ch, mapping, &item)) {
8927 return -1;
8928 }
8929
8930 if (item == Py_None) {
Benjamin Peterson1365de72014-04-07 20:15:41 -04008931 /* deletion */
Victor Stinner872b2912014-04-05 14:27:07 +02008932 translate[ch] = 0xfe;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008933 }
Benjamin Peterson1365de72014-04-07 20:15:41 -04008934 else if (item == NULL) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02008935 /* not found => default to 1:1 mapping */
8936 translate[ch] = ch;
8937 return 1;
8938 }
Benjamin Peterson1365de72014-04-07 20:15:41 -04008939 else if (PyLong_Check(item)) {
Victor Stinner4dd25252014-04-08 09:14:21 +02008940 long replace = PyLong_AS_LONG(item);
Victor Stinner4ff33af2014-04-05 11:56:37 +02008941 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
8942 used it */
8943 if (127 < replace) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02008944 /* invalid character or character outside ASCII:
8945 skip the fast translate */
8946 goto exit;
8947 }
8948 translate[ch] = (Py_UCS1)replace;
8949 }
8950 else if (PyUnicode_Check(item)) {
8951 Py_UCS4 replace;
8952
8953 if (PyUnicode_READY(item) == -1) {
8954 Py_DECREF(item);
8955 return -1;
8956 }
8957 if (PyUnicode_GET_LENGTH(item) != 1)
8958 goto exit;
8959
8960 replace = PyUnicode_READ_CHAR(item, 0);
8961 if (replace > 127)
8962 goto exit;
8963 translate[ch] = (Py_UCS1)replace;
8964 }
8965 else {
Benjamin Peterson1365de72014-04-07 20:15:41 -04008966 /* not None, NULL, long or unicode */
Victor Stinner89a76ab2014-04-05 11:44:04 +02008967 goto exit;
8968 }
Victor Stinner89a76ab2014-04-05 11:44:04 +02008969 ret = 1;
8970
Benjamin Peterson1365de72014-04-07 20:15:41 -04008971 exit:
8972 Py_DECREF(item);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008973 return ret;
8974}
8975
8976/* Fast path for ascii => ascii translation. Return 1 if the whole string
8977 was translated into writer, return 0 if the input string was partially
8978 translated into writer, raise an exception and return -1 on error. */
8979static int
8980unicode_fast_translate(PyObject *input, PyObject *mapping,
Victor Stinner6c9aa8f2016-03-01 21:30:30 +01008981 _PyUnicodeWriter *writer, int ignore,
8982 Py_ssize_t *input_pos)
Victor Stinner89a76ab2014-04-05 11:44:04 +02008983{
Victor Stinner872b2912014-04-05 14:27:07 +02008984 Py_UCS1 ascii_table[128], ch, ch2;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008985 Py_ssize_t len;
8986 Py_UCS1 *in, *end, *out;
Victor Stinner872b2912014-04-05 14:27:07 +02008987 int res = 0;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008988
Victor Stinner89a76ab2014-04-05 11:44:04 +02008989 len = PyUnicode_GET_LENGTH(input);
8990
Victor Stinner872b2912014-04-05 14:27:07 +02008991 memset(ascii_table, 0xff, 128);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008992
8993 in = PyUnicode_1BYTE_DATA(input);
8994 end = in + len;
8995
8996 assert(PyUnicode_IS_ASCII(writer->buffer));
8997 assert(PyUnicode_GET_LENGTH(writer->buffer) == len);
8998 out = PyUnicode_1BYTE_DATA(writer->buffer);
8999
Victor Stinner872b2912014-04-05 14:27:07 +02009000 for (; in < end; in++) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02009001 ch = *in;
Victor Stinner872b2912014-04-05 14:27:07 +02009002 ch2 = ascii_table[ch];
Victor Stinner89a76ab2014-04-05 11:44:04 +02009003 if (ch2 == 0xff) {
Victor Stinner872b2912014-04-05 14:27:07 +02009004 int translate = unicode_fast_translate_lookup(mapping, ch,
9005 ascii_table);
9006 if (translate < 0)
Victor Stinner89a76ab2014-04-05 11:44:04 +02009007 return -1;
Victor Stinner872b2912014-04-05 14:27:07 +02009008 if (translate == 0)
9009 goto exit;
9010 ch2 = ascii_table[ch];
Victor Stinner89a76ab2014-04-05 11:44:04 +02009011 }
Victor Stinner872b2912014-04-05 14:27:07 +02009012 if (ch2 == 0xfe) {
9013 if (ignore)
9014 continue;
9015 goto exit;
9016 }
9017 assert(ch2 < 128);
Victor Stinner89a76ab2014-04-05 11:44:04 +02009018 *out = ch2;
Victor Stinner872b2912014-04-05 14:27:07 +02009019 out++;
Victor Stinner89a76ab2014-04-05 11:44:04 +02009020 }
Victor Stinner872b2912014-04-05 14:27:07 +02009021 res = 1;
9022
9023exit:
9024 writer->pos = out - PyUnicode_1BYTE_DATA(writer->buffer);
Victor Stinner6c9aa8f2016-03-01 21:30:30 +01009025 *input_pos = in - PyUnicode_1BYTE_DATA(input);
Victor Stinner872b2912014-04-05 14:27:07 +02009026 return res;
Victor Stinner89a76ab2014-04-05 11:44:04 +02009027}
9028
Victor Stinner3222da22015-10-01 22:07:32 +02009029static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009030_PyUnicode_TranslateCharmap(PyObject *input,
9031 PyObject *mapping,
9032 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009033{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009034 /* input object */
Victor Stinner1194ea02014-04-04 19:37:40 +02009035 char *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009036 Py_ssize_t size, i;
9037 int kind;
9038 /* output buffer */
Victor Stinner1194ea02014-04-04 19:37:40 +02009039 _PyUnicodeWriter writer;
9040 /* error handler */
Serhiy Storchakae2f92de2017-11-11 13:06:26 +02009041 const char *reason = "character maps to <undefined>";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009042 PyObject *errorHandler = NULL;
9043 PyObject *exc = NULL;
Victor Stinner1194ea02014-04-04 19:37:40 +02009044 int ignore;
Victor Stinner89a76ab2014-04-05 11:44:04 +02009045 int res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009046
Guido van Rossumd57fd912000-03-10 22:53:23 +00009047 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009048 PyErr_BadArgument();
9049 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009050 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009051
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009052 if (PyUnicode_READY(input) == -1)
9053 return NULL;
Victor Stinner1194ea02014-04-04 19:37:40 +02009054 data = (char*)PyUnicode_DATA(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009055 kind = PyUnicode_KIND(input);
9056 size = PyUnicode_GET_LENGTH(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009057
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009058 if (size == 0)
9059 return PyUnicode_FromObject(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009060
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009061 /* allocate enough for a simple 1:1 translation without
9062 replacements, if we need more, we'll resize */
Victor Stinner1194ea02014-04-04 19:37:40 +02009063 _PyUnicodeWriter_Init(&writer);
9064 if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00009065 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009066
Victor Stinner872b2912014-04-05 14:27:07 +02009067 ignore = (errors != NULL && strcmp(errors, "ignore") == 0);
9068
Victor Stinner33798672016-03-01 21:59:58 +01009069 if (PyUnicode_READY(input) == -1)
Victor Stinner89a76ab2014-04-05 11:44:04 +02009070 return NULL;
Victor Stinner33798672016-03-01 21:59:58 +01009071 if (PyUnicode_IS_ASCII(input)) {
9072 res = unicode_fast_translate(input, mapping, &writer, ignore, &i);
9073 if (res < 0) {
9074 _PyUnicodeWriter_Dealloc(&writer);
9075 return NULL;
9076 }
9077 if (res == 1)
9078 return _PyUnicodeWriter_Finish(&writer);
Victor Stinner89a76ab2014-04-05 11:44:04 +02009079 }
Victor Stinner33798672016-03-01 21:59:58 +01009080 else {
9081 i = 0;
9082 }
Victor Stinner89a76ab2014-04-05 11:44:04 +02009083
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009084 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009085 /* try to encode it */
Victor Stinner1194ea02014-04-04 19:37:40 +02009086 int translate;
9087 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
9088 Py_ssize_t newpos;
9089 /* startpos for collecting untranslatable chars */
9090 Py_ssize_t collstart;
9091 Py_ssize_t collend;
Victor Stinner1194ea02014-04-04 19:37:40 +02009092 Py_UCS4 ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009093
Victor Stinner1194ea02014-04-04 19:37:40 +02009094 ch = PyUnicode_READ(kind, data, i);
9095 translate = charmaptranslate_output(ch, mapping, &writer);
9096 if (translate < 0)
9097 goto onError;
9098
9099 if (translate != 0) {
9100 /* it worked => adjust input pointer */
9101 ++i;
9102 continue;
9103 }
9104
9105 /* untranslatable character */
9106 collstart = i;
9107 collend = i+1;
9108
9109 /* find all untranslatable characters */
9110 while (collend < size) {
9111 PyObject *x;
9112 ch = PyUnicode_READ(kind, data, collend);
9113 if (charmaptranslate_lookup(ch, mapping, &x))
Benjamin Peterson14339b62009-01-31 16:36:08 +00009114 goto onError;
Victor Stinner1194ea02014-04-04 19:37:40 +02009115 Py_XDECREF(x);
9116 if (x != Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00009117 break;
Victor Stinner1194ea02014-04-04 19:37:40 +02009118 ++collend;
9119 }
9120
9121 if (ignore) {
9122 i = collend;
9123 }
9124 else {
9125 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
9126 reason, input, &exc,
9127 collstart, collend, &newpos);
9128 if (repunicode == NULL)
9129 goto onError;
9130 if (_PyUnicodeWriter_WriteStr(&writer, repunicode) < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009131 Py_DECREF(repunicode);
Victor Stinner1194ea02014-04-04 19:37:40 +02009132 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009133 }
Victor Stinner1194ea02014-04-04 19:37:40 +02009134 Py_DECREF(repunicode);
9135 i = newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009136 }
9137 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009138 Py_XDECREF(exc);
9139 Py_XDECREF(errorHandler);
Victor Stinner1194ea02014-04-04 19:37:40 +02009140 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009141
Benjamin Peterson29060642009-01-31 22:14:21 +00009142 onError:
Victor Stinner1194ea02014-04-04 19:37:40 +02009143 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009144 Py_XDECREF(exc);
9145 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009146 return NULL;
9147}
9148
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009149/* Deprecated. Use PyUnicode_Translate instead. */
9150PyObject *
9151PyUnicode_TranslateCharmap(const Py_UNICODE *p,
9152 Py_ssize_t size,
9153 PyObject *mapping,
9154 const char *errors)
9155{
Christian Heimes5f520f42012-09-11 14:03:25 +02009156 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02009157 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009158 if (!unicode)
9159 return NULL;
Christian Heimes5f520f42012-09-11 14:03:25 +02009160 result = _PyUnicode_TranslateCharmap(unicode, mapping, errors);
9161 Py_DECREF(unicode);
9162 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009163}
9164
Alexander Belopolsky40018472011-02-26 01:02:56 +00009165PyObject *
9166PyUnicode_Translate(PyObject *str,
9167 PyObject *mapping,
9168 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009169{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009170 if (ensure_unicode(str) < 0)
Christian Heimes5f520f42012-09-11 14:03:25 +02009171 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009172 return _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009173}
Tim Petersced69f82003-09-16 20:30:58 +00009174
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009175PyObject *
9176_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
9177{
9178 if (!PyUnicode_Check(unicode)) {
9179 PyErr_BadInternalCall();
9180 return NULL;
9181 }
9182 if (PyUnicode_READY(unicode) == -1)
9183 return NULL;
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009184 if (PyUnicode_IS_ASCII(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009185 /* If the string is already ASCII, just return the same string */
9186 Py_INCREF(unicode);
9187 return unicode;
9188 }
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009189
9190 Py_ssize_t len = PyUnicode_GET_LENGTH(unicode);
9191 PyObject *result = PyUnicode_New(len, 127);
9192 if (result == NULL) {
9193 return NULL;
9194 }
9195
9196 Py_UCS1 *out = PyUnicode_1BYTE_DATA(result);
9197 int kind = PyUnicode_KIND(unicode);
9198 const void *data = PyUnicode_DATA(unicode);
9199 Py_ssize_t i;
9200 for (i = 0; i < len; ++i) {
9201 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9202 if (ch < 127) {
9203 out[i] = ch;
9204 }
9205 else if (Py_UNICODE_ISSPACE(ch)) {
9206 out[i] = ' ';
9207 }
9208 else {
9209 int decimal = Py_UNICODE_TODECIMAL(ch);
9210 if (decimal < 0) {
9211 out[i] = '?';
INADA Naoki16dfca42018-07-14 12:06:43 +09009212 out[i+1] = '\0';
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009213 _PyUnicode_LENGTH(result) = i + 1;
9214 break;
9215 }
9216 out[i] = '0' + decimal;
9217 }
9218 }
9219
INADA Naoki16dfca42018-07-14 12:06:43 +09009220 assert(_PyUnicode_CheckConsistency(result, 1));
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009221 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009222}
9223
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009224PyObject *
9225PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
9226 Py_ssize_t length)
9227{
Victor Stinnerf0124502011-11-21 23:12:56 +01009228 PyObject *decimal;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009229 Py_ssize_t i;
Victor Stinnerf0124502011-11-21 23:12:56 +01009230 Py_UCS4 maxchar;
9231 enum PyUnicode_Kind kind;
9232 void *data;
9233
Victor Stinner99d7ad02012-02-22 13:37:39 +01009234 maxchar = 127;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009235 for (i = 0; i < length; i++) {
Victor Stinner12174a52014-08-15 23:17:38 +02009236 Py_UCS4 ch = s[i];
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009237 if (ch > 127) {
9238 int decimal = Py_UNICODE_TODECIMAL(ch);
9239 if (decimal >= 0)
Victor Stinnerf0124502011-11-21 23:12:56 +01009240 ch = '0' + decimal;
Benjamin Peterson7e303732013-06-10 09:19:46 -07009241 maxchar = Py_MAX(maxchar, ch);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009242 }
9243 }
Victor Stinnerf0124502011-11-21 23:12:56 +01009244
9245 /* Copy to a new string */
9246 decimal = PyUnicode_New(length, maxchar);
9247 if (decimal == NULL)
9248 return decimal;
9249 kind = PyUnicode_KIND(decimal);
9250 data = PyUnicode_DATA(decimal);
9251 /* Iterate over code points */
9252 for (i = 0; i < length; i++) {
Victor Stinner12174a52014-08-15 23:17:38 +02009253 Py_UCS4 ch = s[i];
Victor Stinnerf0124502011-11-21 23:12:56 +01009254 if (ch > 127) {
9255 int decimal = Py_UNICODE_TODECIMAL(ch);
9256 if (decimal >= 0)
9257 ch = '0' + decimal;
9258 }
9259 PyUnicode_WRITE(kind, data, i, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009260 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01009261 return unicode_result(decimal);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009262}
Guido van Rossum9e896b32000-04-05 20:11:21 +00009263/* --- Decimal Encoder ---------------------------------------------------- */
9264
Alexander Belopolsky40018472011-02-26 01:02:56 +00009265int
9266PyUnicode_EncodeDecimal(Py_UNICODE *s,
9267 Py_ssize_t length,
9268 char *output,
9269 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00009270{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01009271 PyObject *unicode;
Victor Stinner6345be92011-11-25 20:09:01 +01009272 Py_ssize_t i;
Victor Stinner42bf7752011-11-21 22:52:58 +01009273 enum PyUnicode_Kind kind;
9274 void *data;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009275
9276 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009277 PyErr_BadArgument();
9278 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009279 }
9280
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02009281 unicode = PyUnicode_FromWideChar(s, length);
Victor Stinner42bf7752011-11-21 22:52:58 +01009282 if (unicode == NULL)
9283 return -1;
9284
Victor Stinner42bf7752011-11-21 22:52:58 +01009285 kind = PyUnicode_KIND(unicode);
9286 data = PyUnicode_DATA(unicode);
9287
Victor Stinnerb84d7232011-11-22 01:50:07 +01009288 for (i=0; i < length; ) {
Victor Stinner6345be92011-11-25 20:09:01 +01009289 PyObject *exc;
9290 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00009291 int decimal;
Victor Stinner6345be92011-11-25 20:09:01 +01009292 Py_ssize_t startpos;
9293
9294 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +00009295
Benjamin Peterson29060642009-01-31 22:14:21 +00009296 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009297 *output++ = ' ';
Victor Stinnerb84d7232011-11-22 01:50:07 +01009298 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009299 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009300 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009301 decimal = Py_UNICODE_TODECIMAL(ch);
9302 if (decimal >= 0) {
9303 *output++ = '0' + decimal;
Victor Stinnerb84d7232011-11-22 01:50:07 +01009304 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009305 continue;
9306 }
9307 if (0 < ch && ch < 256) {
9308 *output++ = (char)ch;
Victor Stinnerb84d7232011-11-22 01:50:07 +01009309 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009310 continue;
9311 }
Victor Stinner6345be92011-11-25 20:09:01 +01009312
Victor Stinner42bf7752011-11-21 22:52:58 +01009313 startpos = i;
Victor Stinner6345be92011-11-25 20:09:01 +01009314 exc = NULL;
9315 raise_encode_exception(&exc, "decimal", unicode,
9316 startpos, startpos+1,
9317 "invalid decimal Unicode string");
9318 Py_XDECREF(exc);
9319 Py_DECREF(unicode);
9320 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009321 }
9322 /* 0-terminate the output string */
9323 *output++ = '\0';
Victor Stinner42bf7752011-11-21 22:52:58 +01009324 Py_DECREF(unicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00009325 return 0;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009326}
9327
Guido van Rossumd57fd912000-03-10 22:53:23 +00009328/* --- Helpers ------------------------------------------------------------ */
9329
/* helper macro to fixup start/end slice values

   Clamps `end` to at most `len`; a negative `end` or `start` is taken
   relative to `len` and then clamped to 0.  `start` and `end` must be
   modifiable lvalues; `len` is evaluated more than once, so it must not
   have side effects.

   The body is wrapped in do { } while (0) so that the macro expands to a
   single statement and can be used safely in an unbraced if/else. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len))                      \
            (end) = (len);                      \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
9344
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009345static Py_ssize_t
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009346any_find_slice(PyObject* s1, PyObject* s2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009347 Py_ssize_t start,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009348 Py_ssize_t end,
9349 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009350{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009351 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009352 void *buf1, *buf2;
9353 Py_ssize_t len1, len2, result;
9354
9355 kind1 = PyUnicode_KIND(s1);
9356 kind2 = PyUnicode_KIND(s2);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009357 if (kind1 < kind2)
9358 return -1;
9359
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009360 len1 = PyUnicode_GET_LENGTH(s1);
9361 len2 = PyUnicode_GET_LENGTH(s2);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009362 ADJUST_INDICES(start, end, len1);
9363 if (end - start < len2)
9364 return -1;
9365
9366 buf1 = PyUnicode_DATA(s1);
9367 buf2 = PyUnicode_DATA(s2);
9368 if (len2 == 1) {
9369 Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
9370 result = findchar((const char *)buf1 + kind1*start,
9371 kind1, end - start, ch, direction);
9372 if (result == -1)
9373 return -1;
9374 else
9375 return start + result;
9376 }
9377
9378 if (kind2 != kind1) {
9379 buf2 = _PyUnicode_AsKind(s2, kind1);
9380 if (!buf2)
9381 return -2;
9382 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009383
Victor Stinner794d5672011-10-10 03:21:36 +02009384 if (direction > 0) {
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009385 switch (kind1) {
Victor Stinner794d5672011-10-10 03:21:36 +02009386 case PyUnicode_1BYTE_KIND:
9387 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9388 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
9389 else
9390 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
9391 break;
9392 case PyUnicode_2BYTE_KIND:
9393 result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
9394 break;
9395 case PyUnicode_4BYTE_KIND:
9396 result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
9397 break;
9398 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07009399 Py_UNREACHABLE();
Victor Stinner794d5672011-10-10 03:21:36 +02009400 }
9401 }
9402 else {
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009403 switch (kind1) {
Victor Stinner794d5672011-10-10 03:21:36 +02009404 case PyUnicode_1BYTE_KIND:
9405 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9406 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
9407 else
9408 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9409 break;
9410 case PyUnicode_2BYTE_KIND:
9411 result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9412 break;
9413 case PyUnicode_4BYTE_KIND:
9414 result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9415 break;
9416 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07009417 Py_UNREACHABLE();
Victor Stinner794d5672011-10-10 03:21:36 +02009418 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009419 }
9420
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009421 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009422 PyMem_Free(buf2);
9423
9424 return result;
9425}
9426
Victor Stinner59423e32018-11-26 13:40:01 +01009427/* _PyUnicode_InsertThousandsGrouping() helper functions */
9428#include "stringlib/localeutil.h"
9429
9430/**
9431 * InsertThousandsGrouping:
9432 * @writer: Unicode writer.
9433 * @n_buffer: Number of characters in @buffer.
9434 * @digits: Digits we're reading from. If count is non-NULL, this is unused.
9435 * @d_pos: Start of digits string.
9436 * @n_digits: The number of digits in the string, in which we want
9437 * to put the grouping chars.
9438 * @min_width: The minimum width of the digits in the output string.
9439 * Output will be zero-padded on the left to fill.
9440 * @grouping: see definition in localeconv().
9441 * @thousands_sep: see definition in localeconv().
9442 *
9443 * There are 2 modes: counting and filling. If @writer is NULL,
9444 * we are in counting mode, else filling mode.
9445 * If counting, the required buffer size is returned.
9446 * If filling, we know the buffer will be large enough, so we don't
9447 * need to pass in the buffer size.
9448 * Inserts thousand grouping characters (as defined by grouping and
9449 * thousands_sep) into @writer.
9450 *
9451 * Return value: -1 on error, number of characters otherwise.
9452 **/
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009453Py_ssize_t
Victor Stinner41a863c2012-02-24 00:37:51 +01009454_PyUnicode_InsertThousandsGrouping(
Victor Stinner59423e32018-11-26 13:40:01 +01009455 _PyUnicodeWriter *writer,
Victor Stinner41a863c2012-02-24 00:37:51 +01009456 Py_ssize_t n_buffer,
Victor Stinner59423e32018-11-26 13:40:01 +01009457 PyObject *digits,
9458 Py_ssize_t d_pos,
9459 Py_ssize_t n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009460 Py_ssize_t min_width,
Victor Stinner59423e32018-11-26 13:40:01 +01009461 const char *grouping,
9462 PyObject *thousands_sep,
Victor Stinner41a863c2012-02-24 00:37:51 +01009463 Py_UCS4 *maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009464{
Xtreak3f7983a2019-01-07 20:39:14 +05309465 min_width = Py_MAX(0, min_width);
Victor Stinner59423e32018-11-26 13:40:01 +01009466 if (writer) {
9467 assert(digits != NULL);
9468 assert(maxchar == NULL);
Victor Stinner41a863c2012-02-24 00:37:51 +01009469 }
9470 else {
Victor Stinner59423e32018-11-26 13:40:01 +01009471 assert(digits == NULL);
9472 assert(maxchar != NULL);
Victor Stinner41a863c2012-02-24 00:37:51 +01009473 }
Victor Stinner59423e32018-11-26 13:40:01 +01009474 assert(0 <= d_pos);
9475 assert(0 <= n_digits);
Victor Stinner59423e32018-11-26 13:40:01 +01009476 assert(grouping != NULL);
9477
9478 if (digits != NULL) {
9479 if (PyUnicode_READY(digits) == -1) {
9480 return -1;
Victor Stinner90f50d42012-02-24 01:44:47 +01009481 }
Victor Stinner59423e32018-11-26 13:40:01 +01009482 }
9483 if (PyUnicode_READY(thousands_sep) == -1) {
9484 return -1;
Victor Stinner41a863c2012-02-24 00:37:51 +01009485 }
9486
Victor Stinner59423e32018-11-26 13:40:01 +01009487 Py_ssize_t count = 0;
9488 Py_ssize_t n_zeros;
9489 int loop_broken = 0;
9490 int use_separator = 0; /* First time through, don't append the
9491 separator. They only go between
9492 groups. */
9493 Py_ssize_t buffer_pos;
9494 Py_ssize_t digits_pos;
9495 Py_ssize_t len;
9496 Py_ssize_t n_chars;
9497 Py_ssize_t remaining = n_digits; /* Number of chars remaining to
9498 be looked at */
9499 /* A generator that returns all of the grouping widths, until it
9500 returns 0. */
9501 GroupGenerator groupgen;
9502 GroupGenerator_init(&groupgen, grouping);
9503 const Py_ssize_t thousands_sep_len = PyUnicode_GET_LENGTH(thousands_sep);
9504
9505 /* if digits are not grouped, thousands separator
9506 should be an empty string */
9507 assert(!(grouping[0] == CHAR_MAX && thousands_sep_len != 0));
9508
9509 digits_pos = d_pos + n_digits;
9510 if (writer) {
9511 buffer_pos = writer->pos + n_buffer;
9512 assert(buffer_pos <= PyUnicode_GET_LENGTH(writer->buffer));
9513 assert(digits_pos <= PyUnicode_GET_LENGTH(digits));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009514 }
Victor Stinner59423e32018-11-26 13:40:01 +01009515 else {
9516 buffer_pos = n_buffer;
Victor Stinner90f50d42012-02-24 01:44:47 +01009517 }
Victor Stinner59423e32018-11-26 13:40:01 +01009518
9519 if (!writer) {
Victor Stinner41a863c2012-02-24 00:37:51 +01009520 *maxchar = 127;
Victor Stinner41a863c2012-02-24 00:37:51 +01009521 }
Victor Stinner59423e32018-11-26 13:40:01 +01009522
9523 while ((len = GroupGenerator_next(&groupgen)) > 0) {
9524 len = Py_MIN(len, Py_MAX(Py_MAX(remaining, min_width), 1));
9525 n_zeros = Py_MAX(0, len - remaining);
9526 n_chars = Py_MAX(0, Py_MIN(remaining, len));
9527
9528 /* Use n_zero zero's and n_chars chars */
9529
9530 /* Count only, don't do anything. */
9531 count += (use_separator ? thousands_sep_len : 0) + n_zeros + n_chars;
9532
9533 /* Copy into the writer. */
9534 InsertThousandsGrouping_fill(writer, &buffer_pos,
9535 digits, &digits_pos,
9536 n_chars, n_zeros,
9537 use_separator ? thousands_sep : NULL,
9538 thousands_sep_len, maxchar);
9539
9540 /* Use a separator next time. */
9541 use_separator = 1;
9542
9543 remaining -= n_chars;
9544 min_width -= len;
9545
9546 if (remaining <= 0 && min_width <= 0) {
9547 loop_broken = 1;
9548 break;
9549 }
9550 min_width -= thousands_sep_len;
9551 }
9552 if (!loop_broken) {
9553 /* We left the loop without using a break statement. */
9554
9555 len = Py_MAX(Py_MAX(remaining, min_width), 1);
9556 n_zeros = Py_MAX(0, len - remaining);
9557 n_chars = Py_MAX(0, Py_MIN(remaining, len));
9558
9559 /* Use n_zero zero's and n_chars chars */
9560 count += (use_separator ? thousands_sep_len : 0) + n_zeros + n_chars;
9561
9562 /* Copy into the writer. */
9563 InsertThousandsGrouping_fill(writer, &buffer_pos,
9564 digits, &digits_pos,
9565 n_chars, n_zeros,
9566 use_separator ? thousands_sep : NULL,
9567 thousands_sep_len, maxchar);
9568 }
9569 return count;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009570}
9571
9572
Alexander Belopolsky40018472011-02-26 01:02:56 +00009573Py_ssize_t
9574PyUnicode_Count(PyObject *str,
9575 PyObject *substr,
9576 Py_ssize_t start,
9577 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009578{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009579 Py_ssize_t result;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009580 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009581 void *buf1 = NULL, *buf2 = NULL;
9582 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00009583
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009584 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009585 return -1;
Tim Petersced69f82003-09-16 20:30:58 +00009586
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009587 kind1 = PyUnicode_KIND(str);
9588 kind2 = PyUnicode_KIND(substr);
9589 if (kind1 < kind2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009590 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009591
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009592 len1 = PyUnicode_GET_LENGTH(str);
9593 len2 = PyUnicode_GET_LENGTH(substr);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009594 ADJUST_INDICES(start, end, len1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009595 if (end - start < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009596 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009597
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009598 buf1 = PyUnicode_DATA(str);
9599 buf2 = PyUnicode_DATA(substr);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009600 if (kind2 != kind1) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009601 buf2 = _PyUnicode_AsKind(substr, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009602 if (!buf2)
9603 goto onError;
9604 }
9605
9606 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009607 case PyUnicode_1BYTE_KIND:
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009608 if (PyUnicode_IS_ASCII(str) && PyUnicode_IS_ASCII(substr))
Victor Stinnerc3cec782011-10-05 21:24:08 +02009609 result = asciilib_count(
9610 ((Py_UCS1*)buf1) + start, end - start,
9611 buf2, len2, PY_SSIZE_T_MAX
9612 );
9613 else
9614 result = ucs1lib_count(
9615 ((Py_UCS1*)buf1) + start, end - start,
9616 buf2, len2, PY_SSIZE_T_MAX
9617 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009618 break;
9619 case PyUnicode_2BYTE_KIND:
9620 result = ucs2lib_count(
9621 ((Py_UCS2*)buf1) + start, end - start,
9622 buf2, len2, PY_SSIZE_T_MAX
9623 );
9624 break;
9625 case PyUnicode_4BYTE_KIND:
9626 result = ucs4lib_count(
9627 ((Py_UCS4*)buf1) + start, end - start,
9628 buf2, len2, PY_SSIZE_T_MAX
9629 );
9630 break;
9631 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07009632 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009633 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009634
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009635 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009636 PyMem_Free(buf2);
9637
Guido van Rossumd57fd912000-03-10 22:53:23 +00009638 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009639 onError:
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009640 if (kind2 != kind1 && buf2)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009641 PyMem_Free(buf2);
9642 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009643}
9644
Alexander Belopolsky40018472011-02-26 01:02:56 +00009645Py_ssize_t
9646PyUnicode_Find(PyObject *str,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009647 PyObject *substr,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009648 Py_ssize_t start,
9649 Py_ssize_t end,
9650 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009651{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009652 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009653 return -2;
Tim Petersced69f82003-09-16 20:30:58 +00009654
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009655 return any_find_slice(str, substr, start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009656}
9657
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009658Py_ssize_t
9659PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
9660 Py_ssize_t start, Py_ssize_t end,
9661 int direction)
9662{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009663 int kind;
Xiang Zhangb2110682016-12-20 22:52:33 +08009664 Py_ssize_t len, result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009665 if (PyUnicode_READY(str) == -1)
9666 return -2;
Xiang Zhangb2110682016-12-20 22:52:33 +08009667 len = PyUnicode_GET_LENGTH(str);
9668 ADJUST_INDICES(start, end, len);
9669 if (end - start < 1)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009670 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009671 kind = PyUnicode_KIND(str);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009672 result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
9673 kind, end-start, ch, direction);
9674 if (result == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009675 return -1;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009676 else
9677 return start + result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009678}
9679
Alexander Belopolsky40018472011-02-26 01:02:56 +00009680static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009681tailmatch(PyObject *self,
9682 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009683 Py_ssize_t start,
9684 Py_ssize_t end,
9685 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009686{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009687 int kind_self;
9688 int kind_sub;
9689 void *data_self;
9690 void *data_sub;
9691 Py_ssize_t offset;
9692 Py_ssize_t i;
9693 Py_ssize_t end_sub;
9694
9695 if (PyUnicode_READY(self) == -1 ||
9696 PyUnicode_READY(substring) == -1)
Victor Stinner18aa4472013-01-03 03:18:09 +01009697 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009698
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009699 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
9700 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009701 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00009702 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009703
Serhiy Storchakad4ea03c2015-05-31 09:15:51 +03009704 if (PyUnicode_GET_LENGTH(substring) == 0)
9705 return 1;
9706
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009707 kind_self = PyUnicode_KIND(self);
9708 data_self = PyUnicode_DATA(self);
9709 kind_sub = PyUnicode_KIND(substring);
9710 data_sub = PyUnicode_DATA(substring);
9711 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
9712
9713 if (direction > 0)
9714 offset = end;
9715 else
9716 offset = start;
9717
9718 if (PyUnicode_READ(kind_self, data_self, offset) ==
9719 PyUnicode_READ(kind_sub, data_sub, 0) &&
9720 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
9721 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
9722 /* If both are of the same kind, memcmp is sufficient */
9723 if (kind_self == kind_sub) {
9724 return ! memcmp((char *)data_self +
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009725 (offset * PyUnicode_KIND(substring)),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009726 data_sub,
9727 PyUnicode_GET_LENGTH(substring) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009728 PyUnicode_KIND(substring));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009729 }
Martin Pantere26da7c2016-06-02 10:07:09 +00009730 /* otherwise we have to compare each character by first accessing it */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009731 else {
9732 /* We do not need to compare 0 and len(substring)-1 because
9733 the if statement above ensured already that they are equal
9734 when we end up here. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009735 for (i = 1; i < end_sub; ++i) {
9736 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
9737 PyUnicode_READ(kind_sub, data_sub, i))
9738 return 0;
9739 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009740 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009741 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009742 }
9743
9744 return 0;
9745}
9746
Alexander Belopolsky40018472011-02-26 01:02:56 +00009747Py_ssize_t
9748PyUnicode_Tailmatch(PyObject *str,
9749 PyObject *substr,
9750 Py_ssize_t start,
9751 Py_ssize_t end,
9752 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009753{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009754 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009755 return -1;
Tim Petersced69f82003-09-16 20:30:58 +00009756
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009757 return tailmatch(str, substr, start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009758}
9759
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009760static PyObject *
9761ascii_upper_or_lower(PyObject *self, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009762{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009763 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9764 char *resdata, *data = PyUnicode_DATA(self);
9765 PyObject *res;
Tim Petersced69f82003-09-16 20:30:58 +00009766
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009767 res = PyUnicode_New(len, 127);
9768 if (res == NULL)
9769 return NULL;
9770 resdata = PyUnicode_DATA(res);
9771 if (lower)
9772 _Py_bytes_lower(resdata, data, len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009773 else
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009774 _Py_bytes_upper(resdata, data, len);
9775 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009776}
9777
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009778static Py_UCS4
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009779handle_capital_sigma(int kind, void *data, Py_ssize_t length, Py_ssize_t i)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009780{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009781 Py_ssize_t j;
9782 int final_sigma;
Victor Stinner0c39b1b2015-03-18 15:02:06 +01009783 Py_UCS4 c = 0; /* initialize to prevent gcc warning */
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009784 /* U+03A3 is in the Final_Sigma context when, it is found like this:
Tim Petersced69f82003-09-16 20:30:58 +00009785
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009786 \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased})
9787
9788 where ! is a negation and \p{xxx} is a character with property xxx.
9789 */
9790 for (j = i - 1; j >= 0; j--) {
9791 c = PyUnicode_READ(kind, data, j);
9792 if (!_PyUnicode_IsCaseIgnorable(c))
9793 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009794 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009795 final_sigma = j >= 0 && _PyUnicode_IsCased(c);
9796 if (final_sigma) {
9797 for (j = i + 1; j < length; j++) {
9798 c = PyUnicode_READ(kind, data, j);
9799 if (!_PyUnicode_IsCaseIgnorable(c))
9800 break;
9801 }
9802 final_sigma = j == length || !_PyUnicode_IsCased(c);
9803 }
9804 return (final_sigma) ? 0x3C2 : 0x3C3;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009805}
9806
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009807static int
9808lower_ucs4(int kind, void *data, Py_ssize_t length, Py_ssize_t i,
9809 Py_UCS4 c, Py_UCS4 *mapped)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009810{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009811 /* Obscure special case. */
9812 if (c == 0x3A3) {
9813 mapped[0] = handle_capital_sigma(kind, data, length, i);
9814 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009815 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009816 return _PyUnicode_ToLowerFull(c, mapped);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009817}
9818
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009819static Py_ssize_t
9820do_capitalize(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009821{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009822 Py_ssize_t i, k = 0;
9823 int n_res, j;
9824 Py_UCS4 c, mapped[3];
Tim Petersced69f82003-09-16 20:30:58 +00009825
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009826 c = PyUnicode_READ(kind, data, 0);
Kingsley Mb015fc82019-04-12 16:35:39 +01009827 n_res = _PyUnicode_ToTitleFull(c, mapped);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009828 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009829 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009830 res[k++] = mapped[j];
Guido van Rossumd57fd912000-03-10 22:53:23 +00009831 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009832 for (i = 1; i < length; i++) {
9833 c = PyUnicode_READ(kind, data, i);
9834 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9835 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009836 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009837 res[k++] = mapped[j];
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009838 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009839 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009840 return k;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009841}
9842
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009843static Py_ssize_t
9844do_swapcase(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) {
9845 Py_ssize_t i, k = 0;
9846
9847 for (i = 0; i < length; i++) {
9848 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9849 int n_res, j;
9850 if (Py_UNICODE_ISUPPER(c)) {
9851 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9852 }
9853 else if (Py_UNICODE_ISLOWER(c)) {
9854 n_res = _PyUnicode_ToUpperFull(c, mapped);
9855 }
9856 else {
9857 n_res = 1;
9858 mapped[0] = c;
9859 }
9860 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009861 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009862 res[k++] = mapped[j];
9863 }
9864 }
9865 return k;
9866}
9867
9868static Py_ssize_t
9869do_upper_or_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res,
9870 Py_UCS4 *maxchar, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009871{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009872 Py_ssize_t i, k = 0;
9873
9874 for (i = 0; i < length; i++) {
9875 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9876 int n_res, j;
9877 if (lower)
9878 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9879 else
9880 n_res = _PyUnicode_ToUpperFull(c, mapped);
9881 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009882 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009883 res[k++] = mapped[j];
9884 }
9885 }
9886 return k;
9887}
9888
9889static Py_ssize_t
9890do_upper(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9891{
9892 return do_upper_or_lower(kind, data, length, res, maxchar, 0);
9893}
9894
9895static Py_ssize_t
9896do_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9897{
9898 return do_upper_or_lower(kind, data, length, res, maxchar, 1);
9899}
9900
Benjamin Petersone51757f2012-01-12 21:10:29 -05009901static Py_ssize_t
Benjamin Petersond5890c82012-01-14 13:23:30 -05009902do_casefold(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9903{
9904 Py_ssize_t i, k = 0;
9905
9906 for (i = 0; i < length; i++) {
9907 Py_UCS4 c = PyUnicode_READ(kind, data, i);
9908 Py_UCS4 mapped[3];
9909 int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
9910 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009911 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersond5890c82012-01-14 13:23:30 -05009912 res[k++] = mapped[j];
9913 }
9914 }
9915 return k;
9916}
9917
9918static Py_ssize_t
Benjamin Petersone51757f2012-01-12 21:10:29 -05009919do_title(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9920{
9921 Py_ssize_t i, k = 0;
9922 int previous_is_cased;
9923
9924 previous_is_cased = 0;
9925 for (i = 0; i < length; i++) {
9926 const Py_UCS4 c = PyUnicode_READ(kind, data, i);
9927 Py_UCS4 mapped[3];
9928 int n_res, j;
9929
9930 if (previous_is_cased)
9931 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9932 else
9933 n_res = _PyUnicode_ToTitleFull(c, mapped);
9934
9935 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009936 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersone51757f2012-01-12 21:10:29 -05009937 res[k++] = mapped[j];
9938 }
9939
9940 previous_is_cased = _PyUnicode_IsCased(c);
9941 }
9942 return k;
9943}
9944
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009945static PyObject *
9946case_operation(PyObject *self,
9947 Py_ssize_t (*perform)(int, void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *))
9948{
9949 PyObject *res = NULL;
9950 Py_ssize_t length, newlength = 0;
9951 int kind, outkind;
9952 void *data, *outdata;
9953 Py_UCS4 maxchar = 0, *tmp, *tmpend;
9954
Benjamin Petersoneea48462012-01-16 14:28:50 -05009955 assert(PyUnicode_IS_READY(self));
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009956
9957 kind = PyUnicode_KIND(self);
9958 data = PyUnicode_DATA(self);
9959 length = PyUnicode_GET_LENGTH(self);
Antoine Pitrou4e334242014-10-15 23:14:53 +02009960 if ((size_t) length > PY_SSIZE_T_MAX / (3 * sizeof(Py_UCS4))) {
Benjamin Petersone1bd38c2014-10-15 11:47:36 -04009961 PyErr_SetString(PyExc_OverflowError, "string is too long");
9962 return NULL;
9963 }
Benjamin Peterson1e211ff2014-10-15 12:17:21 -04009964 tmp = PyMem_MALLOC(sizeof(Py_UCS4) * 3 * length);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009965 if (tmp == NULL)
9966 return PyErr_NoMemory();
9967 newlength = perform(kind, data, length, tmp, &maxchar);
9968 res = PyUnicode_New(newlength, maxchar);
9969 if (res == NULL)
9970 goto leave;
9971 tmpend = tmp + newlength;
9972 outdata = PyUnicode_DATA(res);
9973 outkind = PyUnicode_KIND(res);
9974 switch (outkind) {
9975 case PyUnicode_1BYTE_KIND:
9976 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata);
9977 break;
9978 case PyUnicode_2BYTE_KIND:
9979 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata);
9980 break;
9981 case PyUnicode_4BYTE_KIND:
9982 memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength);
9983 break;
9984 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07009985 Py_UNREACHABLE();
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009986 }
9987 leave:
9988 PyMem_FREE(tmp);
9989 return res;
9990}
9991
Tim Peters8ce9f162004-08-27 01:49:32 +00009992PyObject *
9993PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009994{
Serhiy Storchakaea525a22016-09-06 22:07:53 +03009995 PyObject *res;
9996 PyObject *fseq;
9997 Py_ssize_t seqlen;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009998 PyObject **items;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009999
Benjamin Peterson9743b2c2014-02-15 13:02:52 -050010000 fseq = PySequence_Fast(seq, "can only join an iterable");
Tim Peters05eba1f2004-08-27 21:32:02 +000010001 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010002 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +000010003 }
10004
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010005 /* NOTE: the following code can't call back into Python code,
10006 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +000010007 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010008
Serhiy Storchakaea525a22016-09-06 22:07:53 +030010009 items = PySequence_Fast_ITEMS(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +000010010 seqlen = PySequence_Fast_GET_SIZE(fseq);
Serhiy Storchakaea525a22016-09-06 22:07:53 +030010011 res = _PyUnicode_JoinArray(separator, items, seqlen);
10012 Py_DECREF(fseq);
10013 return res;
10014}
10015
10016PyObject *
Serhiy Storchakaa5552f02017-12-15 13:11:11 +020010017_PyUnicode_JoinArray(PyObject *separator, PyObject *const *items, Py_ssize_t seqlen)
Serhiy Storchakaea525a22016-09-06 22:07:53 +030010018{
10019 PyObject *res = NULL; /* the result */
10020 PyObject *sep = NULL;
10021 Py_ssize_t seplen;
10022 PyObject *item;
10023 Py_ssize_t sz, i, res_offset;
10024 Py_UCS4 maxchar;
10025 Py_UCS4 item_maxchar;
10026 int use_memcpy;
10027 unsigned char *res_data = NULL, *sep_data = NULL;
10028 PyObject *last_obj;
10029 unsigned int kind = 0;
10030
Tim Peters05eba1f2004-08-27 21:32:02 +000010031 /* If empty sequence, return u"". */
10032 if (seqlen == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020010033 _Py_RETURN_UNICODE_EMPTY();
Tim Peters05eba1f2004-08-27 21:32:02 +000010034 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020010035
Tim Peters05eba1f2004-08-27 21:32:02 +000010036 /* If singleton sequence with an exact Unicode, return that. */
Victor Stinnerdd077322011-10-07 17:02:31 +020010037 last_obj = NULL;
Victor Stinneracf47b82011-10-06 12:32:37 +020010038 if (seqlen == 1) {
10039 if (PyUnicode_CheckExact(items[0])) {
10040 res = items[0];
10041 Py_INCREF(res);
Victor Stinneracf47b82011-10-06 12:32:37 +020010042 return res;
10043 }
Victor Stinnerdd077322011-10-07 17:02:31 +020010044 seplen = 0;
Victor Stinnerc6f0df72011-10-06 15:58:54 +020010045 maxchar = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +000010046 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010047 else {
Victor Stinneracf47b82011-10-06 12:32:37 +020010048 /* Set up sep and seplen */
10049 if (separator == NULL) {
10050 /* fall back to a blank space separator */
10051 sep = PyUnicode_FromOrdinal(' ');
10052 if (!sep)
10053 goto onError;
Victor Stinnerdd077322011-10-07 17:02:31 +020010054 seplen = 1;
Victor Stinneracf47b82011-10-06 12:32:37 +020010055 maxchar = 32;
Tim Peters05eba1f2004-08-27 21:32:02 +000010056 }
Victor Stinneracf47b82011-10-06 12:32:37 +020010057 else {
10058 if (!PyUnicode_Check(separator)) {
10059 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020010060 "separator: expected str instance,"
10061 " %.80s found",
10062 Py_TYPE(separator)->tp_name);
Victor Stinneracf47b82011-10-06 12:32:37 +020010063 goto onError;
10064 }
10065 if (PyUnicode_READY(separator))
10066 goto onError;
10067 sep = separator;
10068 seplen = PyUnicode_GET_LENGTH(separator);
10069 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
10070 /* inc refcount to keep this code path symmetric with the
10071 above case of a blank separator */
10072 Py_INCREF(sep);
10073 }
Victor Stinnerdd077322011-10-07 17:02:31 +020010074 last_obj = sep;
Tim Peters05eba1f2004-08-27 21:32:02 +000010075 }
10076
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010077 /* There are at least two things to join, or else we have a subclass
10078 * of str in the sequence.
10079 * Do a pre-pass to figure out the total amount of space we'll
10080 * need (sz), and see whether all argument are strings.
10081 */
10082 sz = 0;
Victor Stinnerdd077322011-10-07 17:02:31 +020010083#ifdef Py_DEBUG
10084 use_memcpy = 0;
10085#else
10086 use_memcpy = 1;
10087#endif
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010088 for (i = 0; i < seqlen; i++) {
Xiang Zhangb0541f42017-01-10 10:52:00 +080010089 size_t add_sz;
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010090 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +000010091 if (!PyUnicode_Check(item)) {
10092 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020010093 "sequence item %zd: expected str instance,"
10094 " %.80s found",
10095 i, Py_TYPE(item)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000010096 goto onError;
10097 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010098 if (PyUnicode_READY(item) == -1)
10099 goto onError;
Xiang Zhangb0541f42017-01-10 10:52:00 +080010100 add_sz = PyUnicode_GET_LENGTH(item);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010101 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010102 maxchar = Py_MAX(maxchar, item_maxchar);
Xiang Zhangb0541f42017-01-10 10:52:00 +080010103 if (i != 0) {
10104 add_sz += seplen;
10105 }
10106 if (add_sz > (size_t)(PY_SSIZE_T_MAX - sz)) {
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010107 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010108 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010109 goto onError;
10110 }
Xiang Zhangb0541f42017-01-10 10:52:00 +080010111 sz += add_sz;
Victor Stinnerdd077322011-10-07 17:02:31 +020010112 if (use_memcpy && last_obj != NULL) {
10113 if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
10114 use_memcpy = 0;
10115 }
10116 last_obj = item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010117 }
Tim Petersced69f82003-09-16 20:30:58 +000010118
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010119 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010120 if (res == NULL)
10121 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +000010122
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010123 /* Catenate everything. */
Victor Stinnerdd077322011-10-07 17:02:31 +020010124#ifdef Py_DEBUG
10125 use_memcpy = 0;
10126#else
10127 if (use_memcpy) {
10128 res_data = PyUnicode_1BYTE_DATA(res);
10129 kind = PyUnicode_KIND(res);
10130 if (seplen != 0)
10131 sep_data = PyUnicode_1BYTE_DATA(sep);
10132 }
10133#endif
Victor Stinner4560f9c2013-04-14 18:56:46 +020010134 if (use_memcpy) {
10135 for (i = 0; i < seqlen; ++i) {
10136 Py_ssize_t itemlen;
10137 item = items[i];
10138
10139 /* Copy item, and maybe the separator. */
10140 if (i && seplen != 0) {
Christian Heimesf051e432016-09-13 20:22:02 +020010141 memcpy(res_data,
Victor Stinnerdd077322011-10-07 17:02:31 +020010142 sep_data,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010143 kind * seplen);
10144 res_data += kind * seplen;
Victor Stinnerdd077322011-10-07 17:02:31 +020010145 }
Victor Stinner4560f9c2013-04-14 18:56:46 +020010146
10147 itemlen = PyUnicode_GET_LENGTH(item);
10148 if (itemlen != 0) {
Christian Heimesf051e432016-09-13 20:22:02 +020010149 memcpy(res_data,
Victor Stinnerdd077322011-10-07 17:02:31 +020010150 PyUnicode_DATA(item),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010151 kind * itemlen);
10152 res_data += kind * itemlen;
Victor Stinnerdd077322011-10-07 17:02:31 +020010153 }
Victor Stinner4560f9c2013-04-14 18:56:46 +020010154 }
10155 assert(res_data == PyUnicode_1BYTE_DATA(res)
10156 + kind * PyUnicode_GET_LENGTH(res));
10157 }
10158 else {
10159 for (i = 0, res_offset = 0; i < seqlen; ++i) {
10160 Py_ssize_t itemlen;
10161 item = items[i];
10162
10163 /* Copy item, and maybe the separator. */
10164 if (i && seplen != 0) {
10165 _PyUnicode_FastCopyCharacters(res, res_offset, sep, 0, seplen);
10166 res_offset += seplen;
10167 }
10168
10169 itemlen = PyUnicode_GET_LENGTH(item);
10170 if (itemlen != 0) {
Victor Stinnerd3f08822012-05-29 12:57:52 +020010171 _PyUnicode_FastCopyCharacters(res, res_offset, item, 0, itemlen);
Victor Stinnerdd077322011-10-07 17:02:31 +020010172 res_offset += itemlen;
10173 }
Victor Stinner9ce5a832011-10-03 23:36:02 +020010174 }
Victor Stinnerdd077322011-10-07 17:02:31 +020010175 assert(res_offset == PyUnicode_GET_LENGTH(res));
Victor Stinner4560f9c2013-04-14 18:56:46 +020010176 }
Tim Peters8ce9f162004-08-27 01:49:32 +000010177
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010178 Py_XDECREF(sep);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010179 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010180 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010181
Benjamin Peterson29060642009-01-31 22:14:21 +000010182 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010183 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +000010184 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010185 return NULL;
10186}
10187
Victor Stinnerd3f08822012-05-29 12:57:52 +020010188void
10189_PyUnicode_FastFill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10190 Py_UCS4 fill_char)
10191{
10192 const enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
Victor Stinner163403a2018-11-27 12:41:17 +010010193 void *data = PyUnicode_DATA(unicode);
Victor Stinnerd3f08822012-05-29 12:57:52 +020010194 assert(PyUnicode_IS_READY(unicode));
10195 assert(unicode_modifiable(unicode));
10196 assert(fill_char <= PyUnicode_MAX_CHAR_VALUE(unicode));
10197 assert(start >= 0);
10198 assert(start + length <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner59423e32018-11-26 13:40:01 +010010199 unicode_fill(kind, data, fill_char, start, length);
Victor Stinnerd3f08822012-05-29 12:57:52 +020010200}
10201
Victor Stinner3fe55312012-01-04 00:33:50 +010010202Py_ssize_t
10203PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10204 Py_UCS4 fill_char)
10205{
10206 Py_ssize_t maxlen;
Victor Stinner3fe55312012-01-04 00:33:50 +010010207
10208 if (!PyUnicode_Check(unicode)) {
10209 PyErr_BadInternalCall();
10210 return -1;
10211 }
10212 if (PyUnicode_READY(unicode) == -1)
10213 return -1;
10214 if (unicode_check_modifiable(unicode))
10215 return -1;
10216
Victor Stinnerd3f08822012-05-29 12:57:52 +020010217 if (start < 0) {
10218 PyErr_SetString(PyExc_IndexError, "string index out of range");
10219 return -1;
10220 }
Victor Stinner3fe55312012-01-04 00:33:50 +010010221 if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) {
10222 PyErr_SetString(PyExc_ValueError,
10223 "fill character is bigger than "
10224 "the string maximum character");
10225 return -1;
10226 }
10227
10228 maxlen = PyUnicode_GET_LENGTH(unicode) - start;
10229 length = Py_MIN(maxlen, length);
10230 if (length <= 0)
10231 return 0;
10232
Victor Stinnerd3f08822012-05-29 12:57:52 +020010233 _PyUnicode_FastFill(unicode, start, length, fill_char);
Victor Stinner3fe55312012-01-04 00:33:50 +010010234 return length;
10235}
10236
Victor Stinner9310abb2011-10-05 00:59:23 +020010237static PyObject *
10238pad(PyObject *self,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010239 Py_ssize_t left,
10240 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010241 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010242{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010243 PyObject *u;
10244 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010245 int kind;
10246 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010247
10248 if (left < 0)
10249 left = 0;
10250 if (right < 0)
10251 right = 0;
10252
Victor Stinnerc4b49542011-12-11 22:44:26 +010010253 if (left == 0 && right == 0)
10254 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010255
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010256 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
10257 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +000010258 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
10259 return NULL;
10260 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010261 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010262 maxchar = Py_MAX(maxchar, fill);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010263 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010264 if (!u)
10265 return NULL;
10266
10267 kind = PyUnicode_KIND(u);
10268 data = PyUnicode_DATA(u);
10269 if (left)
Victor Stinner59423e32018-11-26 13:40:01 +010010270 unicode_fill(kind, data, fill, 0, left);
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010271 if (right)
Victor Stinner59423e32018-11-26 13:40:01 +010010272 unicode_fill(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinnerd3f08822012-05-29 12:57:52 +020010273 _PyUnicode_FastCopyCharacters(u, left, self, 0, _PyUnicode_LENGTH(self));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010274 assert(_PyUnicode_CheckConsistency(u, 1));
10275 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010276}
10277
Alexander Belopolsky40018472011-02-26 01:02:56 +000010278PyObject *
10279PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010280{
Guido van Rossumd57fd912000-03-10 22:53:23 +000010281 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010282
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010283 if (ensure_unicode(string) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010284 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010285
Benjamin Petersonead6b532011-12-20 17:23:42 -060010286 switch (PyUnicode_KIND(string)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010287 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010288 if (PyUnicode_IS_ASCII(string))
10289 list = asciilib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010290 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010291 PyUnicode_GET_LENGTH(string), keepends);
10292 else
10293 list = ucs1lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010294 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010295 PyUnicode_GET_LENGTH(string), keepends);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010296 break;
10297 case PyUnicode_2BYTE_KIND:
10298 list = ucs2lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010299 string, PyUnicode_2BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010300 PyUnicode_GET_LENGTH(string), keepends);
10301 break;
10302 case PyUnicode_4BYTE_KIND:
10303 list = ucs4lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010304 string, PyUnicode_4BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010305 PyUnicode_GET_LENGTH(string), keepends);
10306 break;
10307 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010308 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010309 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010310 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010311}
10312
Alexander Belopolsky40018472011-02-26 01:02:56 +000010313static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010314split(PyObject *self,
10315 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010316 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010317{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010318 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010319 void *buf1, *buf2;
10320 Py_ssize_t len1, len2;
10321 PyObject* out;
10322
Guido van Rossumd57fd912000-03-10 22:53:23 +000010323 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010324 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010325
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010326 if (PyUnicode_READY(self) == -1)
10327 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010328
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010329 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010330 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010331 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010332 if (PyUnicode_IS_ASCII(self))
10333 return asciilib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010334 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010335 PyUnicode_GET_LENGTH(self), maxcount
10336 );
10337 else
10338 return ucs1lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010339 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010340 PyUnicode_GET_LENGTH(self), maxcount
10341 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010342 case PyUnicode_2BYTE_KIND:
10343 return ucs2lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010344 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010345 PyUnicode_GET_LENGTH(self), maxcount
10346 );
10347 case PyUnicode_4BYTE_KIND:
10348 return ucs4lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010349 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010350 PyUnicode_GET_LENGTH(self), maxcount
10351 );
10352 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010353 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010354 }
10355
10356 if (PyUnicode_READY(substring) == -1)
10357 return NULL;
10358
10359 kind1 = PyUnicode_KIND(self);
10360 kind2 = PyUnicode_KIND(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010361 len1 = PyUnicode_GET_LENGTH(self);
10362 len2 = PyUnicode_GET_LENGTH(substring);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010363 if (kind1 < kind2 || len1 < len2) {
10364 out = PyList_New(1);
10365 if (out == NULL)
10366 return NULL;
10367 Py_INCREF(self);
10368 PyList_SET_ITEM(out, 0, self);
10369 return out;
10370 }
10371 buf1 = PyUnicode_DATA(self);
10372 buf2 = PyUnicode_DATA(substring);
10373 if (kind2 != kind1) {
10374 buf2 = _PyUnicode_AsKind(substring, kind1);
10375 if (!buf2)
10376 return NULL;
10377 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010378
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010379 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010380 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010381 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10382 out = asciilib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010383 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010384 else
10385 out = ucs1lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010386 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010387 break;
10388 case PyUnicode_2BYTE_KIND:
10389 out = ucs2lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010390 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010391 break;
10392 case PyUnicode_4BYTE_KIND:
10393 out = ucs4lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010394 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010395 break;
10396 default:
10397 out = NULL;
10398 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010399 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010400 PyMem_Free(buf2);
10401 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010402}
10403
Alexander Belopolsky40018472011-02-26 01:02:56 +000010404static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010405rsplit(PyObject *self,
10406 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010407 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010408{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010409 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010410 void *buf1, *buf2;
10411 Py_ssize_t len1, len2;
10412 PyObject* out;
10413
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010414 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010415 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010416
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010417 if (PyUnicode_READY(self) == -1)
10418 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010419
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010420 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010421 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010422 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010423 if (PyUnicode_IS_ASCII(self))
10424 return asciilib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010425 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010426 PyUnicode_GET_LENGTH(self), maxcount
10427 );
10428 else
10429 return ucs1lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010430 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010431 PyUnicode_GET_LENGTH(self), maxcount
10432 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010433 case PyUnicode_2BYTE_KIND:
10434 return ucs2lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010435 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010436 PyUnicode_GET_LENGTH(self), maxcount
10437 );
10438 case PyUnicode_4BYTE_KIND:
10439 return ucs4lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010440 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010441 PyUnicode_GET_LENGTH(self), maxcount
10442 );
10443 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010444 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010445 }
10446
10447 if (PyUnicode_READY(substring) == -1)
10448 return NULL;
10449
10450 kind1 = PyUnicode_KIND(self);
10451 kind2 = PyUnicode_KIND(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010452 len1 = PyUnicode_GET_LENGTH(self);
10453 len2 = PyUnicode_GET_LENGTH(substring);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010454 if (kind1 < kind2 || len1 < len2) {
10455 out = PyList_New(1);
10456 if (out == NULL)
10457 return NULL;
10458 Py_INCREF(self);
10459 PyList_SET_ITEM(out, 0, self);
10460 return out;
10461 }
10462 buf1 = PyUnicode_DATA(self);
10463 buf2 = PyUnicode_DATA(substring);
10464 if (kind2 != kind1) {
10465 buf2 = _PyUnicode_AsKind(substring, kind1);
10466 if (!buf2)
10467 return NULL;
10468 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010469
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010470 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010471 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010472 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10473 out = asciilib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010474 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010475 else
10476 out = ucs1lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010477 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010478 break;
10479 case PyUnicode_2BYTE_KIND:
10480 out = ucs2lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010481 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010482 break;
10483 case PyUnicode_4BYTE_KIND:
10484 out = ucs4lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010485 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010486 break;
10487 default:
10488 out = NULL;
10489 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010490 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010491 PyMem_Free(buf2);
10492 return out;
10493}
10494
10495static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010496anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1,
10497 PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010498{
Benjamin Petersonead6b532011-12-20 17:23:42 -060010499 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010500 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010501 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
10502 return asciilib_find(buf1, len1, buf2, len2, offset);
10503 else
10504 return ucs1lib_find(buf1, len1, buf2, len2, offset);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010505 case PyUnicode_2BYTE_KIND:
10506 return ucs2lib_find(buf1, len1, buf2, len2, offset);
10507 case PyUnicode_4BYTE_KIND:
10508 return ucs4lib_find(buf1, len1, buf2, len2, offset);
10509 }
Barry Warsawb2e57942017-09-14 18:13:16 -070010510 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010511}
10512
10513static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010514anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen,
10515 PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010516{
Benjamin Petersonc0b95d12011-12-20 17:24:05 -060010517 switch (kind) {
10518 case PyUnicode_1BYTE_KIND:
10519 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
10520 return asciilib_count(sbuf, slen, buf1, len1, maxcount);
10521 else
10522 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
10523 case PyUnicode_2BYTE_KIND:
10524 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
10525 case PyUnicode_4BYTE_KIND:
10526 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
10527 }
Barry Warsawb2e57942017-09-14 18:13:16 -070010528 Py_UNREACHABLE();
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010529}
10530
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010531static void
10532replace_1char_inplace(PyObject *u, Py_ssize_t pos,
10533 Py_UCS4 u1, Py_UCS4 u2, Py_ssize_t maxcount)
10534{
10535 int kind = PyUnicode_KIND(u);
10536 void *data = PyUnicode_DATA(u);
10537 Py_ssize_t len = PyUnicode_GET_LENGTH(u);
10538 if (kind == PyUnicode_1BYTE_KIND) {
10539 ucs1lib_replace_1char_inplace((Py_UCS1 *)data + pos,
10540 (Py_UCS1 *)data + len,
10541 u1, u2, maxcount);
10542 }
10543 else if (kind == PyUnicode_2BYTE_KIND) {
10544 ucs2lib_replace_1char_inplace((Py_UCS2 *)data + pos,
10545 (Py_UCS2 *)data + len,
10546 u1, u2, maxcount);
10547 }
10548 else {
10549 assert(kind == PyUnicode_4BYTE_KIND);
10550 ucs4lib_replace_1char_inplace((Py_UCS4 *)data + pos,
10551 (Py_UCS4 *)data + len,
10552 u1, u2, maxcount);
10553 }
10554}
10555
Alexander Belopolsky40018472011-02-26 01:02:56 +000010556static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010557replace(PyObject *self, PyObject *str1,
10558 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010559{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010560 PyObject *u;
10561 char *sbuf = PyUnicode_DATA(self);
10562 char *buf1 = PyUnicode_DATA(str1);
10563 char *buf2 = PyUnicode_DATA(str2);
10564 int srelease = 0, release1 = 0, release2 = 0;
10565 int skind = PyUnicode_KIND(self);
10566 int kind1 = PyUnicode_KIND(str1);
10567 int kind2 = PyUnicode_KIND(str2);
10568 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
10569 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
10570 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010571 int mayshrink;
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010572 Py_UCS4 maxchar, maxchar_str1, maxchar_str2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010573
10574 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010575 maxcount = PY_SSIZE_T_MAX;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010576 else if (maxcount == 0 || slen == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010577 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010578
Victor Stinner59de0ee2011-10-07 10:01:28 +020010579 if (str1 == str2)
10580 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010581
Victor Stinner49a0a212011-10-12 23:46:10 +020010582 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010583 maxchar_str1 = PyUnicode_MAX_CHAR_VALUE(str1);
10584 if (maxchar < maxchar_str1)
10585 /* substring too wide to be present */
10586 goto nothing;
Victor Stinner49a0a212011-10-12 23:46:10 +020010587 maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
10588 /* Replacing str1 with str2 may cause a maxchar reduction in the
10589 result string. */
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010590 mayshrink = (maxchar_str2 < maxchar_str1) && (maxchar == maxchar_str1);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010591 maxchar = Py_MAX(maxchar, maxchar_str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010592
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010593 if (len1 == len2) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010594 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010595 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010596 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010597 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010598 /* replace characters */
Victor Stinner49a0a212011-10-12 23:46:10 +020010599 Py_UCS4 u1, u2;
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010600 Py_ssize_t pos;
Victor Stinnerf6441102011-12-18 02:43:08 +010010601
Victor Stinner69ed0f42013-04-09 21:48:24 +020010602 u1 = PyUnicode_READ(kind1, buf1, 0);
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010603 pos = findchar(sbuf, skind, slen, u1, 1);
Victor Stinnerf6441102011-12-18 02:43:08 +010010604 if (pos < 0)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010605 goto nothing;
Victor Stinner69ed0f42013-04-09 21:48:24 +020010606 u2 = PyUnicode_READ(kind2, buf2, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010607 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010608 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010609 goto error;
Victor Stinnerf6441102011-12-18 02:43:08 +010010610
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010611 _PyUnicode_FastCopyCharacters(u, 0, self, 0, slen);
10612 replace_1char_inplace(u, pos, u1, u2, maxcount);
Victor Stinner49a0a212011-10-12 23:46:10 +020010613 }
10614 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010615 int rkind = skind;
10616 char *res;
Victor Stinnerf6441102011-12-18 02:43:08 +010010617 Py_ssize_t i;
Victor Stinner25a4b292011-10-06 12:31:55 +020010618
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010619 if (kind1 < rkind) {
10620 /* widen substring */
10621 buf1 = _PyUnicode_AsKind(str1, rkind);
10622 if (!buf1) goto error;
10623 release1 = 1;
10624 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010625 i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010626 if (i < 0)
10627 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010628 if (rkind > kind2) {
10629 /* widen replacement */
10630 buf2 = _PyUnicode_AsKind(str2, rkind);
10631 if (!buf2) goto error;
10632 release2 = 1;
10633 }
10634 else if (rkind < kind2) {
10635 /* widen self and buf1 */
10636 rkind = kind2;
10637 if (release1) PyMem_Free(buf1);
Antoine Pitrou6d5ad222012-11-17 23:28:17 +010010638 release1 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010639 sbuf = _PyUnicode_AsKind(self, rkind);
10640 if (!sbuf) goto error;
10641 srelease = 1;
10642 buf1 = _PyUnicode_AsKind(str1, rkind);
10643 if (!buf1) goto error;
10644 release1 = 1;
10645 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010646 u = PyUnicode_New(slen, maxchar);
10647 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010648 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010649 assert(PyUnicode_KIND(u) == rkind);
10650 res = PyUnicode_DATA(u);
Victor Stinner25a4b292011-10-06 12:31:55 +020010651
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010652 memcpy(res, sbuf, rkind * slen);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010653 /* change everything in-place, starting with this one */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010654 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010655 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010656 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010657 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010658
10659 while ( --maxcount > 0) {
Victor Stinnerc3cec782011-10-05 21:24:08 +020010660 i = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010661 sbuf+rkind*i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010662 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010663 if (i == -1)
10664 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010665 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010666 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010667 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010668 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010669 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010670 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010671 }
10672 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010673 Py_ssize_t n, i, j, ires;
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010674 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010675 int rkind = skind;
10676 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010677
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010678 if (kind1 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010679 /* widen substring */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010680 buf1 = _PyUnicode_AsKind(str1, rkind);
10681 if (!buf1) goto error;
10682 release1 = 1;
10683 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010684 n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010685 if (n == 0)
10686 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010687 if (kind2 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010688 /* widen replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010689 buf2 = _PyUnicode_AsKind(str2, rkind);
10690 if (!buf2) goto error;
10691 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010692 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010693 else if (kind2 > rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010694 /* widen self and buf1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010695 rkind = kind2;
10696 sbuf = _PyUnicode_AsKind(self, rkind);
10697 if (!sbuf) goto error;
10698 srelease = 1;
10699 if (release1) PyMem_Free(buf1);
Antoine Pitrou6d5ad222012-11-17 23:28:17 +010010700 release1 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010701 buf1 = _PyUnicode_AsKind(str1, rkind);
10702 if (!buf1) goto error;
10703 release1 = 1;
10704 }
10705 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10706 PyUnicode_GET_LENGTH(str1))); */
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010707 if (len1 < len2 && len2 - len1 > (PY_SSIZE_T_MAX - slen) / n) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010708 PyErr_SetString(PyExc_OverflowError,
10709 "replace string is too long");
10710 goto error;
10711 }
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010712 new_size = slen + n * (len2 - len1);
Victor Stinner49a0a212011-10-12 23:46:10 +020010713 if (new_size == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020010714 _Py_INCREF_UNICODE_EMPTY();
10715 if (!unicode_empty)
10716 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010717 u = unicode_empty;
10718 goto done;
10719 }
Xiang Zhangb0541f42017-01-10 10:52:00 +080010720 if (new_size > (PY_SSIZE_T_MAX / rkind)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010721 PyErr_SetString(PyExc_OverflowError,
10722 "replace string is too long");
10723 goto error;
10724 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010725 u = PyUnicode_New(new_size, maxchar);
10726 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010727 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010728 assert(PyUnicode_KIND(u) == rkind);
10729 res = PyUnicode_DATA(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010730 ires = i = 0;
10731 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010732 while (n-- > 0) {
10733 /* look for next match */
Victor Stinnerc3cec782011-10-05 21:24:08 +020010734 j = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010735 sbuf + rkind * i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010736 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010737 if (j == -1)
10738 break;
10739 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010740 /* copy unchanged part [i:j] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010741 memcpy(res + rkind * ires,
10742 sbuf + rkind * i,
10743 rkind * (j-i));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010744 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010745 }
10746 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010747 if (len2 > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010748 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010749 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010750 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010751 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010752 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010753 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010754 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010755 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010756 /* copy tail [i:] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010757 memcpy(res + rkind * ires,
10758 sbuf + rkind * i,
10759 rkind * (slen-i));
Victor Stinner49a0a212011-10-12 23:46:10 +020010760 }
10761 else {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010762 /* interleave */
10763 while (n > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010764 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010765 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010766 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010767 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010768 if (--n <= 0)
10769 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010770 memcpy(res + rkind * ires,
10771 sbuf + rkind * i,
10772 rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010773 ires++;
10774 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010775 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010776 memcpy(res + rkind * ires,
10777 sbuf + rkind * i,
10778 rkind * (slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +000010779 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010780 }
10781
10782 if (mayshrink) {
Victor Stinner25a4b292011-10-06 12:31:55 +020010783 unicode_adjust_maxchar(&u);
10784 if (u == NULL)
10785 goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010786 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010787
10788 done:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010789 if (srelease)
10790 PyMem_FREE(sbuf);
10791 if (release1)
10792 PyMem_FREE(buf1);
10793 if (release2)
10794 PyMem_FREE(buf2);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010795 assert(_PyUnicode_CheckConsistency(u, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010796 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010797
Benjamin Peterson29060642009-01-31 22:14:21 +000010798 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +000010799 /* nothing to replace; return original string (when possible) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010800 if (srelease)
10801 PyMem_FREE(sbuf);
10802 if (release1)
10803 PyMem_FREE(buf1);
10804 if (release2)
10805 PyMem_FREE(buf2);
Victor Stinnerc4b49542011-12-11 22:44:26 +010010806 return unicode_result_unchanged(self);
10807
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010808 error:
10809 if (srelease && sbuf)
10810 PyMem_FREE(sbuf);
10811 if (release1 && buf1)
10812 PyMem_FREE(buf1);
10813 if (release2 && buf2)
10814 PyMem_FREE(buf2);
10815 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010816}
10817
10818/* --- Unicode Object Methods --------------------------------------------- */
10819
INADA Naoki3ae20562017-01-16 20:41:20 +090010820/*[clinic input]
10821str.title as unicode_title
Guido van Rossumd57fd912000-03-10 22:53:23 +000010822
INADA Naoki3ae20562017-01-16 20:41:20 +090010823Return a version of the string where each word is titlecased.
10824
10825More specifically, words start with uppercased characters and all remaining
10826cased characters have lower case.
10827[clinic start generated code]*/
10828
10829static PyObject *
10830unicode_title_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090010831/*[clinic end generated code: output=c75ae03809574902 input=fa945d669b26e683]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000010832{
Benjamin Petersoneea48462012-01-16 14:28:50 -050010833 if (PyUnicode_READY(self) == -1)
10834 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010835 return case_operation(self, do_title);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010836}
10837
INADA Naoki3ae20562017-01-16 20:41:20 +090010838/*[clinic input]
10839str.capitalize as unicode_capitalize
Guido van Rossumd57fd912000-03-10 22:53:23 +000010840
INADA Naoki3ae20562017-01-16 20:41:20 +090010841Return a capitalized version of the string.
10842
10843More specifically, make the first character have upper case and the rest lower
10844case.
10845[clinic start generated code]*/
10846
10847static PyObject *
10848unicode_capitalize_impl(PyObject *self)
10849/*[clinic end generated code: output=e49a4c333cdb7667 input=f4cbf1016938da6d]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000010850{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010851 if (PyUnicode_READY(self) == -1)
10852 return NULL;
10853 if (PyUnicode_GET_LENGTH(self) == 0)
10854 return unicode_result_unchanged(self);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010855 return case_operation(self, do_capitalize);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010856}
10857
INADA Naoki3ae20562017-01-16 20:41:20 +090010858/*[clinic input]
10859str.casefold as unicode_casefold
10860
10861Return a version of the string suitable for caseless comparisons.
10862[clinic start generated code]*/
Benjamin Petersond5890c82012-01-14 13:23:30 -050010863
10864static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090010865unicode_casefold_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090010866/*[clinic end generated code: output=0120daf657ca40af input=384d66cc2ae30daf]*/
Benjamin Petersond5890c82012-01-14 13:23:30 -050010867{
10868 if (PyUnicode_READY(self) == -1)
10869 return NULL;
10870 if (PyUnicode_IS_ASCII(self))
10871 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010872 return case_operation(self, do_casefold);
Benjamin Petersond5890c82012-01-14 13:23:30 -050010873}
10874
10875
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010876/* Argument converter. Accepts a single Unicode character. */
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010877
10878static int
10879convert_uc(PyObject *obj, void *addr)
10880{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010881 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010882
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010883 if (!PyUnicode_Check(obj)) {
10884 PyErr_Format(PyExc_TypeError,
10885 "The fill character must be a unicode character, "
Victor Stinner998b8062018-09-12 00:23:25 +020010886 "not %.100s", Py_TYPE(obj)->tp_name);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010887 return 0;
10888 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010889 if (PyUnicode_READY(obj) < 0)
10890 return 0;
10891 if (PyUnicode_GET_LENGTH(obj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010892 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010893 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010894 return 0;
10895 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010896 *fillcharloc = PyUnicode_READ_CHAR(obj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010897 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010898}
10899
INADA Naoki3ae20562017-01-16 20:41:20 +090010900/*[clinic input]
10901str.center as unicode_center
10902
10903 width: Py_ssize_t
10904 fillchar: Py_UCS4 = ' '
10905 /
10906
10907Return a centered string of length width.
10908
10909Padding is done using the specified fill character (default is a space).
10910[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000010911
10912static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090010913unicode_center_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
10914/*[clinic end generated code: output=420c8859effc7c0c input=b42b247eb26e6519]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000010915{
Martin v. Löwis18e16552006-02-15 17:27:45 +000010916 Py_ssize_t marg, left;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010917
Benjamin Petersonbac79492012-01-14 13:34:47 -050010918 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010919 return NULL;
10920
Victor Stinnerc4b49542011-12-11 22:44:26 +010010921 if (PyUnicode_GET_LENGTH(self) >= width)
10922 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010923
Victor Stinnerc4b49542011-12-11 22:44:26 +010010924 marg = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010925 left = marg / 2 + (marg & width & 1);
10926
Victor Stinner9310abb2011-10-05 00:59:23 +020010927 return pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010928}
10929
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010930/* This function assumes that str1 and str2 are readied by the caller. */
10931
Marc-André Lemburge5034372000-08-08 08:04:29 +000010932static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010933unicode_compare(PyObject *str1, PyObject *str2)
Marc-André Lemburge5034372000-08-08 08:04:29 +000010934{
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010935#define COMPARE(TYPE1, TYPE2) \
10936 do { \
10937 TYPE1* p1 = (TYPE1 *)data1; \
10938 TYPE2* p2 = (TYPE2 *)data2; \
10939 TYPE1* end = p1 + len; \
10940 Py_UCS4 c1, c2; \
10941 for (; p1 != end; p1++, p2++) { \
10942 c1 = *p1; \
10943 c2 = *p2; \
10944 if (c1 != c2) \
10945 return (c1 < c2) ? -1 : 1; \
10946 } \
10947 } \
10948 while (0)
10949
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010950 int kind1, kind2;
10951 void *data1, *data2;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010952 Py_ssize_t len1, len2, len;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010953
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010954 kind1 = PyUnicode_KIND(str1);
10955 kind2 = PyUnicode_KIND(str2);
10956 data1 = PyUnicode_DATA(str1);
10957 data2 = PyUnicode_DATA(str2);
10958 len1 = PyUnicode_GET_LENGTH(str1);
10959 len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner770e19e2012-10-04 22:59:45 +020010960 len = Py_MIN(len1, len2);
Marc-André Lemburge5034372000-08-08 08:04:29 +000010961
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010962 switch(kind1) {
10963 case PyUnicode_1BYTE_KIND:
10964 {
10965 switch(kind2) {
10966 case PyUnicode_1BYTE_KIND:
10967 {
10968 int cmp = memcmp(data1, data2, len);
10969 /* normalize result of memcmp() into the range [-1; 1] */
10970 if (cmp < 0)
10971 return -1;
10972 if (cmp > 0)
10973 return 1;
10974 break;
Victor Stinner770e19e2012-10-04 22:59:45 +020010975 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010976 case PyUnicode_2BYTE_KIND:
10977 COMPARE(Py_UCS1, Py_UCS2);
10978 break;
10979 case PyUnicode_4BYTE_KIND:
10980 COMPARE(Py_UCS1, Py_UCS4);
10981 break;
10982 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010983 Py_UNREACHABLE();
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010984 }
10985 break;
10986 }
10987 case PyUnicode_2BYTE_KIND:
10988 {
10989 switch(kind2) {
10990 case PyUnicode_1BYTE_KIND:
10991 COMPARE(Py_UCS2, Py_UCS1);
10992 break;
10993 case PyUnicode_2BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020010994 {
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010995 COMPARE(Py_UCS2, Py_UCS2);
10996 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020010997 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010998 case PyUnicode_4BYTE_KIND:
10999 COMPARE(Py_UCS2, Py_UCS4);
11000 break;
11001 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011002 Py_UNREACHABLE();
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011003 }
11004 break;
11005 }
11006 case PyUnicode_4BYTE_KIND:
11007 {
11008 switch(kind2) {
11009 case PyUnicode_1BYTE_KIND:
11010 COMPARE(Py_UCS4, Py_UCS1);
11011 break;
11012 case PyUnicode_2BYTE_KIND:
11013 COMPARE(Py_UCS4, Py_UCS2);
11014 break;
11015 case PyUnicode_4BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020011016 {
11017#if defined(HAVE_WMEMCMP) && SIZEOF_WCHAR_T == 4
11018 int cmp = wmemcmp((wchar_t *)data1, (wchar_t *)data2, len);
11019 /* normalize result of wmemcmp() into the range [-1; 1] */
11020 if (cmp < 0)
11021 return -1;
11022 if (cmp > 0)
11023 return 1;
11024#else
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011025 COMPARE(Py_UCS4, Py_UCS4);
Victor Stinnercd777ea2013-04-08 22:43:44 +020011026#endif
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011027 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020011028 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011029 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011030 Py_UNREACHABLE();
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011031 }
11032 break;
11033 }
11034 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011035 Py_UNREACHABLE();
Marc-André Lemburge5034372000-08-08 08:04:29 +000011036 }
11037
Victor Stinner770e19e2012-10-04 22:59:45 +020011038 if (len1 == len2)
11039 return 0;
11040 if (len1 < len2)
11041 return -1;
11042 else
11043 return 1;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011044
11045#undef COMPARE
Marc-André Lemburge5034372000-08-08 08:04:29 +000011046}
11047
Benjamin Peterson621b4302016-09-09 13:54:34 -070011048static int
Victor Stinnere5567ad2012-10-23 02:48:49 +020011049unicode_compare_eq(PyObject *str1, PyObject *str2)
11050{
11051 int kind;
11052 void *data1, *data2;
11053 Py_ssize_t len;
11054 int cmp;
11055
Victor Stinnere5567ad2012-10-23 02:48:49 +020011056 len = PyUnicode_GET_LENGTH(str1);
11057 if (PyUnicode_GET_LENGTH(str2) != len)
11058 return 0;
11059 kind = PyUnicode_KIND(str1);
11060 if (PyUnicode_KIND(str2) != kind)
11061 return 0;
11062 data1 = PyUnicode_DATA(str1);
11063 data2 = PyUnicode_DATA(str2);
11064
11065 cmp = memcmp(data1, data2, len * kind);
11066 return (cmp == 0);
11067}
11068
11069
Alexander Belopolsky40018472011-02-26 01:02:56 +000011070int
11071PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011072{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011073 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
11074 if (PyUnicode_READY(left) == -1 ||
11075 PyUnicode_READY(right) == -1)
11076 return -1;
Victor Stinnerf0c7b2a2013-11-04 11:27:14 +010011077
11078 /* a string is equal to itself */
11079 if (left == right)
11080 return 0;
11081
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011082 return unicode_compare(left, right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011083 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +000011084 PyErr_Format(PyExc_TypeError,
11085 "Can't compare %.100s and %.100s",
11086 left->ob_type->tp_name,
11087 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011088 return -1;
11089}
11090
Martin v. Löwis5b222132007-06-10 09:51:05 +000011091int
11092PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
11093{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011094 Py_ssize_t i;
11095 int kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011096 Py_UCS4 chr;
Serhiy Storchaka419967b2016-12-06 00:13:34 +020011097 const unsigned char *ustr = (const unsigned char *)str;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011098
Victor Stinner910337b2011-10-03 03:20:16 +020011099 assert(_PyUnicode_CHECK(uni));
Serhiy Storchaka419967b2016-12-06 00:13:34 +020011100 if (!PyUnicode_IS_READY(uni)) {
11101 const wchar_t *ws = _PyUnicode_WSTR(uni);
11102 /* Compare Unicode string and source character set string */
11103 for (i = 0; (chr = ws[i]) && ustr[i]; i++) {
11104 if (chr != ustr[i])
11105 return (chr < ustr[i]) ? -1 : 1;
11106 }
11107 /* This check keeps Python strings that end in '\0' from comparing equal
11108 to C strings identical up to that point. */
11109 if (_PyUnicode_WSTR_LENGTH(uni) != i || chr)
11110 return 1; /* uni is longer */
11111 if (ustr[i])
11112 return -1; /* str is longer */
11113 return 0;
11114 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011115 kind = PyUnicode_KIND(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010011116 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnera6b9b072013-10-30 18:27:13 +010011117 const void *data = PyUnicode_1BYTE_DATA(uni);
Victor Stinnere1b15922013-11-03 13:53:12 +010011118 size_t len1 = (size_t)PyUnicode_GET_LENGTH(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010011119 size_t len, len2 = strlen(str);
11120 int cmp;
11121
11122 len = Py_MIN(len1, len2);
11123 cmp = memcmp(data, str, len);
Victor Stinner21ea21e2013-11-04 11:28:26 +010011124 if (cmp != 0) {
11125 if (cmp < 0)
11126 return -1;
11127 else
11128 return 1;
11129 }
Victor Stinner602f7cf2013-10-29 23:31:50 +010011130 if (len1 > len2)
11131 return 1; /* uni is longer */
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011132 if (len1 < len2)
Victor Stinner602f7cf2013-10-29 23:31:50 +010011133 return -1; /* str is longer */
11134 return 0;
11135 }
11136 else {
11137 void *data = PyUnicode_DATA(uni);
11138 /* Compare Unicode string and source character set string */
11139 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
Victor Stinner12174a52014-08-15 23:17:38 +020011140 if (chr != (unsigned char)str[i])
Victor Stinner602f7cf2013-10-29 23:31:50 +010011141 return (chr < (unsigned char)(str[i])) ? -1 : 1;
11142 /* This check keeps Python strings that end in '\0' from comparing equal
11143 to C strings identical up to that point. */
11144 if (PyUnicode_GET_LENGTH(uni) != i || chr)
11145 return 1; /* uni is longer */
11146 if (str[i])
11147 return -1; /* str is longer */
11148 return 0;
11149 }
Martin v. Löwis5b222132007-06-10 09:51:05 +000011150}
11151
Serhiy Storchakaf4934ea2016-11-16 10:17:58 +020011152static int
11153non_ready_unicode_equal_to_ascii_string(PyObject *unicode, const char *str)
11154{
11155 size_t i, len;
11156 const wchar_t *p;
11157 len = (size_t)_PyUnicode_WSTR_LENGTH(unicode);
11158 if (strlen(str) != len)
11159 return 0;
11160 p = _PyUnicode_WSTR(unicode);
11161 assert(p);
11162 for (i = 0; i < len; i++) {
11163 unsigned char c = (unsigned char)str[i];
Serhiy Storchaka292dd1b2016-11-16 16:12:34 +020011164 if (c >= 128 || p[i] != (wchar_t)c)
Serhiy Storchakaf4934ea2016-11-16 10:17:58 +020011165 return 0;
11166 }
11167 return 1;
11168}
11169
11170int
11171_PyUnicode_EqualToASCIIString(PyObject *unicode, const char *str)
11172{
11173 size_t len;
11174 assert(_PyUnicode_CHECK(unicode));
Serhiy Storchakaa83a6a32016-11-16 20:02:44 +020011175 assert(str);
11176#ifndef NDEBUG
11177 for (const char *p = str; *p; p++) {
11178 assert((unsigned char)*p < 128);
11179 }
11180#endif
Serhiy Storchakaf4934ea2016-11-16 10:17:58 +020011181 if (PyUnicode_READY(unicode) == -1) {
11182 /* Memory error or bad data */
11183 PyErr_Clear();
11184 return non_ready_unicode_equal_to_ascii_string(unicode, str);
11185 }
11186 if (!PyUnicode_IS_ASCII(unicode))
11187 return 0;
11188 len = (size_t)PyUnicode_GET_LENGTH(unicode);
11189 return strlen(str) == len &&
11190 memcmp(PyUnicode_1BYTE_DATA(unicode), str, len) == 0;
11191}
11192
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011193int
11194_PyUnicode_EqualToASCIIId(PyObject *left, _Py_Identifier *right)
11195{
11196 PyObject *right_uni;
11197 Py_hash_t hash;
11198
11199 assert(_PyUnicode_CHECK(left));
11200 assert(right->string);
Serhiy Storchakaa83a6a32016-11-16 20:02:44 +020011201#ifndef NDEBUG
11202 for (const char *p = right->string; *p; p++) {
11203 assert((unsigned char)*p < 128);
11204 }
11205#endif
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011206
11207 if (PyUnicode_READY(left) == -1) {
11208 /* memory error or bad data */
11209 PyErr_Clear();
11210 return non_ready_unicode_equal_to_ascii_string(left, right->string);
11211 }
11212
11213 if (!PyUnicode_IS_ASCII(left))
11214 return 0;
11215
11216 right_uni = _PyUnicode_FromId(right); /* borrowed */
11217 if (right_uni == NULL) {
11218 /* memory error or bad data */
11219 PyErr_Clear();
11220 return _PyUnicode_EqualToASCIIString(left, right->string);
11221 }
11222
11223 if (left == right_uni)
11224 return 1;
11225
11226 if (PyUnicode_CHECK_INTERNED(left))
11227 return 0;
11228
INADA Naoki7cc95f52018-01-28 02:07:09 +090011229 assert(_PyUnicode_HASH(right_uni) != -1);
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011230 hash = _PyUnicode_HASH(left);
11231 if (hash != -1 && hash != _PyUnicode_HASH(right_uni))
11232 return 0;
11233
11234 return unicode_compare_eq(left, right_uni);
11235}
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000011236
Alexander Belopolsky40018472011-02-26 01:02:56 +000011237PyObject *
11238PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011239{
11240 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011241
Victor Stinnere5567ad2012-10-23 02:48:49 +020011242 if (!PyUnicode_Check(left) || !PyUnicode_Check(right))
11243 Py_RETURN_NOTIMPLEMENTED;
11244
11245 if (PyUnicode_READY(left) == -1 ||
11246 PyUnicode_READY(right) == -1)
11247 return NULL;
11248
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010011249 if (left == right) {
11250 switch (op) {
11251 case Py_EQ:
11252 case Py_LE:
11253 case Py_GE:
11254 /* a string is equal to itself */
stratakise8b19652017-11-02 11:32:54 +010011255 Py_RETURN_TRUE;
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010011256 case Py_NE:
11257 case Py_LT:
11258 case Py_GT:
stratakise8b19652017-11-02 11:32:54 +010011259 Py_RETURN_FALSE;
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010011260 default:
11261 PyErr_BadArgument();
11262 return NULL;
11263 }
11264 }
11265 else if (op == Py_EQ || op == Py_NE) {
Victor Stinnere5567ad2012-10-23 02:48:49 +020011266 result = unicode_compare_eq(left, right);
Victor Stinnerc8bc5372013-11-04 11:08:10 +010011267 result ^= (op == Py_NE);
stratakise8b19652017-11-02 11:32:54 +010011268 return PyBool_FromLong(result);
Victor Stinnere5567ad2012-10-23 02:48:49 +020011269 }
11270 else {
Victor Stinner90db9c42012-10-04 21:53:50 +020011271 result = unicode_compare(left, right);
stratakise8b19652017-11-02 11:32:54 +010011272 Py_RETURN_RICHCOMPARE(result, 0, op);
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011273 }
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011274}
11275
Alexander Belopolsky40018472011-02-26 01:02:56 +000011276int
Raymond Hettingerac2ef652015-07-04 16:04:44 -070011277_PyUnicode_EQ(PyObject *aa, PyObject *bb)
11278{
11279 return unicode_eq(aa, bb);
11280}
11281
11282int
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011283PyUnicode_Contains(PyObject *str, PyObject *substr)
Guido van Rossum403d68b2000-03-13 15:55:09 +000011284{
Victor Stinner77282cb2013-04-14 19:22:47 +020011285 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011286 void *buf1, *buf2;
11287 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011288 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011289
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011290 if (!PyUnicode_Check(substr)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011291 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020011292 "'in <string>' requires string as left operand, not %.100s",
11293 Py_TYPE(substr)->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011294 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011295 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011296 if (PyUnicode_READY(substr) == -1)
Thomas Wouters477c8d52006-05-27 19:21:47 +000011297 return -1;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011298 if (ensure_unicode(str) < 0)
11299 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011300
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011301 kind1 = PyUnicode_KIND(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011302 kind2 = PyUnicode_KIND(substr);
11303 if (kind1 < kind2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011304 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011305 len1 = PyUnicode_GET_LENGTH(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011306 len2 = PyUnicode_GET_LENGTH(substr);
11307 if (len1 < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011308 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011309 buf1 = PyUnicode_DATA(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011310 buf2 = PyUnicode_DATA(substr);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011311 if (len2 == 1) {
11312 Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
11313 result = findchar((const char *)buf1, kind1, len1, ch, 1) != -1;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011314 return result;
11315 }
11316 if (kind2 != kind1) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011317 buf2 = _PyUnicode_AsKind(substr, kind1);
11318 if (!buf2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011319 return -1;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011320 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011321
Victor Stinner77282cb2013-04-14 19:22:47 +020011322 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011323 case PyUnicode_1BYTE_KIND:
11324 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
11325 break;
11326 case PyUnicode_2BYTE_KIND:
11327 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
11328 break;
11329 case PyUnicode_4BYTE_KIND:
11330 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
11331 break;
11332 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011333 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011334 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011335
Victor Stinner77282cb2013-04-14 19:22:47 +020011336 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011337 PyMem_Free(buf2);
11338
Guido van Rossum403d68b2000-03-13 15:55:09 +000011339 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011340}
11341
Guido van Rossumd57fd912000-03-10 22:53:23 +000011342/* Concat to string or Unicode object giving a new Unicode object. */
11343
Alexander Belopolsky40018472011-02-26 01:02:56 +000011344PyObject *
11345PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011346{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011347 PyObject *result;
Victor Stinner127226b2011-10-13 01:12:34 +020011348 Py_UCS4 maxchar, maxchar2;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011349 Py_ssize_t left_len, right_len, new_len;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011350
Serhiy Storchaka004e03f2017-03-19 19:38:42 +020011351 if (ensure_unicode(left) < 0)
11352 return NULL;
11353
11354 if (!PyUnicode_Check(right)) {
11355 PyErr_Format(PyExc_TypeError,
11356 "can only concatenate str (not \"%.200s\") to str",
11357 right->ob_type->tp_name);
11358 return NULL;
11359 }
11360 if (PyUnicode_READY(right) < 0)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011361 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011362
11363 /* Shortcuts */
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011364 if (left == unicode_empty)
11365 return PyUnicode_FromObject(right);
11366 if (right == unicode_empty)
11367 return PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011368
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011369 left_len = PyUnicode_GET_LENGTH(left);
11370 right_len = PyUnicode_GET_LENGTH(right);
11371 if (left_len > PY_SSIZE_T_MAX - right_len) {
Victor Stinner488fa492011-12-12 00:01:39 +010011372 PyErr_SetString(PyExc_OverflowError,
11373 "strings are too large to concat");
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011374 return NULL;
Victor Stinner488fa492011-12-12 00:01:39 +010011375 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011376 new_len = left_len + right_len;
Victor Stinner488fa492011-12-12 00:01:39 +010011377
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011378 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11379 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Benjamin Peterson7e303732013-06-10 09:19:46 -070011380 maxchar = Py_MAX(maxchar, maxchar2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011381
Guido van Rossumd57fd912000-03-10 22:53:23 +000011382 /* Concat the two Unicode strings */
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011383 result = PyUnicode_New(new_len, maxchar);
11384 if (result == NULL)
11385 return NULL;
11386 _PyUnicode_FastCopyCharacters(result, 0, left, 0, left_len);
11387 _PyUnicode_FastCopyCharacters(result, left_len, right, 0, right_len);
11388 assert(_PyUnicode_CheckConsistency(result, 1));
11389 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011390}
11391
Walter Dörwald1ab83302007-05-18 17:15:44 +000011392void
Victor Stinner23e56682011-10-03 03:54:37 +020011393PyUnicode_Append(PyObject **p_left, PyObject *right)
Walter Dörwald1ab83302007-05-18 17:15:44 +000011394{
Victor Stinner23e56682011-10-03 03:54:37 +020011395 PyObject *left, *res;
Victor Stinner488fa492011-12-12 00:01:39 +010011396 Py_UCS4 maxchar, maxchar2;
11397 Py_ssize_t left_len, right_len, new_len;
Victor Stinner23e56682011-10-03 03:54:37 +020011398
11399 if (p_left == NULL) {
11400 if (!PyErr_Occurred())
11401 PyErr_BadInternalCall();
Benjamin Peterson14339b62009-01-31 16:36:08 +000011402 return;
11403 }
Victor Stinner23e56682011-10-03 03:54:37 +020011404 left = *p_left;
Victor Stinnerf0335102013-04-14 19:13:03 +020011405 if (right == NULL || left == NULL
11406 || !PyUnicode_Check(left) || !PyUnicode_Check(right)) {
Victor Stinner23e56682011-10-03 03:54:37 +020011407 if (!PyErr_Occurred())
11408 PyErr_BadInternalCall();
11409 goto error;
11410 }
11411
Benjamin Petersonbac79492012-01-14 13:34:47 -050011412 if (PyUnicode_READY(left) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011413 goto error;
Benjamin Petersonbac79492012-01-14 13:34:47 -050011414 if (PyUnicode_READY(right) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011415 goto error;
11416
Victor Stinner488fa492011-12-12 00:01:39 +010011417 /* Shortcuts */
11418 if (left == unicode_empty) {
11419 Py_DECREF(left);
11420 Py_INCREF(right);
11421 *p_left = right;
11422 return;
11423 }
11424 if (right == unicode_empty)
11425 return;
11426
11427 left_len = PyUnicode_GET_LENGTH(left);
11428 right_len = PyUnicode_GET_LENGTH(right);
11429 if (left_len > PY_SSIZE_T_MAX - right_len) {
11430 PyErr_SetString(PyExc_OverflowError,
11431 "strings are too large to concat");
11432 goto error;
11433 }
11434 new_len = left_len + right_len;
11435
11436 if (unicode_modifiable(left)
11437 && PyUnicode_CheckExact(right)
11438 && PyUnicode_KIND(right) <= PyUnicode_KIND(left)
Victor Stinnerb0923652011-10-04 01:17:31 +020011439 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
11440 to change the structure size, but characters are stored just after
Georg Brandl7597add2011-10-05 16:36:47 +020011441 the structure, and so it requires to move all characters which is
Victor Stinnerb0923652011-10-04 01:17:31 +020011442 not so different than duplicating the string. */
Victor Stinner488fa492011-12-12 00:01:39 +010011443 && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
11444 {
11445 /* append inplace */
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011446 if (unicode_resize(p_left, new_len) != 0)
Victor Stinner488fa492011-12-12 00:01:39 +010011447 goto error;
Victor Stinnerf0335102013-04-14 19:13:03 +020011448
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011449 /* copy 'right' into the newly allocated area of 'left' */
11450 _PyUnicode_FastCopyCharacters(*p_left, left_len, right, 0, right_len);
Victor Stinner23e56682011-10-03 03:54:37 +020011451 }
Victor Stinner488fa492011-12-12 00:01:39 +010011452 else {
11453 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11454 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Benjamin Peterson7e303732013-06-10 09:19:46 -070011455 maxchar = Py_MAX(maxchar, maxchar2);
Victor Stinner23e56682011-10-03 03:54:37 +020011456
Victor Stinner488fa492011-12-12 00:01:39 +010011457 /* Concat the two Unicode strings */
11458 res = PyUnicode_New(new_len, maxchar);
11459 if (res == NULL)
11460 goto error;
Victor Stinnerd3f08822012-05-29 12:57:52 +020011461 _PyUnicode_FastCopyCharacters(res, 0, left, 0, left_len);
11462 _PyUnicode_FastCopyCharacters(res, left_len, right, 0, right_len);
Victor Stinner488fa492011-12-12 00:01:39 +010011463 Py_DECREF(left);
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011464 *p_left = res;
Victor Stinner488fa492011-12-12 00:01:39 +010011465 }
11466 assert(_PyUnicode_CheckConsistency(*p_left, 1));
Victor Stinner23e56682011-10-03 03:54:37 +020011467 return;
11468
11469error:
Victor Stinner488fa492011-12-12 00:01:39 +010011470 Py_CLEAR(*p_left);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011471}
11472
11473void
11474PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
11475{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011476 PyUnicode_Append(pleft, right);
11477 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011478}
11479
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011480/*
11481Wraps stringlib_parse_args_finds() and additionally ensures that the
11482first argument is a unicode object.
11483*/
11484
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070011485static inline int
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011486parse_args_finds_unicode(const char * function_name, PyObject *args,
11487 PyObject **substring,
11488 Py_ssize_t *start, Py_ssize_t *end)
11489{
11490 if(stringlib_parse_args_finds(function_name, args, substring,
11491 start, end)) {
11492 if (ensure_unicode(*substring) < 0)
11493 return 0;
11494 return 1;
11495 }
11496 return 0;
11497}
11498
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011499PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011500 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011501\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011502Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011503string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011504interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011505
11506static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011507unicode_count(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011508{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011509 PyObject *substring = NULL; /* initialize to fix a compiler warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +000011510 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011511 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011512 PyObject *result;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011513 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011514 void *buf1, *buf2;
11515 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011516
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011517 if (!parse_args_finds_unicode("count", args, &substring, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000011518 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000011519
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011520 kind1 = PyUnicode_KIND(self);
11521 kind2 = PyUnicode_KIND(substring);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011522 if (kind1 < kind2)
Benjamin Petersonb63f49f2012-05-03 18:31:07 -040011523 return PyLong_FromLong(0);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011524
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011525 len1 = PyUnicode_GET_LENGTH(self);
11526 len2 = PyUnicode_GET_LENGTH(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011527 ADJUST_INDICES(start, end, len1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011528 if (end - start < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011529 return PyLong_FromLong(0);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011530
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011531 buf1 = PyUnicode_DATA(self);
11532 buf2 = PyUnicode_DATA(substring);
11533 if (kind2 != kind1) {
11534 buf2 = _PyUnicode_AsKind(substring, kind1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011535 if (!buf2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011536 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011537 }
11538 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011539 case PyUnicode_1BYTE_KIND:
11540 iresult = ucs1lib_count(
11541 ((Py_UCS1*)buf1) + start, end - start,
11542 buf2, len2, PY_SSIZE_T_MAX
11543 );
11544 break;
11545 case PyUnicode_2BYTE_KIND:
11546 iresult = ucs2lib_count(
11547 ((Py_UCS2*)buf1) + start, end - start,
11548 buf2, len2, PY_SSIZE_T_MAX
11549 );
11550 break;
11551 case PyUnicode_4BYTE_KIND:
11552 iresult = ucs4lib_count(
11553 ((Py_UCS4*)buf1) + start, end - start,
11554 buf2, len2, PY_SSIZE_T_MAX
11555 );
11556 break;
11557 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011558 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011559 }
11560
11561 result = PyLong_FromSsize_t(iresult);
11562
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011563 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011564 PyMem_Free(buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011565
Guido van Rossumd57fd912000-03-10 22:53:23 +000011566 return result;
11567}
11568
INADA Naoki3ae20562017-01-16 20:41:20 +090011569/*[clinic input]
11570str.encode as unicode_encode
11571
11572 encoding: str(c_default="NULL") = 'utf-8'
11573 The encoding in which to encode the string.
11574 errors: str(c_default="NULL") = 'strict'
11575 The error handling scheme to use for encoding errors.
11576 The default is 'strict' meaning that encoding errors raise a
11577 UnicodeEncodeError. Other possible values are 'ignore', 'replace' and
11578 'xmlcharrefreplace' as well as any other name registered with
11579 codecs.register_error that can handle UnicodeEncodeErrors.
11580
11581Encode the string using the codec registered for encoding.
11582[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011583
11584static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090011585unicode_encode_impl(PyObject *self, const char *encoding, const char *errors)
INADA Naoki15f94592017-01-16 21:49:13 +090011586/*[clinic end generated code: output=bf78b6e2a9470e3c input=f0a9eb293d08fe02]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011587{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011588 return PyUnicode_AsEncodedString(self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +000011589}
11590
INADA Naoki3ae20562017-01-16 20:41:20 +090011591/*[clinic input]
11592str.expandtabs as unicode_expandtabs
Guido van Rossumd57fd912000-03-10 22:53:23 +000011593
INADA Naoki3ae20562017-01-16 20:41:20 +090011594 tabsize: int = 8
11595
11596Return a copy where all tab characters are expanded using spaces.
11597
11598If tabsize is not given, a tab size of 8 characters is assumed.
11599[clinic start generated code]*/
11600
11601static PyObject *
11602unicode_expandtabs_impl(PyObject *self, int tabsize)
11603/*[clinic end generated code: output=3457c5dcee26928f input=8a01914034af4c85]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011604{
Antoine Pitroue71d5742011-10-04 15:55:09 +020011605 Py_ssize_t i, j, line_pos, src_len, incr;
11606 Py_UCS4 ch;
11607 PyObject *u;
11608 void *src_data, *dest_data;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011609 int kind;
Antoine Pitroue19aa382011-10-04 16:04:01 +020011610 int found;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011611
Antoine Pitrou22425222011-10-04 19:10:51 +020011612 if (PyUnicode_READY(self) == -1)
11613 return NULL;
11614
Thomas Wouters7e474022000-07-16 12:04:32 +000011615 /* First pass: determine size of output string */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011616 src_len = PyUnicode_GET_LENGTH(self);
11617 i = j = line_pos = 0;
11618 kind = PyUnicode_KIND(self);
11619 src_data = PyUnicode_DATA(self);
Antoine Pitroue19aa382011-10-04 16:04:01 +020011620 found = 0;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011621 for (; i < src_len; i++) {
11622 ch = PyUnicode_READ(kind, src_data, i);
11623 if (ch == '\t') {
Antoine Pitroue19aa382011-10-04 16:04:01 +020011624 found = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000011625 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011626 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
Benjamin Peterson29060642009-01-31 22:14:21 +000011627 if (j > PY_SSIZE_T_MAX - incr)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011628 goto overflow;
11629 line_pos += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011630 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011631 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011632 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011633 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000011634 if (j > PY_SSIZE_T_MAX - 1)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011635 goto overflow;
11636 line_pos++;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011637 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011638 if (ch == '\n' || ch == '\r')
11639 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011640 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011641 }
Victor Stinnerc4b49542011-12-11 22:44:26 +010011642 if (!found)
11643 return unicode_result_unchanged(self);
Guido van Rossumcd16bf62007-06-13 18:07:49 +000011644
Guido van Rossumd57fd912000-03-10 22:53:23 +000011645 /* Second pass: create output string and fill it */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011646 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011647 if (!u)
11648 return NULL;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011649 dest_data = PyUnicode_DATA(u);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011650
Antoine Pitroue71d5742011-10-04 15:55:09 +020011651 i = j = line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011652
Antoine Pitroue71d5742011-10-04 15:55:09 +020011653 for (; i < src_len; i++) {
11654 ch = PyUnicode_READ(kind, src_data, i);
11655 if (ch == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000011656 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011657 incr = tabsize - (line_pos % tabsize);
11658 line_pos += incr;
Victor Stinner59423e32018-11-26 13:40:01 +010011659 unicode_fill(kind, dest_data, ' ', j, incr);
Victor Stinnerda79e632012-02-22 13:37:04 +010011660 j += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011661 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011662 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011663 else {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011664 line_pos++;
11665 PyUnicode_WRITE(kind, dest_data, j, ch);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011666 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011667 if (ch == '\n' || ch == '\r')
11668 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011669 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011670 }
11671 assert (j == PyUnicode_GET_LENGTH(u));
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010011672 return unicode_result(u);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011673
Antoine Pitroue71d5742011-10-04 15:55:09 +020011674 overflow:
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011675 PyErr_SetString(PyExc_OverflowError, "new string is too long");
11676 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011677}
11678
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011679PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011680 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011681\n\
11682Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080011683such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011684arguments start and end are interpreted as in slice notation.\n\
11685\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011686Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011687
11688static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011689unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011690{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011691 /* initialize variables to prevent gcc warning */
11692 PyObject *substring = NULL;
11693 Py_ssize_t start = 0;
11694 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011695 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011696
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011697 if (!parse_args_finds_unicode("find", args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011698 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011699
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011700 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011701 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011702
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011703 result = any_find_slice(self, substring, start, end, 1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011704
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011705 if (result == -2)
11706 return NULL;
11707
Christian Heimes217cfd12007-12-02 14:31:20 +000011708 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011709}
11710
11711static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011712unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011713{
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011714 void *data;
11715 enum PyUnicode_Kind kind;
11716 Py_UCS4 ch;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011717
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +030011718 if (!PyUnicode_Check(self)) {
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011719 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011720 return NULL;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011721 }
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +030011722 if (PyUnicode_READY(self) == -1) {
11723 return NULL;
11724 }
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011725 if (index < 0 || index >= PyUnicode_GET_LENGTH(self)) {
11726 PyErr_SetString(PyExc_IndexError, "string index out of range");
11727 return NULL;
11728 }
11729 kind = PyUnicode_KIND(self);
11730 data = PyUnicode_DATA(self);
11731 ch = PyUnicode_READ(kind, data, index);
Victor Stinner985a82a2014-01-03 12:53:47 +010011732 return unicode_char(ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011733}
11734
Guido van Rossumc2504932007-09-18 19:42:40 +000011735/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +010011736 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +000011737static Py_hash_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011738unicode_hash(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011739{
Gregory P. Smith27cbcd62012-12-10 18:15:46 -080011740 Py_uhash_t x; /* Unsigned for defined overflow behavior. */
Guido van Rossumc2504932007-09-18 19:42:40 +000011741
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011742#ifdef Py_DEBUG
Benjamin Peterson69e97272012-02-21 11:08:50 -050011743 assert(_Py_HashSecret_Initialized);
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011744#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011745 if (_PyUnicode_HASH(self) != -1)
11746 return _PyUnicode_HASH(self);
11747 if (PyUnicode_READY(self) == -1)
11748 return -1;
animalizea1d14252019-01-02 20:16:06 +080011749
Christian Heimes985ecdc2013-11-20 11:46:18 +010011750 x = _Py_HashBytes(PyUnicode_DATA(self),
11751 PyUnicode_GET_LENGTH(self) * PyUnicode_KIND(self));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011752 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +000011753 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011754}
11755
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011756PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011757 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011758\n\
oldkaa0735f2018-02-02 16:52:55 +080011759Return the lowest index in S where substring sub is found,\n\
Lisa Roach43ba8862017-04-04 22:36:22 -070011760such that sub is contained within S[start:end]. Optional\n\
11761arguments start and end are interpreted as in slice notation.\n\
11762\n\
11763Raises ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011764
11765static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011766unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011767{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011768 /* initialize variables to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +000011769 Py_ssize_t result;
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011770 PyObject *substring = NULL;
11771 Py_ssize_t start = 0;
11772 Py_ssize_t end = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011773
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011774 if (!parse_args_finds_unicode("index", args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011775 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011776
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011777 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011778 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011779
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011780 result = any_find_slice(self, substring, start, end, 1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011781
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011782 if (result == -2)
11783 return NULL;
11784
Guido van Rossumd57fd912000-03-10 22:53:23 +000011785 if (result < 0) {
11786 PyErr_SetString(PyExc_ValueError, "substring not found");
11787 return NULL;
11788 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011789
Christian Heimes217cfd12007-12-02 14:31:20 +000011790 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011791}
11792
INADA Naoki3ae20562017-01-16 20:41:20 +090011793/*[clinic input]
INADA Naokia49ac992018-01-27 14:06:21 +090011794str.isascii as unicode_isascii
11795
11796Return True if all characters in the string are ASCII, False otherwise.
11797
11798ASCII characters have code points in the range U+0000-U+007F.
11799Empty string is ASCII too.
11800[clinic start generated code]*/
11801
11802static PyObject *
11803unicode_isascii_impl(PyObject *self)
11804/*[clinic end generated code: output=c5910d64b5a8003f input=5a43cbc6399621d5]*/
11805{
11806 if (PyUnicode_READY(self) == -1) {
11807 return NULL;
11808 }
11809 return PyBool_FromLong(PyUnicode_IS_ASCII(self));
11810}
11811
11812/*[clinic input]
INADA Naoki3ae20562017-01-16 20:41:20 +090011813str.islower as unicode_islower
Guido van Rossumd57fd912000-03-10 22:53:23 +000011814
INADA Naoki3ae20562017-01-16 20:41:20 +090011815Return True if the string is a lowercase string, False otherwise.
11816
11817A string is lowercase if all cased characters in the string are lowercase and
11818there is at least one cased character in the string.
11819[clinic start generated code]*/
11820
11821static PyObject *
11822unicode_islower_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011823/*[clinic end generated code: output=dbd41995bd005b81 input=acec65ac6821ae47]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011824{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011825 Py_ssize_t i, length;
11826 int kind;
11827 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011828 int cased;
11829
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011830 if (PyUnicode_READY(self) == -1)
11831 return NULL;
11832 length = PyUnicode_GET_LENGTH(self);
11833 kind = PyUnicode_KIND(self);
11834 data = PyUnicode_DATA(self);
11835
Guido van Rossumd57fd912000-03-10 22:53:23 +000011836 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011837 if (length == 1)
11838 return PyBool_FromLong(
11839 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011840
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011841 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011842 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011843 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011844
Guido van Rossumd57fd912000-03-10 22:53:23 +000011845 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011846 for (i = 0; i < length; i++) {
11847 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011848
Benjamin Peterson29060642009-01-31 22:14:21 +000011849 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011850 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000011851 else if (!cased && Py_UNICODE_ISLOWER(ch))
11852 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011853 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011854 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011855}
11856
INADA Naoki3ae20562017-01-16 20:41:20 +090011857/*[clinic input]
11858str.isupper as unicode_isupper
Guido van Rossumd57fd912000-03-10 22:53:23 +000011859
INADA Naoki3ae20562017-01-16 20:41:20 +090011860Return True if the string is an uppercase string, False otherwise.
11861
11862A string is uppercase if all cased characters in the string are uppercase and
11863there is at least one cased character in the string.
11864[clinic start generated code]*/
11865
11866static PyObject *
11867unicode_isupper_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011868/*[clinic end generated code: output=049209c8e7f15f59 input=e9b1feda5d17f2d3]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011869{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011870 Py_ssize_t i, length;
11871 int kind;
11872 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011873 int cased;
11874
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011875 if (PyUnicode_READY(self) == -1)
11876 return NULL;
11877 length = PyUnicode_GET_LENGTH(self);
11878 kind = PyUnicode_KIND(self);
11879 data = PyUnicode_DATA(self);
11880
Guido van Rossumd57fd912000-03-10 22:53:23 +000011881 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011882 if (length == 1)
11883 return PyBool_FromLong(
11884 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011885
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011886 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011887 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011888 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011889
Guido van Rossumd57fd912000-03-10 22:53:23 +000011890 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011891 for (i = 0; i < length; i++) {
11892 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011893
Benjamin Peterson29060642009-01-31 22:14:21 +000011894 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011895 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000011896 else if (!cased && Py_UNICODE_ISUPPER(ch))
11897 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011898 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011899 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011900}
11901
INADA Naoki3ae20562017-01-16 20:41:20 +090011902/*[clinic input]
11903str.istitle as unicode_istitle
Guido van Rossumd57fd912000-03-10 22:53:23 +000011904
INADA Naoki3ae20562017-01-16 20:41:20 +090011905Return True if the string is a title-cased string, False otherwise.
11906
11907In a title-cased string, upper- and title-case characters may only
11908follow uncased characters and lowercase characters only cased ones.
11909[clinic start generated code]*/
11910
11911static PyObject *
11912unicode_istitle_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011913/*[clinic end generated code: output=e9bf6eb91f5d3f0e input=98d32bd2e1f06f8c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011914{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011915 Py_ssize_t i, length;
11916 int kind;
11917 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011918 int cased, previous_is_cased;
11919
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011920 if (PyUnicode_READY(self) == -1)
11921 return NULL;
11922 length = PyUnicode_GET_LENGTH(self);
11923 kind = PyUnicode_KIND(self);
11924 data = PyUnicode_DATA(self);
11925
Guido van Rossumd57fd912000-03-10 22:53:23 +000011926 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011927 if (length == 1) {
11928 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11929 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
11930 (Py_UNICODE_ISUPPER(ch) != 0));
11931 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011932
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011933 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011934 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011935 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011936
Guido van Rossumd57fd912000-03-10 22:53:23 +000011937 cased = 0;
11938 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011939 for (i = 0; i < length; i++) {
11940 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011941
Benjamin Peterson29060642009-01-31 22:14:21 +000011942 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
11943 if (previous_is_cased)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011944 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000011945 previous_is_cased = 1;
11946 cased = 1;
11947 }
11948 else if (Py_UNICODE_ISLOWER(ch)) {
11949 if (!previous_is_cased)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011950 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000011951 previous_is_cased = 1;
11952 cased = 1;
11953 }
11954 else
11955 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011956 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011957 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011958}
11959
INADA Naoki3ae20562017-01-16 20:41:20 +090011960/*[clinic input]
11961str.isspace as unicode_isspace
Guido van Rossumd57fd912000-03-10 22:53:23 +000011962
INADA Naoki3ae20562017-01-16 20:41:20 +090011963Return True if the string is a whitespace string, False otherwise.
11964
11965A string is whitespace if all characters in the string are whitespace and there
11966is at least one character in the string.
11967[clinic start generated code]*/
11968
11969static PyObject *
11970unicode_isspace_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011971/*[clinic end generated code: output=163a63bfa08ac2b9 input=fe462cb74f8437d8]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011972{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011973 Py_ssize_t i, length;
11974 int kind;
11975 void *data;
11976
11977 if (PyUnicode_READY(self) == -1)
11978 return NULL;
11979 length = PyUnicode_GET_LENGTH(self);
11980 kind = PyUnicode_KIND(self);
11981 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011982
Guido van Rossumd57fd912000-03-10 22:53:23 +000011983 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011984 if (length == 1)
11985 return PyBool_FromLong(
11986 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011987
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011988 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011989 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011990 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011991
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011992 for (i = 0; i < length; i++) {
11993 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011994 if (!Py_UNICODE_ISSPACE(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011995 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011996 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011997 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011998}
11999
INADA Naoki3ae20562017-01-16 20:41:20 +090012000/*[clinic input]
12001str.isalpha as unicode_isalpha
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012002
INADA Naoki3ae20562017-01-16 20:41:20 +090012003Return True if the string is an alphabetic string, False otherwise.
12004
12005A string is alphabetic if all characters in the string are alphabetic and there
12006is at least one character in the string.
12007[clinic start generated code]*/
12008
12009static PyObject *
12010unicode_isalpha_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012011/*[clinic end generated code: output=cc81b9ac3883ec4f input=d0fd18a96cbca5eb]*/
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012012{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012013 Py_ssize_t i, length;
12014 int kind;
12015 void *data;
12016
12017 if (PyUnicode_READY(self) == -1)
12018 return NULL;
12019 length = PyUnicode_GET_LENGTH(self);
12020 kind = PyUnicode_KIND(self);
12021 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012022
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012023 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012024 if (length == 1)
12025 return PyBool_FromLong(
12026 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012027
12028 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012029 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012030 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012031
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012032 for (i = 0; i < length; i++) {
12033 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012034 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012035 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012036 Py_RETURN_TRUE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012037}
12038
INADA Naoki3ae20562017-01-16 20:41:20 +090012039/*[clinic input]
12040str.isalnum as unicode_isalnum
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012041
INADA Naoki3ae20562017-01-16 20:41:20 +090012042Return True if the string is an alpha-numeric string, False otherwise.
12043
12044A string is alpha-numeric if all characters in the string are alpha-numeric and
12045there is at least one character in the string.
12046[clinic start generated code]*/
12047
12048static PyObject *
12049unicode_isalnum_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012050/*[clinic end generated code: output=a5a23490ffc3660c input=5c6579bf2e04758c]*/
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012051{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012052 int kind;
12053 void *data;
12054 Py_ssize_t len, i;
12055
12056 if (PyUnicode_READY(self) == -1)
12057 return NULL;
12058
12059 kind = PyUnicode_KIND(self);
12060 data = PyUnicode_DATA(self);
12061 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012062
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012063 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012064 if (len == 1) {
12065 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
12066 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
12067 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012068
12069 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012070 if (len == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012071 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012072
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012073 for (i = 0; i < len; i++) {
12074 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030012075 if (!Py_UNICODE_ISALNUM(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012076 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012077 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012078 Py_RETURN_TRUE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012079}
12080
INADA Naoki3ae20562017-01-16 20:41:20 +090012081/*[clinic input]
12082str.isdecimal as unicode_isdecimal
Guido van Rossumd57fd912000-03-10 22:53:23 +000012083
INADA Naoki3ae20562017-01-16 20:41:20 +090012084Return True if the string is a decimal string, False otherwise.
12085
12086A string is a decimal string if all characters in the string are decimal and
12087there is at least one character in the string.
12088[clinic start generated code]*/
12089
12090static PyObject *
12091unicode_isdecimal_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012092/*[clinic end generated code: output=fb2dcdb62d3fc548 input=336bc97ab4c8268f]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012093{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012094 Py_ssize_t i, length;
12095 int kind;
12096 void *data;
12097
12098 if (PyUnicode_READY(self) == -1)
12099 return NULL;
12100 length = PyUnicode_GET_LENGTH(self);
12101 kind = PyUnicode_KIND(self);
12102 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012103
Guido van Rossumd57fd912000-03-10 22:53:23 +000012104 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012105 if (length == 1)
12106 return PyBool_FromLong(
12107 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012108
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012109 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012110 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012111 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012112
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012113 for (i = 0; i < length; i++) {
12114 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012115 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012116 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012117 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012118}
12119
INADA Naoki3ae20562017-01-16 20:41:20 +090012120/*[clinic input]
12121str.isdigit as unicode_isdigit
Guido van Rossumd57fd912000-03-10 22:53:23 +000012122
INADA Naoki3ae20562017-01-16 20:41:20 +090012123Return True if the string is a digit string, False otherwise.
12124
12125A string is a digit string if all characters in the string are digits and there
12126is at least one character in the string.
12127[clinic start generated code]*/
12128
12129static PyObject *
12130unicode_isdigit_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012131/*[clinic end generated code: output=10a6985311da6858 input=901116c31deeea4c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012132{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012133 Py_ssize_t i, length;
12134 int kind;
12135 void *data;
12136
12137 if (PyUnicode_READY(self) == -1)
12138 return NULL;
12139 length = PyUnicode_GET_LENGTH(self);
12140 kind = PyUnicode_KIND(self);
12141 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012142
Guido van Rossumd57fd912000-03-10 22:53:23 +000012143 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012144 if (length == 1) {
12145 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
12146 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
12147 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012148
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012149 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012150 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012151 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012152
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012153 for (i = 0; i < length; i++) {
12154 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012155 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012156 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012157 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012158}
12159
INADA Naoki3ae20562017-01-16 20:41:20 +090012160/*[clinic input]
12161str.isnumeric as unicode_isnumeric
Guido van Rossumd57fd912000-03-10 22:53:23 +000012162
INADA Naoki3ae20562017-01-16 20:41:20 +090012163Return True if the string is a numeric string, False otherwise.
12164
12165A string is numeric if all characters in the string are numeric and there is at
12166least one character in the string.
12167[clinic start generated code]*/
12168
12169static PyObject *
12170unicode_isnumeric_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012171/*[clinic end generated code: output=9172a32d9013051a input=722507db976f826c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012172{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012173 Py_ssize_t i, length;
12174 int kind;
12175 void *data;
12176
12177 if (PyUnicode_READY(self) == -1)
12178 return NULL;
12179 length = PyUnicode_GET_LENGTH(self);
12180 kind = PyUnicode_KIND(self);
12181 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012182
Guido van Rossumd57fd912000-03-10 22:53:23 +000012183 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012184 if (length == 1)
12185 return PyBool_FromLong(
12186 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012187
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012188 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012189 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012190 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012191
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012192 for (i = 0; i < length; i++) {
12193 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012194 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012195 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012196 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012197}
12198
Martin v. Löwis47383402007-08-15 07:32:56 +000012199int
12200PyUnicode_IsIdentifier(PyObject *self)
12201{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012202 int kind;
12203 void *data;
12204 Py_ssize_t i;
Ezio Melotti93e7afc2011-08-22 14:08:38 +030012205 Py_UCS4 first;
Martin v. Löwis47383402007-08-15 07:32:56 +000012206
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012207 if (PyUnicode_READY(self) == -1) {
12208 Py_FatalError("identifier not ready");
Benjamin Peterson29060642009-01-31 22:14:21 +000012209 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012210 }
12211
12212 /* Special case for empty strings */
12213 if (PyUnicode_GET_LENGTH(self) == 0)
12214 return 0;
12215 kind = PyUnicode_KIND(self);
12216 data = PyUnicode_DATA(self);
Martin v. Löwis47383402007-08-15 07:32:56 +000012217
12218 /* PEP 3131 says that the first character must be in
12219 XID_Start and subsequent characters in XID_Continue,
12220 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000012221 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000012222 letters, digits, underscore). However, given the current
12223 definition of XID_Start and XID_Continue, it is sufficient
12224 to check just for these, except that _ must be allowed
12225 as starting an identifier. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012226 first = PyUnicode_READ(kind, data, 0);
Benjamin Petersonf413b802011-08-12 22:17:18 -050012227 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
Martin v. Löwis47383402007-08-15 07:32:56 +000012228 return 0;
12229
Benjamin Peterson9c6e6a02011-09-28 08:09:05 -040012230 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012231 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000012232 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +000012233 return 1;
12234}
12235
INADA Naoki3ae20562017-01-16 20:41:20 +090012236/*[clinic input]
12237str.isidentifier as unicode_isidentifier
Martin v. Löwis47383402007-08-15 07:32:56 +000012238
INADA Naoki3ae20562017-01-16 20:41:20 +090012239Return True if the string is a valid Python identifier, False otherwise.
12240
Sanyam Khuranaffc5a142018-10-08 12:23:32 +053012241Call keyword.iskeyword(s) to test whether string s is a reserved identifier,
Emanuele Gaifasfc8205c2018-10-08 12:44:47 +020012242such as "def" or "class".
INADA Naoki3ae20562017-01-16 20:41:20 +090012243[clinic start generated code]*/
12244
12245static PyObject *
12246unicode_isidentifier_impl(PyObject *self)
Emanuele Gaifasfc8205c2018-10-08 12:44:47 +020012247/*[clinic end generated code: output=fe585a9666572905 input=2d807a104f21c0c5]*/
Martin v. Löwis47383402007-08-15 07:32:56 +000012248{
12249 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
12250}
12251
INADA Naoki3ae20562017-01-16 20:41:20 +090012252/*[clinic input]
12253str.isprintable as unicode_isprintable
Georg Brandl559e5d72008-06-11 18:37:52 +000012254
INADA Naoki3ae20562017-01-16 20:41:20 +090012255Return True if the string is printable, False otherwise.
12256
12257A string is printable if all of its characters are considered printable in
12258repr() or if it is empty.
12259[clinic start generated code]*/
12260
12261static PyObject *
12262unicode_isprintable_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012263/*[clinic end generated code: output=3ab9626cd32dd1a0 input=98a0e1c2c1813209]*/
Georg Brandl559e5d72008-06-11 18:37:52 +000012264{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012265 Py_ssize_t i, length;
12266 int kind;
12267 void *data;
12268
12269 if (PyUnicode_READY(self) == -1)
12270 return NULL;
12271 length = PyUnicode_GET_LENGTH(self);
12272 kind = PyUnicode_KIND(self);
12273 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000012274
12275 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012276 if (length == 1)
12277 return PyBool_FromLong(
12278 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000012279
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012280 for (i = 0; i < length; i++) {
12281 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000012282 Py_RETURN_FALSE;
12283 }
12284 }
12285 Py_RETURN_TRUE;
12286}
12287
INADA Naoki3ae20562017-01-16 20:41:20 +090012288/*[clinic input]
12289str.join as unicode_join
Guido van Rossumd57fd912000-03-10 22:53:23 +000012290
INADA Naoki3ae20562017-01-16 20:41:20 +090012291 iterable: object
12292 /
12293
12294Concatenate any number of strings.
12295
Martin Panter91a88662017-01-24 00:30:06 +000012296The string whose method is called is inserted in between each given string.
INADA Naoki3ae20562017-01-16 20:41:20 +090012297The result is returned as a new string.
12298
12299Example: '.'.join(['ab', 'pq', 'rs']) -> 'ab.pq.rs'
12300[clinic start generated code]*/
12301
12302static PyObject *
12303unicode_join(PyObject *self, PyObject *iterable)
Martin Panter91a88662017-01-24 00:30:06 +000012304/*[clinic end generated code: output=6857e7cecfe7bf98 input=2f70422bfb8fa189]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012305{
INADA Naoki3ae20562017-01-16 20:41:20 +090012306 return PyUnicode_Join(self, iterable);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012307}
12308
Martin v. Löwis18e16552006-02-15 17:27:45 +000012309static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012310unicode_length(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012311{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012312 if (PyUnicode_READY(self) == -1)
12313 return -1;
12314 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012315}
12316
INADA Naoki3ae20562017-01-16 20:41:20 +090012317/*[clinic input]
12318str.ljust as unicode_ljust
12319
12320 width: Py_ssize_t
12321 fillchar: Py_UCS4 = ' '
12322 /
12323
12324Return a left-justified string of length width.
12325
12326Padding is done using the specified fill character (default is a space).
12327[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012328
12329static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012330unicode_ljust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
12331/*[clinic end generated code: output=1cce0e0e0a0b84b3 input=3ab599e335e60a32]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012332{
Benjamin Petersonbac79492012-01-14 13:34:47 -050012333 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012334 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012335
Victor Stinnerc4b49542011-12-11 22:44:26 +010012336 if (PyUnicode_GET_LENGTH(self) >= width)
12337 return unicode_result_unchanged(self);
12338
12339 return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012340}
12341
INADA Naoki3ae20562017-01-16 20:41:20 +090012342/*[clinic input]
12343str.lower as unicode_lower
Guido van Rossumd57fd912000-03-10 22:53:23 +000012344
INADA Naoki3ae20562017-01-16 20:41:20 +090012345Return a copy of the string converted to lowercase.
12346[clinic start generated code]*/
12347
12348static PyObject *
12349unicode_lower_impl(PyObject *self)
12350/*[clinic end generated code: output=84ef9ed42efad663 input=60a2984b8beff23a]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012351{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050012352 if (PyUnicode_READY(self) == -1)
12353 return NULL;
12354 if (PyUnicode_IS_ASCII(self))
12355 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010012356 return case_operation(self, do_lower);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012357}
12358
/* Strip modes accepted by the strip helpers below. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Method names indexed by the strip modes above.  Both the strings and the
   table itself are read-only, so the pointers are const-qualified too. */
static const char * const stripfuncnames[] = {"lstrip", "rstrip", "strip"};

/* Name of the str method that corresponds to strip mode i. */
#define STRIPNAME(i) (stripfuncnames[i])
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012367
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012368/* externally visible for str.strip(unicode) */
12369PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012370_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012371{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012372 void *data;
12373 int kind;
12374 Py_ssize_t i, j, len;
12375 BLOOM_MASK sepmask;
Victor Stinnerb3a60142013-04-09 22:19:21 +020012376 Py_ssize_t seplen;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012377
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012378 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
12379 return NULL;
12380
12381 kind = PyUnicode_KIND(self);
12382 data = PyUnicode_DATA(self);
12383 len = PyUnicode_GET_LENGTH(self);
Victor Stinnerb3a60142013-04-09 22:19:21 +020012384 seplen = PyUnicode_GET_LENGTH(sepobj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012385 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
12386 PyUnicode_DATA(sepobj),
Victor Stinnerb3a60142013-04-09 22:19:21 +020012387 seplen);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012388
Benjamin Peterson14339b62009-01-31 16:36:08 +000012389 i = 0;
12390 if (striptype != RIGHTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020012391 while (i < len) {
12392 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12393 if (!BLOOM(sepmask, ch))
12394 break;
12395 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12396 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000012397 i++;
12398 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012399 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012400
Benjamin Peterson14339b62009-01-31 16:36:08 +000012401 j = len;
12402 if (striptype != LEFTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020012403 j--;
12404 while (j >= i) {
12405 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12406 if (!BLOOM(sepmask, ch))
12407 break;
12408 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12409 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000012410 j--;
Victor Stinnerb3a60142013-04-09 22:19:21 +020012411 }
12412
Benjamin Peterson29060642009-01-31 22:14:21 +000012413 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012414 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012415
Victor Stinner7931d9a2011-11-04 00:22:48 +010012416 return PyUnicode_Substring(self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012417}
12418
12419PyObject*
12420PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
12421{
12422 unsigned char *data;
12423 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020012424 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012425
Victor Stinnerde636f32011-10-01 03:55:54 +020012426 if (PyUnicode_READY(self) == -1)
12427 return NULL;
12428
Victor Stinner684d5fd2012-05-03 02:32:34 +020012429 length = PyUnicode_GET_LENGTH(self);
12430 end = Py_MIN(end, length);
Victor Stinnerde636f32011-10-01 03:55:54 +020012431
Victor Stinner684d5fd2012-05-03 02:32:34 +020012432 if (start == 0 && end == length)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012433 return unicode_result_unchanged(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012434
Victor Stinnerde636f32011-10-01 03:55:54 +020012435 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020012436 PyErr_SetString(PyExc_IndexError, "string index out of range");
12437 return NULL;
12438 }
Serhiy Storchaka678db842013-01-26 12:16:36 +020012439 if (start >= length || end < start)
12440 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner12bab6d2011-10-01 01:53:49 +020012441
Victor Stinner684d5fd2012-05-03 02:32:34 +020012442 length = end - start;
Victor Stinnerb9275c12011-10-05 14:01:42 +020012443 if (PyUnicode_IS_ASCII(self)) {
Victor Stinnerb9275c12011-10-05 14:01:42 +020012444 data = PyUnicode_1BYTE_DATA(self);
Victor Stinnerd3f08822012-05-29 12:57:52 +020012445 return _PyUnicode_FromASCII((char*)(data + start), length);
Victor Stinnerb9275c12011-10-05 14:01:42 +020012446 }
12447 else {
12448 kind = PyUnicode_KIND(self);
12449 data = PyUnicode_1BYTE_DATA(self);
12450 return PyUnicode_FromKindAndData(kind,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012451 data + kind * start,
Victor Stinnerb9275c12011-10-05 14:01:42 +020012452 length);
12453 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012454}
Guido van Rossumd57fd912000-03-10 22:53:23 +000012455
12456static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012457do_strip(PyObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012458{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012459 Py_ssize_t len, i, j;
12460
12461 if (PyUnicode_READY(self) == -1)
12462 return NULL;
12463
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012464 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012465
Victor Stinnercc7af722013-04-09 22:39:24 +020012466 if (PyUnicode_IS_ASCII(self)) {
12467 Py_UCS1 *data = PyUnicode_1BYTE_DATA(self);
12468
12469 i = 0;
12470 if (striptype != RIGHTSTRIP) {
12471 while (i < len) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020012472 Py_UCS1 ch = data[i];
Victor Stinnercc7af722013-04-09 22:39:24 +020012473 if (!_Py_ascii_whitespace[ch])
12474 break;
12475 i++;
12476 }
12477 }
12478
12479 j = len;
12480 if (striptype != LEFTSTRIP) {
12481 j--;
12482 while (j >= i) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020012483 Py_UCS1 ch = data[j];
Victor Stinnercc7af722013-04-09 22:39:24 +020012484 if (!_Py_ascii_whitespace[ch])
12485 break;
12486 j--;
12487 }
12488 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012489 }
12490 }
Victor Stinnercc7af722013-04-09 22:39:24 +020012491 else {
12492 int kind = PyUnicode_KIND(self);
12493 void *data = PyUnicode_DATA(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012494
Victor Stinnercc7af722013-04-09 22:39:24 +020012495 i = 0;
12496 if (striptype != RIGHTSTRIP) {
12497 while (i < len) {
12498 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12499 if (!Py_UNICODE_ISSPACE(ch))
12500 break;
12501 i++;
12502 }
Victor Stinner9c79e412013-04-09 22:21:08 +020012503 }
Victor Stinnercc7af722013-04-09 22:39:24 +020012504
12505 j = len;
12506 if (striptype != LEFTSTRIP) {
12507 j--;
12508 while (j >= i) {
12509 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12510 if (!Py_UNICODE_ISSPACE(ch))
12511 break;
12512 j--;
12513 }
12514 j++;
12515 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012516 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012517
Victor Stinner7931d9a2011-11-04 00:22:48 +010012518 return PyUnicode_Substring(self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012519}
12520
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012521
12522static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012523do_argstrip(PyObject *self, int striptype, PyObject *sep)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012524{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012525 if (sep != NULL && sep != Py_None) {
12526 if (PyUnicode_Check(sep))
12527 return _PyUnicode_XStrip(self, striptype, sep);
12528 else {
12529 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000012530 "%s arg must be None or str",
12531 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000012532 return NULL;
12533 }
12534 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012535
Benjamin Peterson14339b62009-01-31 16:36:08 +000012536 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012537}
12538
12539
INADA Naoki3ae20562017-01-16 20:41:20 +090012540/*[clinic input]
12541str.strip as unicode_strip
12542
12543 chars: object = None
12544 /
12545
Return a copy of the string with leading and trailing whitespace removed.
INADA Naoki3ae20562017-01-16 20:41:20 +090012547
12548If chars is given and not None, remove characters in chars instead.
12549[clinic start generated code]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012550
12551static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012552unicode_strip_impl(PyObject *self, PyObject *chars)
Victor Stinner0c4a8282017-01-17 02:21:47 +010012553/*[clinic end generated code: output=ca19018454345d57 input=eefe24a1059c352b]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012554{
INADA Naoki3ae20562017-01-16 20:41:20 +090012555 return do_argstrip(self, BOTHSTRIP, chars);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012556}
12557
12558
INADA Naoki3ae20562017-01-16 20:41:20 +090012559/*[clinic input]
12560str.lstrip as unicode_lstrip
12561
12562 chars: object = NULL
12563 /
12564
12565Return a copy of the string with leading whitespace removed.
12566
12567If chars is given and not None, remove characters in chars instead.
12568[clinic start generated code]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012569
12570static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012571unicode_lstrip_impl(PyObject *self, PyObject *chars)
12572/*[clinic end generated code: output=3b43683251f79ca7 input=9e56f3c45f5ff4c3]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012573{
INADA Naoki3ae20562017-01-16 20:41:20 +090012574 return do_argstrip(self, LEFTSTRIP, chars);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012575}
12576
12577
INADA Naoki3ae20562017-01-16 20:41:20 +090012578/*[clinic input]
12579str.rstrip as unicode_rstrip
12580
12581 chars: object = NULL
12582 /
12583
12584Return a copy of the string with trailing whitespace removed.
12585
12586If chars is given and not None, remove characters in chars instead.
12587[clinic start generated code]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012588
12589static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012590unicode_rstrip_impl(PyObject *self, PyObject *chars)
12591/*[clinic end generated code: output=4a59230017cc3b7a input=ac89d0219cb411ee]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012592{
INADA Naoki3ae20562017-01-16 20:41:20 +090012593 return do_argstrip(self, RIGHTSTRIP, chars);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012594}
12595
12596
Guido van Rossumd57fd912000-03-10 22:53:23 +000012597static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012598unicode_repeat(PyObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012599{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012600 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012601 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012602
Serhiy Storchaka05997252013-01-26 12:14:02 +020012603 if (len < 1)
12604 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +000012605
Victor Stinnerc4b49542011-12-11 22:44:26 +010012606 /* no repeat, return original string */
12607 if (len == 1)
12608 return unicode_result_unchanged(str);
Tim Peters8f422462000-09-09 06:13:41 +000012609
Benjamin Petersonbac79492012-01-14 13:34:47 -050012610 if (PyUnicode_READY(str) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012611 return NULL;
12612
Victor Stinnerc759f3e2011-10-01 03:09:58 +020012613 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020012614 PyErr_SetString(PyExc_OverflowError,
12615 "repeated string is too long");
12616 return NULL;
12617 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012618 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012619
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012620 u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012621 if (!u)
12622 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012623 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012624
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012625 if (PyUnicode_GET_LENGTH(str) == 1) {
12626 const int kind = PyUnicode_KIND(str);
12627 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
Victor Stinner73f53b52011-12-18 03:26:31 +010012628 if (kind == PyUnicode_1BYTE_KIND) {
12629 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012630 memset(to, (unsigned char)fill_char, len);
Victor Stinner73f53b52011-12-18 03:26:31 +010012631 }
12632 else if (kind == PyUnicode_2BYTE_KIND) {
12633 Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012634 for (n = 0; n < len; ++n)
Victor Stinner73f53b52011-12-18 03:26:31 +010012635 ucs2[n] = fill_char;
12636 } else {
12637 Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u);
12638 assert(kind == PyUnicode_4BYTE_KIND);
12639 for (n = 0; n < len; ++n)
12640 ucs4[n] = fill_char;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012641 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012642 }
12643 else {
12644 /* number of characters copied this far */
12645 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012646 const Py_ssize_t char_size = PyUnicode_KIND(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012647 char *to = (char *) PyUnicode_DATA(u);
Christian Heimesf051e432016-09-13 20:22:02 +020012648 memcpy(to, PyUnicode_DATA(str),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012649 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000012650 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012651 n = (done <= nchars-done) ? done : nchars-done;
Christian Heimesf051e432016-09-13 20:22:02 +020012652 memcpy(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012653 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000012654 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012655 }
12656
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012657 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012658 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012659}
12660
Alexander Belopolsky40018472011-02-26 01:02:56 +000012661PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012662PyUnicode_Replace(PyObject *str,
12663 PyObject *substr,
12664 PyObject *replstr,
Alexander Belopolsky40018472011-02-26 01:02:56 +000012665 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012666{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012667 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0 ||
12668 ensure_unicode(replstr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012669 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012670 return replace(str, substr, replstr, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012671}
12672
INADA Naoki3ae20562017-01-16 20:41:20 +090012673/*[clinic input]
12674str.replace as unicode_replace
Guido van Rossumd57fd912000-03-10 22:53:23 +000012675
INADA Naoki3ae20562017-01-16 20:41:20 +090012676 old: unicode
12677 new: unicode
12678 count: Py_ssize_t = -1
12679 Maximum number of occurrences to replace.
12680 -1 (the default value) means replace all occurrences.
12681 /
12682
12683Return a copy with all occurrences of substring old replaced by new.
12684
12685If the optional argument count is given, only the first count occurrences are
12686replaced.
12687[clinic start generated code]*/
12688
12689static PyObject *
12690unicode_replace_impl(PyObject *self, PyObject *old, PyObject *new,
12691 Py_ssize_t count)
12692/*[clinic end generated code: output=b63f1a8b5eebf448 input=147d12206276ebeb]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012693{
Benjamin Peterson22a29702012-01-02 09:00:30 -060012694 if (PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000012695 return NULL;
INADA Naoki3ae20562017-01-16 20:41:20 +090012696 return replace(self, old, new, count);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012697}
12698
Alexander Belopolsky40018472011-02-26 01:02:56 +000012699static PyObject *
12700unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012701{
Walter Dörwald79e913e2007-05-12 11:08:06 +000012702 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012703 Py_ssize_t isize;
12704 Py_ssize_t osize, squote, dquote, i, o;
12705 Py_UCS4 max, quote;
Victor Stinner55c08782013-04-14 18:45:39 +020012706 int ikind, okind, unchanged;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012707 void *idata, *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000012708
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012709 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000012710 return NULL;
12711
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012712 isize = PyUnicode_GET_LENGTH(unicode);
12713 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012714
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012715 /* Compute length of output, quote characters, and
12716 maximum character */
Victor Stinner55c08782013-04-14 18:45:39 +020012717 osize = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012718 max = 127;
12719 squote = dquote = 0;
12720 ikind = PyUnicode_KIND(unicode);
12721 for (i = 0; i < isize; i++) {
12722 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Benjamin Peterson736b8012014-09-29 23:02:15 -040012723 Py_ssize_t incr = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012724 switch (ch) {
Benjamin Peterson736b8012014-09-29 23:02:15 -040012725 case '\'': squote++; break;
12726 case '"': dquote++; break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012727 case '\\': case '\t': case '\r': case '\n':
Benjamin Peterson736b8012014-09-29 23:02:15 -040012728 incr = 2;
12729 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012730 default:
12731 /* Fast-path ASCII */
12732 if (ch < ' ' || ch == 0x7f)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012733 incr = 4; /* \xHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012734 else if (ch < 0x7f)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012735 ;
12736 else if (Py_UNICODE_ISPRINTABLE(ch))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012737 max = ch > max ? ch : max;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012738 else if (ch < 0x100)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012739 incr = 4; /* \xHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012740 else if (ch < 0x10000)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012741 incr = 6; /* \uHHHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012742 else
Benjamin Peterson736b8012014-09-29 23:02:15 -040012743 incr = 10; /* \uHHHHHHHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012744 }
Benjamin Peterson736b8012014-09-29 23:02:15 -040012745 if (osize > PY_SSIZE_T_MAX - incr) {
12746 PyErr_SetString(PyExc_OverflowError,
12747 "string is too long to generate repr");
12748 return NULL;
12749 }
12750 osize += incr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012751 }
12752
12753 quote = '\'';
Victor Stinner55c08782013-04-14 18:45:39 +020012754 unchanged = (osize == isize);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012755 if (squote) {
Victor Stinner55c08782013-04-14 18:45:39 +020012756 unchanged = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012757 if (dquote)
12758 /* Both squote and dquote present. Use squote,
12759 and escape them */
12760 osize += squote;
12761 else
12762 quote = '"';
12763 }
Victor Stinner55c08782013-04-14 18:45:39 +020012764 osize += 2; /* quotes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012765
12766 repr = PyUnicode_New(osize, max);
12767 if (repr == NULL)
12768 return NULL;
12769 okind = PyUnicode_KIND(repr);
12770 odata = PyUnicode_DATA(repr);
12771
12772 PyUnicode_WRITE(okind, odata, 0, quote);
12773 PyUnicode_WRITE(okind, odata, osize-1, quote);
Victor Stinner55c08782013-04-14 18:45:39 +020012774 if (unchanged) {
12775 _PyUnicode_FastCopyCharacters(repr, 1,
12776 unicode, 0,
12777 isize);
12778 }
12779 else {
12780 for (i = 0, o = 1; i < isize; i++) {
12781 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012782
Victor Stinner55c08782013-04-14 18:45:39 +020012783 /* Escape quotes and backslashes */
12784 if ((ch == quote) || (ch == '\\')) {
Kristján Valur Jónsson55e5dc82012-06-06 21:58:08 +000012785 PyUnicode_WRITE(okind, odata, o++, '\\');
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012786 PyUnicode_WRITE(okind, odata, o++, ch);
Victor Stinner55c08782013-04-14 18:45:39 +020012787 continue;
12788 }
12789
12790 /* Map special whitespace to '\t', \n', '\r' */
12791 if (ch == '\t') {
12792 PyUnicode_WRITE(okind, odata, o++, '\\');
12793 PyUnicode_WRITE(okind, odata, o++, 't');
12794 }
12795 else if (ch == '\n') {
12796 PyUnicode_WRITE(okind, odata, o++, '\\');
12797 PyUnicode_WRITE(okind, odata, o++, 'n');
12798 }
12799 else if (ch == '\r') {
12800 PyUnicode_WRITE(okind, odata, o++, '\\');
12801 PyUnicode_WRITE(okind, odata, o++, 'r');
12802 }
12803
12804 /* Map non-printable US ASCII to '\xhh' */
12805 else if (ch < ' ' || ch == 0x7F) {
12806 PyUnicode_WRITE(okind, odata, o++, '\\');
12807 PyUnicode_WRITE(okind, odata, o++, 'x');
12808 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12809 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12810 }
12811
12812 /* Copy ASCII characters as-is */
12813 else if (ch < 0x7F) {
12814 PyUnicode_WRITE(okind, odata, o++, ch);
12815 }
12816
12817 /* Non-ASCII characters */
12818 else {
12819 /* Map Unicode whitespace and control characters
12820 (categories Z* and C* except ASCII space)
12821 */
12822 if (!Py_UNICODE_ISPRINTABLE(ch)) {
12823 PyUnicode_WRITE(okind, odata, o++, '\\');
12824 /* Map 8-bit characters to '\xhh' */
12825 if (ch <= 0xff) {
12826 PyUnicode_WRITE(okind, odata, o++, 'x');
12827 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12828 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12829 }
12830 /* Map 16-bit characters to '\uxxxx' */
12831 else if (ch <= 0xffff) {
12832 PyUnicode_WRITE(okind, odata, o++, 'u');
12833 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12834 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12835 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12836 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12837 }
12838 /* Map 21-bit characters to '\U00xxxxxx' */
12839 else {
12840 PyUnicode_WRITE(okind, odata, o++, 'U');
12841 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
12842 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
12843 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
12844 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
12845 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12846 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12847 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12848 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12849 }
12850 }
12851 /* Copy characters as-is */
12852 else {
12853 PyUnicode_WRITE(okind, odata, o++, ch);
12854 }
Georg Brandl559e5d72008-06-11 18:37:52 +000012855 }
12856 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000012857 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012858 /* Closing quote already added at the beginning */
Victor Stinner05d11892011-10-06 01:13:58 +020012859 assert(_PyUnicode_CheckConsistency(repr, 1));
Walter Dörwald79e913e2007-05-12 11:08:06 +000012860 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012861}
12862
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012863PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012864 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012865\n\
12866Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080012867such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012868arguments start and end are interpreted as in slice notation.\n\
12869\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012870Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012871
12872static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012873unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012874{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010012875 /* initialize variables to prevent gcc warning */
12876 PyObject *substring = NULL;
12877 Py_ssize_t start = 0;
12878 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012879 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012880
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030012881 if (!parse_args_finds_unicode("rfind", args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012882 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012883
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012884 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012885 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012886
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012887 result = any_find_slice(self, substring, start, end, -1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012888
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012889 if (result == -2)
12890 return NULL;
12891
Christian Heimes217cfd12007-12-02 14:31:20 +000012892 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012893}
12894
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012895PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012896 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012897\n\
Lisa Roach43ba8862017-04-04 22:36:22 -070012898Return the highest index in S where substring sub is found,\n\
12899such that sub is contained within S[start:end]. Optional\n\
12900arguments start and end are interpreted as in slice notation.\n\
12901\n\
12902Raises ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012903
12904static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012905unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012906{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010012907 /* initialize variables to prevent gcc warning */
12908 PyObject *substring = NULL;
12909 Py_ssize_t start = 0;
12910 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012911 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012912
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030012913 if (!parse_args_finds_unicode("rindex", args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012914 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012915
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012916 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012917 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012918
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012919 result = any_find_slice(self, substring, start, end, -1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012920
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012921 if (result == -2)
12922 return NULL;
12923
Guido van Rossumd57fd912000-03-10 22:53:23 +000012924 if (result < 0) {
12925 PyErr_SetString(PyExc_ValueError, "substring not found");
12926 return NULL;
12927 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012928
Christian Heimes217cfd12007-12-02 14:31:20 +000012929 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012930}
12931
INADA Naoki3ae20562017-01-16 20:41:20 +090012932/*[clinic input]
12933str.rjust as unicode_rjust
12934
12935 width: Py_ssize_t
12936 fillchar: Py_UCS4 = ' '
12937 /
12938
12939Return a right-justified string of length width.
12940
12941Padding is done using the specified fill character (default is a space).
12942[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012943
12944static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012945unicode_rjust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
12946/*[clinic end generated code: output=804a1a57fbe8d5cf input=d05f550b5beb1f72]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012947{
Benjamin Petersonbac79492012-01-14 13:34:47 -050012948 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012949 return NULL;
12950
Victor Stinnerc4b49542011-12-11 22:44:26 +010012951 if (PyUnicode_GET_LENGTH(self) >= width)
12952 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012953
Victor Stinnerc4b49542011-12-11 22:44:26 +010012954 return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012955}
12956
Alexander Belopolsky40018472011-02-26 01:02:56 +000012957PyObject *
12958PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012959{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012960 if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012961 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012962
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012963 return split(s, sep, maxsplit);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012964}
12965
INADA Naoki3ae20562017-01-16 20:41:20 +090012966/*[clinic input]
12967str.split as unicode_split
Guido van Rossumd57fd912000-03-10 22:53:23 +000012968
INADA Naoki3ae20562017-01-16 20:41:20 +090012969 sep: object = None
        The delimiter according to which to split the string.
12971 None (the default value) means split according to any whitespace,
12972 and discard empty strings from the result.
12973 maxsplit: Py_ssize_t = -1
12974 Maximum number of splits to do.
12975 -1 (the default value) means no limit.
12976
12977Return a list of the words in the string, using sep as the delimiter string.
12978[clinic start generated code]*/
12979
12980static PyObject *
12981unicode_split_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
12982/*[clinic end generated code: output=3a65b1db356948dc input=606e750488a82359]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012983{
INADA Naoki3ae20562017-01-16 20:41:20 +090012984 if (sep == Py_None)
12985 return split(self, NULL, maxsplit);
12986 if (PyUnicode_Check(sep))
12987 return split(self, sep, maxsplit);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012988
Victor Stinner998b8062018-09-12 00:23:25 +020012989 PyErr_Format(PyExc_TypeError,
12990 "must be str or None, not %.100s",
12991 Py_TYPE(sep)->tp_name);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012992 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012993}
12994
Thomas Wouters477c8d52006-05-27 19:21:47 +000012995PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012996PyUnicode_Partition(PyObject *str_obj, PyObject *sep_obj)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012997{
Thomas Wouters477c8d52006-05-27 19:21:47 +000012998 PyObject* out;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012999 int kind1, kind2;
13000 void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013001 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000013002
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013003 if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013004 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000013005
Victor Stinner14f8f022011-10-05 20:58:25 +020013006 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013007 kind2 = PyUnicode_KIND(sep_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013008 len1 = PyUnicode_GET_LENGTH(str_obj);
13009 len2 = PyUnicode_GET_LENGTH(sep_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013010 if (kind1 < kind2 || len1 < len2) {
13011 _Py_INCREF_UNICODE_EMPTY();
13012 if (!unicode_empty)
13013 out = NULL;
13014 else {
13015 out = PyTuple_Pack(3, str_obj, unicode_empty, unicode_empty);
13016 Py_DECREF(unicode_empty);
13017 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013018 return out;
13019 }
13020 buf1 = PyUnicode_DATA(str_obj);
13021 buf2 = PyUnicode_DATA(sep_obj);
13022 if (kind2 != kind1) {
13023 buf2 = _PyUnicode_AsKind(sep_obj, kind1);
13024 if (!buf2)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013025 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013026 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013027
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013028 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013029 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020013030 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
13031 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
13032 else
13033 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013034 break;
13035 case PyUnicode_2BYTE_KIND:
13036 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
13037 break;
13038 case PyUnicode_4BYTE_KIND:
13039 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
13040 break;
13041 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070013042 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013043 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000013044
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013045 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013046 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013047
13048 return out;
13049}
13050
13051
13052PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013053PyUnicode_RPartition(PyObject *str_obj, PyObject *sep_obj)
Thomas Wouters477c8d52006-05-27 19:21:47 +000013054{
Thomas Wouters477c8d52006-05-27 19:21:47 +000013055 PyObject* out;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013056 int kind1, kind2;
13057 void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013058 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000013059
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013060 if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013061 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000013062
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013063 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013064 kind2 = PyUnicode_KIND(sep_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013065 len1 = PyUnicode_GET_LENGTH(str_obj);
13066 len2 = PyUnicode_GET_LENGTH(sep_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013067 if (kind1 < kind2 || len1 < len2) {
13068 _Py_INCREF_UNICODE_EMPTY();
13069 if (!unicode_empty)
13070 out = NULL;
13071 else {
13072 out = PyTuple_Pack(3, unicode_empty, unicode_empty, str_obj);
13073 Py_DECREF(unicode_empty);
13074 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013075 return out;
13076 }
13077 buf1 = PyUnicode_DATA(str_obj);
13078 buf2 = PyUnicode_DATA(sep_obj);
13079 if (kind2 != kind1) {
13080 buf2 = _PyUnicode_AsKind(sep_obj, kind1);
13081 if (!buf2)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013082 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013083 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013084
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013085 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013086 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020013087 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
13088 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13089 else
13090 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013091 break;
13092 case PyUnicode_2BYTE_KIND:
13093 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13094 break;
13095 case PyUnicode_4BYTE_KIND:
13096 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13097 break;
13098 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070013099 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013100 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000013101
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013102 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013103 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013104
13105 return out;
13106}
13107
INADA Naoki3ae20562017-01-16 20:41:20 +090013108/*[clinic input]
13109str.partition as unicode_partition
Thomas Wouters477c8d52006-05-27 19:21:47 +000013110
INADA Naoki3ae20562017-01-16 20:41:20 +090013111 sep: object
13112 /
13113
13114Partition the string into three parts using the given separator.
13115
13116This will search for the separator in the string. If the separator is found,
13117returns a 3-tuple containing the part before the separator, the separator
13118itself, and the part after it.
13119
13120If the separator is not found, returns a 3-tuple containing the original string
13121and two empty strings.
13122[clinic start generated code]*/
13123
13124static PyObject *
13125unicode_partition(PyObject *self, PyObject *sep)
13126/*[clinic end generated code: output=e4ced7bd253ca3c4 input=f29b8d06c63e50be]*/
Thomas Wouters477c8d52006-05-27 19:21:47 +000013127{
INADA Naoki3ae20562017-01-16 20:41:20 +090013128 return PyUnicode_Partition(self, sep);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013129}
13130
INADA Naoki3ae20562017-01-16 20:41:20 +090013131/*[clinic input]
13132str.rpartition as unicode_rpartition = str.partition
Thomas Wouters477c8d52006-05-27 19:21:47 +000013133
INADA Naoki3ae20562017-01-16 20:41:20 +090013134Partition the string into three parts using the given separator.
13135
Serhiy Storchakaa2314282017-10-29 02:11:54 +030013136This will search for the separator in the string, starting at the end. If
INADA Naoki3ae20562017-01-16 20:41:20 +090013137the separator is found, returns a 3-tuple containing the part before the
13138separator, the separator itself, and the part after it.
13139
13140If the separator is not found, returns a 3-tuple containing two empty strings
13141and the original string.
13142[clinic start generated code]*/
13143
13144static PyObject *
13145unicode_rpartition(PyObject *self, PyObject *sep)
Serhiy Storchakaa2314282017-10-29 02:11:54 +030013146/*[clinic end generated code: output=1aa13cf1156572aa input=c4b7db3ef5cf336a]*/
Thomas Wouters477c8d52006-05-27 19:21:47 +000013147{
INADA Naoki3ae20562017-01-16 20:41:20 +090013148 return PyUnicode_RPartition(self, sep);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013149}
13150
Alexander Belopolsky40018472011-02-26 01:02:56 +000013151PyObject *
13152PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013153{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013154 if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
Benjamin Peterson14339b62009-01-31 16:36:08 +000013155 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013156
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013157 return rsplit(s, sep, maxsplit);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013158}
13159
INADA Naoki3ae20562017-01-16 20:41:20 +090013160/*[clinic input]
13161str.rsplit as unicode_rsplit = str.split
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013162
INADA Naoki3ae20562017-01-16 20:41:20 +090013163Return a list of the words in the string, using sep as the delimiter string.
13164
13165Splits are done starting at the end of the string and working to the front.
13166[clinic start generated code]*/
13167
13168static PyObject *
13169unicode_rsplit_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
13170/*[clinic end generated code: output=c2b815c63bcabffc input=12ad4bf57dd35f15]*/
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013171{
INADA Naoki3ae20562017-01-16 20:41:20 +090013172 if (sep == Py_None)
13173 return rsplit(self, NULL, maxsplit);
13174 if (PyUnicode_Check(sep))
13175 return rsplit(self, sep, maxsplit);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013176
Victor Stinner998b8062018-09-12 00:23:25 +020013177 PyErr_Format(PyExc_TypeError,
13178 "must be str or None, not %.100s",
13179 Py_TYPE(sep)->tp_name);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013180 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013181}
13182
INADA Naoki3ae20562017-01-16 20:41:20 +090013183/*[clinic input]
13184str.splitlines as unicode_splitlines
Guido van Rossumd57fd912000-03-10 22:53:23 +000013185
Serhiy Storchaka202fda52017-03-12 10:10:47 +020013186 keepends: bool(accept={int}) = False
INADA Naoki3ae20562017-01-16 20:41:20 +090013187
13188Return a list of the lines in the string, breaking at line boundaries.
13189
13190Line breaks are not included in the resulting list unless keepends is given and
13191true.
13192[clinic start generated code]*/
13193
13194static PyObject *
13195unicode_splitlines_impl(PyObject *self, int keepends)
Serhiy Storchaka202fda52017-03-12 10:10:47 +020013196/*[clinic end generated code: output=f664dcdad153ec40 input=b508e180459bdd8b]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013197{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013198 return PyUnicode_Splitlines(self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013199}
13200
13201static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000013202PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013203{
Victor Stinnerc4b49542011-12-11 22:44:26 +010013204 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013205}
13206
INADA Naoki3ae20562017-01-16 20:41:20 +090013207/*[clinic input]
13208str.swapcase as unicode_swapcase
Guido van Rossumd57fd912000-03-10 22:53:23 +000013209
INADA Naoki3ae20562017-01-16 20:41:20 +090013210Convert uppercase characters to lowercase and lowercase characters to uppercase.
13211[clinic start generated code]*/
13212
13213static PyObject *
13214unicode_swapcase_impl(PyObject *self)
13215/*[clinic end generated code: output=5d28966bf6d7b2af input=3f3ef96d5798a7bb]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013216{
Benjamin Petersoneea48462012-01-16 14:28:50 -050013217 if (PyUnicode_READY(self) == -1)
13218 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010013219 return case_operation(self, do_swapcase);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013220}
13221
Larry Hastings61272b72014-01-07 12:41:53 -080013222/*[clinic input]
Georg Brandlceee0772007-11-27 23:48:05 +000013223
Larry Hastings31826802013-10-19 00:09:25 -070013224@staticmethod
13225str.maketrans as unicode_maketrans
13226
13227 x: object
13228
13229 y: unicode=NULL
13230
13231 z: unicode=NULL
13232
13233 /
13234
13235Return a translation table usable for str.translate().
13236
13237If there is only one argument, it must be a dictionary mapping Unicode
13238ordinals (integers) or characters to Unicode ordinals, strings or None.
13239Character keys will be then converted to ordinals.
13240If there are two arguments, they must be strings of equal length, and
13241in the resulting dictionary, each character in x will be mapped to the
13242character at the same position in y. If there is a third argument, it
13243must be a string, whose characters will be mapped to None in the result.
Larry Hastings61272b72014-01-07 12:41:53 -080013244[clinic start generated code]*/
Larry Hastings31826802013-10-19 00:09:25 -070013245
Larry Hastings31826802013-10-19 00:09:25 -070013246static PyObject *
Larry Hastings5c661892014-01-24 06:17:25 -080013247unicode_maketrans_impl(PyObject *x, PyObject *y, PyObject *z)
Serhiy Storchaka1009bf12015-04-03 23:53:51 +030013248/*[clinic end generated code: output=a925c89452bd5881 input=7bfbf529a293c6c5]*/
Larry Hastings31826802013-10-19 00:09:25 -070013249{
Georg Brandlceee0772007-11-27 23:48:05 +000013250 PyObject *new = NULL, *key, *value;
13251 Py_ssize_t i = 0;
13252 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013253
Georg Brandlceee0772007-11-27 23:48:05 +000013254 new = PyDict_New();
13255 if (!new)
13256 return NULL;
13257 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013258 int x_kind, y_kind, z_kind;
13259 void *x_data, *y_data, *z_data;
13260
Georg Brandlceee0772007-11-27 23:48:05 +000013261 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000013262 if (!PyUnicode_Check(x)) {
13263 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
13264 "be a string if there is a second argument");
13265 goto err;
13266 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013267 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013268 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
13269 "arguments must have equal length");
13270 goto err;
13271 }
13272 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013273 x_kind = PyUnicode_KIND(x);
13274 y_kind = PyUnicode_KIND(y);
13275 x_data = PyUnicode_DATA(x);
13276 y_data = PyUnicode_DATA(y);
13277 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
13278 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060013279 if (!key)
Georg Brandlceee0772007-11-27 23:48:05 +000013280 goto err;
Benjamin Peterson822c7902011-12-20 13:32:50 -060013281 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060013282 if (!value) {
13283 Py_DECREF(key);
13284 goto err;
13285 }
Georg Brandlceee0772007-11-27 23:48:05 +000013286 res = PyDict_SetItem(new, key, value);
13287 Py_DECREF(key);
13288 Py_DECREF(value);
13289 if (res < 0)
13290 goto err;
13291 }
13292 /* create entries for deleting chars in z */
13293 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013294 z_kind = PyUnicode_KIND(z);
13295 z_data = PyUnicode_DATA(z);
Victor Stinnerc4f281e2011-10-11 22:11:42 +020013296 for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013297 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000013298 if (!key)
13299 goto err;
13300 res = PyDict_SetItem(new, key, Py_None);
13301 Py_DECREF(key);
13302 if (res < 0)
13303 goto err;
13304 }
13305 }
13306 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013307 int kind;
13308 void *data;
13309
Georg Brandlceee0772007-11-27 23:48:05 +000013310 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000013311 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013312 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
13313 "to maketrans it must be a dict");
13314 goto err;
13315 }
13316 /* copy entries into the new dict, converting string keys to int keys */
13317 while (PyDict_Next(x, &i, &key, &value)) {
13318 if (PyUnicode_Check(key)) {
13319 /* convert string keys to integer keys */
13320 PyObject *newkey;
Victor Stinnerc4f281e2011-10-11 22:11:42 +020013321 if (PyUnicode_GET_LENGTH(key) != 1) {
Georg Brandlceee0772007-11-27 23:48:05 +000013322 PyErr_SetString(PyExc_ValueError, "string keys in translate "
13323 "table must be of length 1");
13324 goto err;
13325 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013326 kind = PyUnicode_KIND(key);
13327 data = PyUnicode_DATA(key);
13328 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000013329 if (!newkey)
13330 goto err;
13331 res = PyDict_SetItem(new, newkey, value);
13332 Py_DECREF(newkey);
13333 if (res < 0)
13334 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000013335 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013336 /* just keep integer keys */
13337 if (PyDict_SetItem(new, key, value) < 0)
13338 goto err;
13339 } else {
13340 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
13341 "be strings or integers");
13342 goto err;
13343 }
13344 }
13345 }
13346 return new;
13347 err:
13348 Py_DECREF(new);
13349 return NULL;
13350}
13351
INADA Naoki3ae20562017-01-16 20:41:20 +090013352/*[clinic input]
13353str.translate as unicode_translate
Guido van Rossumd57fd912000-03-10 22:53:23 +000013354
INADA Naoki3ae20562017-01-16 20:41:20 +090013355 table: object
13356 Translation table, which must be a mapping of Unicode ordinals to
13357 Unicode ordinals, strings, or None.
13358 /
13359
13360Replace each character in the string using the given translation table.
13361
13362The table must implement lookup/indexing via __getitem__, for instance a
13363dictionary or list. If this operation raises LookupError, the character is
13364left untouched. Characters mapped to None are deleted.
13365[clinic start generated code]*/
13366
13367static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013368unicode_translate(PyObject *self, PyObject *table)
INADA Naoki3ae20562017-01-16 20:41:20 +090013369/*[clinic end generated code: output=3cb448ff2fd96bf3 input=6d38343db63d8eb0]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013370{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013371 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013372}
13373
INADA Naoki3ae20562017-01-16 20:41:20 +090013374/*[clinic input]
13375str.upper as unicode_upper
Guido van Rossumd57fd912000-03-10 22:53:23 +000013376
INADA Naoki3ae20562017-01-16 20:41:20 +090013377Return a copy of the string converted to uppercase.
13378[clinic start generated code]*/
13379
13380static PyObject *
13381unicode_upper_impl(PyObject *self)
13382/*[clinic end generated code: output=1b7ddd16bbcdc092 input=db3d55682dfe2e6c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013383{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050013384 if (PyUnicode_READY(self) == -1)
13385 return NULL;
13386 if (PyUnicode_IS_ASCII(self))
13387 return ascii_upper_or_lower(self, 0);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010013388 return case_operation(self, do_upper);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013389}
13390
INADA Naoki3ae20562017-01-16 20:41:20 +090013391/*[clinic input]
13392str.zfill as unicode_zfill
13393
13394 width: Py_ssize_t
13395 /
13396
13397Pad a numeric string with zeros on the left, to fill a field of the given width.
13398
13399The string is never truncated.
13400[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013401
13402static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090013403unicode_zfill_impl(PyObject *self, Py_ssize_t width)
INADA Naoki15f94592017-01-16 21:49:13 +090013404/*[clinic end generated code: output=e13fb6bdf8e3b9df input=c6b2f772c6f27799]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013405{
Martin v. Löwis18e16552006-02-15 17:27:45 +000013406 Py_ssize_t fill;
Victor Stinner9310abb2011-10-05 00:59:23 +020013407 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013408 int kind;
13409 void *data;
13410 Py_UCS4 chr;
13411
Benjamin Petersonbac79492012-01-14 13:34:47 -050013412 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010013413 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013414
Victor Stinnerc4b49542011-12-11 22:44:26 +010013415 if (PyUnicode_GET_LENGTH(self) >= width)
13416 return unicode_result_unchanged(self);
13417
13418 fill = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013419
13420 u = pad(self, fill, 0, '0');
13421
Walter Dörwald068325e2002-04-15 13:36:47 +000013422 if (u == NULL)
13423 return NULL;
13424
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013425 kind = PyUnicode_KIND(u);
13426 data = PyUnicode_DATA(u);
13427 chr = PyUnicode_READ(kind, data, fill);
13428
13429 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000013430 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013431 PyUnicode_WRITE(kind, data, 0, chr);
13432 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000013433 }
13434
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013435 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +010013436 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013437}
Guido van Rossumd57fd912000-03-10 22:53:23 +000013438
#if 0
/* Compiled out: thin wrapper that forwards `self` to
   PyUnicode_TransformDecimalAndSpaceToASCII(). */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *converted = PyUnicode_TransformDecimalAndSpaceToASCII(self);

    return converted;
}
#endif
13446
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013447PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013448 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013449\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013450Return True if S starts with the specified prefix, False otherwise.\n\
13451With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013452With optional end, stop comparing S at that position.\n\
13453prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013454
13455static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013456unicode_startswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013457 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013458{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013459 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013460 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013461 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013462 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013463 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013464
Jesus Ceaac451502011-04-20 17:09:23 +020013465 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013466 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013467 if (PyTuple_Check(subobj)) {
13468 Py_ssize_t i;
13469 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013470 substring = PyTuple_GET_ITEM(subobj, i);
13471 if (!PyUnicode_Check(substring)) {
13472 PyErr_Format(PyExc_TypeError,
13473 "tuple for startswith must only contain str, "
Victor Stinner998b8062018-09-12 00:23:25 +020013474 "not %.100s",
13475 Py_TYPE(substring)->tp_name);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013476 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013477 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013478 result = tailmatch(self, substring, start, end, -1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013479 if (result == -1)
13480 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013481 if (result) {
13482 Py_RETURN_TRUE;
13483 }
13484 }
13485 /* nothing matched */
13486 Py_RETURN_FALSE;
13487 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013488 if (!PyUnicode_Check(subobj)) {
13489 PyErr_Format(PyExc_TypeError,
13490 "startswith first arg must be str or "
Victor Stinner998b8062018-09-12 00:23:25 +020013491 "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013492 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013493 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013494 result = tailmatch(self, subobj, start, end, -1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013495 if (result == -1)
13496 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013497 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013498}
13499
13500
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013501PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013502 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013503\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013504Return True if S ends with the specified suffix, False otherwise.\n\
13505With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013506With optional end, stop comparing S at that position.\n\
13507suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013508
13509static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013510unicode_endswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013511 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013512{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013513 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013514 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013515 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013516 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013517 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013518
Jesus Ceaac451502011-04-20 17:09:23 +020013519 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013520 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013521 if (PyTuple_Check(subobj)) {
13522 Py_ssize_t i;
13523 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013524 substring = PyTuple_GET_ITEM(subobj, i);
13525 if (!PyUnicode_Check(substring)) {
13526 PyErr_Format(PyExc_TypeError,
13527 "tuple for endswith must only contain str, "
Victor Stinner998b8062018-09-12 00:23:25 +020013528 "not %.100s",
13529 Py_TYPE(substring)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013530 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013531 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013532 result = tailmatch(self, substring, start, end, +1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013533 if (result == -1)
13534 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013535 if (result) {
13536 Py_RETURN_TRUE;
13537 }
13538 }
13539 Py_RETURN_FALSE;
13540 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013541 if (!PyUnicode_Check(subobj)) {
13542 PyErr_Format(PyExc_TypeError,
13543 "endswith first arg must be str or "
Victor Stinner998b8062018-09-12 00:23:25 +020013544 "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013545 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013546 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013547 result = tailmatch(self, subobj, start, end, +1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013548 if (result == -1)
13549 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013550 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013551}
13552
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070013553static inline void
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013554_PyUnicodeWriter_Update(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013555{
Victor Stinnereb36fda2015-10-03 01:55:51 +020013556 writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer);
13557 writer->data = PyUnicode_DATA(writer->buffer);
13558
13559 if (!writer->readonly) {
13560 writer->kind = PyUnicode_KIND(writer->buffer);
Victor Stinner8f674cc2013-04-17 23:02:17 +020013561 writer->size = PyUnicode_GET_LENGTH(writer->buffer);
Victor Stinnereb36fda2015-10-03 01:55:51 +020013562 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013563 else {
Victor Stinnereb36fda2015-10-03 01:55:51 +020013564 /* use a value smaller than PyUnicode_1BYTE_KIND() so
13565 _PyUnicodeWriter_PrepareKind() will copy the buffer. */
13566 writer->kind = PyUnicode_WCHAR_KIND;
13567 assert(writer->kind <= PyUnicode_1BYTE_KIND);
13568
Victor Stinner8f674cc2013-04-17 23:02:17 +020013569 /* Copy-on-write mode: set buffer size to 0 so
13570 * _PyUnicodeWriter_Prepare() will copy (and enlarge) the buffer on
13571 * next write. */
13572 writer->size = 0;
13573 }
Victor Stinner202fdca2012-05-07 12:47:02 +020013574}
13575
Victor Stinnerd3f08822012-05-29 12:57:52 +020013576void
Victor Stinner8f674cc2013-04-17 23:02:17 +020013577_PyUnicodeWriter_Init(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013578{
Victor Stinnerd3f08822012-05-29 12:57:52 +020013579 memset(writer, 0, sizeof(*writer));
Victor Stinnereb36fda2015-10-03 01:55:51 +020013580
13581 /* ASCII is the bare minimum */
Victor Stinner8f674cc2013-04-17 23:02:17 +020013582 writer->min_char = 127;
Victor Stinnereb36fda2015-10-03 01:55:51 +020013583
13584 /* use a value smaller than PyUnicode_1BYTE_KIND() so
13585 _PyUnicodeWriter_PrepareKind() will copy the buffer. */
13586 writer->kind = PyUnicode_WCHAR_KIND;
13587 assert(writer->kind <= PyUnicode_1BYTE_KIND);
Victor Stinner202fdca2012-05-07 12:47:02 +020013588}
13589
Inada Naoki770847a2019-06-24 12:30:24 +090013590// Initialize _PyUnicodeWriter with initial buffer
13591static inline void
13592_PyUnicodeWriter_InitWithBuffer(_PyUnicodeWriter *writer, PyObject *buffer)
13593{
13594 memset(writer, 0, sizeof(*writer));
13595 writer->buffer = buffer;
13596 _PyUnicodeWriter_Update(writer);
13597 writer->min_length = writer->size;
13598}
13599
Victor Stinnerd3f08822012-05-29 12:57:52 +020013600int
13601_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
13602 Py_ssize_t length, Py_UCS4 maxchar)
Victor Stinner202fdca2012-05-07 12:47:02 +020013603{
13604 Py_ssize_t newlen;
13605 PyObject *newbuffer;
13606
Victor Stinner2740e462016-09-06 16:58:36 -070013607 assert(maxchar <= MAX_UNICODE);
13608
Victor Stinnerca9381e2015-09-22 00:58:32 +020013609 /* ensure that the _PyUnicodeWriter_Prepare macro was used */
Victor Stinner61744742015-09-22 01:01:17 +020013610 assert((maxchar > writer->maxchar && length >= 0)
13611 || length > 0);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013612
Victor Stinner202fdca2012-05-07 12:47:02 +020013613 if (length > PY_SSIZE_T_MAX - writer->pos) {
13614 PyErr_NoMemory();
13615 return -1;
13616 }
13617 newlen = writer->pos + length;
13618
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070013619 maxchar = Py_MAX(maxchar, writer->min_char);
Victor Stinner8f674cc2013-04-17 23:02:17 +020013620
Victor Stinnerd3f08822012-05-29 12:57:52 +020013621 if (writer->buffer == NULL) {
Victor Stinner8f674cc2013-04-17 23:02:17 +020013622 assert(!writer->readonly);
Victor Stinner6989ba02013-11-18 21:08:39 +010013623 if (writer->overallocate
13624 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13625 /* overallocate to limit the number of realloc() */
13626 newlen += newlen / OVERALLOCATE_FACTOR;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013627 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013628 if (newlen < writer->min_length)
13629 newlen = writer->min_length;
13630
Victor Stinnerd3f08822012-05-29 12:57:52 +020013631 writer->buffer = PyUnicode_New(newlen, maxchar);
13632 if (writer->buffer == NULL)
13633 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013634 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013635 else if (newlen > writer->size) {
Victor Stinner6989ba02013-11-18 21:08:39 +010013636 if (writer->overallocate
13637 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13638 /* overallocate to limit the number of realloc() */
13639 newlen += newlen / OVERALLOCATE_FACTOR;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013640 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013641 if (newlen < writer->min_length)
13642 newlen = writer->min_length;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013643
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013644 if (maxchar > writer->maxchar || writer->readonly) {
Victor Stinner202fdca2012-05-07 12:47:02 +020013645 /* resize + widen */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +030013646 maxchar = Py_MAX(maxchar, writer->maxchar);
Victor Stinner202fdca2012-05-07 12:47:02 +020013647 newbuffer = PyUnicode_New(newlen, maxchar);
13648 if (newbuffer == NULL)
13649 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013650 _PyUnicode_FastCopyCharacters(newbuffer, 0,
13651 writer->buffer, 0, writer->pos);
Victor Stinner202fdca2012-05-07 12:47:02 +020013652 Py_DECREF(writer->buffer);
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013653 writer->readonly = 0;
Victor Stinner202fdca2012-05-07 12:47:02 +020013654 }
13655 else {
13656 newbuffer = resize_compact(writer->buffer, newlen);
13657 if (newbuffer == NULL)
13658 return -1;
13659 }
13660 writer->buffer = newbuffer;
Victor Stinner202fdca2012-05-07 12:47:02 +020013661 }
13662 else if (maxchar > writer->maxchar) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013663 assert(!writer->readonly);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013664 newbuffer = PyUnicode_New(writer->size, maxchar);
13665 if (newbuffer == NULL)
Victor Stinner202fdca2012-05-07 12:47:02 +020013666 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013667 _PyUnicode_FastCopyCharacters(newbuffer, 0,
13668 writer->buffer, 0, writer->pos);
Serhiy Storchaka57a01d32016-04-10 18:05:40 +030013669 Py_SETREF(writer->buffer, newbuffer);
Victor Stinner202fdca2012-05-07 12:47:02 +020013670 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013671 _PyUnicodeWriter_Update(writer);
Victor Stinner202fdca2012-05-07 12:47:02 +020013672 return 0;
Victor Stinner6989ba02013-11-18 21:08:39 +010013673
13674#undef OVERALLOCATE_FACTOR
Victor Stinner202fdca2012-05-07 12:47:02 +020013675}
13676
Victor Stinnerca9381e2015-09-22 00:58:32 +020013677int
13678_PyUnicodeWriter_PrepareKindInternal(_PyUnicodeWriter *writer,
13679 enum PyUnicode_Kind kind)
13680{
13681 Py_UCS4 maxchar;
13682
13683 /* ensure that the _PyUnicodeWriter_PrepareKind macro was used */
13684 assert(writer->kind < kind);
13685
13686 switch (kind)
13687 {
13688 case PyUnicode_1BYTE_KIND: maxchar = 0xff; break;
13689 case PyUnicode_2BYTE_KIND: maxchar = 0xffff; break;
13690 case PyUnicode_4BYTE_KIND: maxchar = 0x10ffff; break;
13691 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070013692 Py_UNREACHABLE();
Victor Stinnerca9381e2015-09-22 00:58:32 +020013693 }
13694
13695 return _PyUnicodeWriter_PrepareInternal(writer, 0, maxchar);
13696}
13697
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070013698static inline int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020013699_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch)
Victor Stinnera0dd0212013-04-11 22:09:04 +020013700{
Victor Stinner2740e462016-09-06 16:58:36 -070013701 assert(ch <= MAX_UNICODE);
Victor Stinnera0dd0212013-04-11 22:09:04 +020013702 if (_PyUnicodeWriter_Prepare(writer, 1, ch) < 0)
13703 return -1;
13704 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, ch);
13705 writer->pos++;
13706 return 0;
13707}
13708
13709int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020013710_PyUnicodeWriter_WriteChar(_PyUnicodeWriter *writer, Py_UCS4 ch)
13711{
13712 return _PyUnicodeWriter_WriteCharInline(writer, ch);
13713}
13714
13715int
Victor Stinnerd3f08822012-05-29 12:57:52 +020013716_PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str)
13717{
13718 Py_UCS4 maxchar;
13719 Py_ssize_t len;
13720
13721 if (PyUnicode_READY(str) == -1)
13722 return -1;
13723 len = PyUnicode_GET_LENGTH(str);
13724 if (len == 0)
13725 return 0;
13726 maxchar = PyUnicode_MAX_CHAR_VALUE(str);
13727 if (maxchar > writer->maxchar || len > writer->size - writer->pos) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013728 if (writer->buffer == NULL && !writer->overallocate) {
Victor Stinner1912b392015-03-26 09:37:23 +010013729 assert(_PyUnicode_CheckConsistency(str, 1));
Victor Stinner8f674cc2013-04-17 23:02:17 +020013730 writer->readonly = 1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013731 Py_INCREF(str);
13732 writer->buffer = str;
13733 _PyUnicodeWriter_Update(writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013734 writer->pos += len;
13735 return 0;
13736 }
13737 if (_PyUnicodeWriter_PrepareInternal(writer, len, maxchar) == -1)
13738 return -1;
13739 }
13740 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13741 str, 0, len);
13742 writer->pos += len;
13743 return 0;
13744}
13745
Victor Stinnere215d962012-10-06 23:03:36 +020013746int
Victor Stinnercfc4c132013-04-03 01:48:39 +020013747_PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter *writer, PyObject *str,
13748 Py_ssize_t start, Py_ssize_t end)
13749{
13750 Py_UCS4 maxchar;
13751 Py_ssize_t len;
13752
13753 if (PyUnicode_READY(str) == -1)
13754 return -1;
13755
13756 assert(0 <= start);
13757 assert(end <= PyUnicode_GET_LENGTH(str));
13758 assert(start <= end);
13759
13760 if (end == 0)
13761 return 0;
13762
13763 if (start == 0 && end == PyUnicode_GET_LENGTH(str))
13764 return _PyUnicodeWriter_WriteStr(writer, str);
13765
13766 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
13767 maxchar = _PyUnicode_FindMaxChar(str, start, end);
13768 else
13769 maxchar = writer->maxchar;
13770 len = end - start;
13771
13772 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) < 0)
13773 return -1;
13774
13775 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13776 str, start, len);
13777 writer->pos += len;
13778 return 0;
13779}
13780
13781int
Victor Stinner4a587072013-11-19 12:54:53 +010013782_PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter *writer,
13783 const char *ascii, Py_ssize_t len)
13784{
13785 if (len == -1)
13786 len = strlen(ascii);
13787
13788 assert(ucs1lib_find_max_char((Py_UCS1*)ascii, (Py_UCS1*)ascii + len) < 128);
13789
13790 if (writer->buffer == NULL && !writer->overallocate) {
13791 PyObject *str;
13792
13793 str = _PyUnicode_FromASCII(ascii, len);
13794 if (str == NULL)
13795 return -1;
13796
13797 writer->readonly = 1;
13798 writer->buffer = str;
13799 _PyUnicodeWriter_Update(writer);
13800 writer->pos += len;
13801 return 0;
13802 }
13803
13804 if (_PyUnicodeWriter_Prepare(writer, len, 127) == -1)
13805 return -1;
13806
13807 switch (writer->kind)
13808 {
13809 case PyUnicode_1BYTE_KIND:
13810 {
13811 const Py_UCS1 *str = (const Py_UCS1 *)ascii;
13812 Py_UCS1 *data = writer->data;
13813
Christian Heimesf051e432016-09-13 20:22:02 +020013814 memcpy(data + writer->pos, str, len);
Victor Stinner4a587072013-11-19 12:54:53 +010013815 break;
13816 }
13817 case PyUnicode_2BYTE_KIND:
13818 {
13819 _PyUnicode_CONVERT_BYTES(
13820 Py_UCS1, Py_UCS2,
13821 ascii, ascii + len,
13822 (Py_UCS2 *)writer->data + writer->pos);
13823 break;
13824 }
13825 case PyUnicode_4BYTE_KIND:
13826 {
13827 _PyUnicode_CONVERT_BYTES(
13828 Py_UCS1, Py_UCS4,
13829 ascii, ascii + len,
13830 (Py_UCS4 *)writer->data + writer->pos);
13831 break;
13832 }
13833 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070013834 Py_UNREACHABLE();
Victor Stinner4a587072013-11-19 12:54:53 +010013835 }
13836
13837 writer->pos += len;
13838 return 0;
13839}
13840
13841int
13842_PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter *writer,
13843 const char *str, Py_ssize_t len)
Victor Stinnere215d962012-10-06 23:03:36 +020013844{
13845 Py_UCS4 maxchar;
13846
13847 maxchar = ucs1lib_find_max_char((Py_UCS1*)str, (Py_UCS1*)str + len);
13848 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) == -1)
13849 return -1;
13850 unicode_write_cstr(writer->buffer, writer->pos, str, len);
13851 writer->pos += len;
13852 return 0;
13853}
13854
Victor Stinnerd3f08822012-05-29 12:57:52 +020013855PyObject *
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013856_PyUnicodeWriter_Finish(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013857{
Victor Stinner15a0bd32013-07-08 22:29:55 +020013858 PyObject *str;
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030013859
Victor Stinnerd3f08822012-05-29 12:57:52 +020013860 if (writer->pos == 0) {
Victor Stinner9e6b4d72013-07-09 00:37:24 +020013861 Py_CLEAR(writer->buffer);
Serhiy Storchaka678db842013-01-26 12:16:36 +020013862 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3f08822012-05-29 12:57:52 +020013863 }
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030013864
13865 str = writer->buffer;
13866 writer->buffer = NULL;
13867
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013868 if (writer->readonly) {
Victor Stinner9e6b4d72013-07-09 00:37:24 +020013869 assert(PyUnicode_GET_LENGTH(str) == writer->pos);
13870 return str;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013871 }
Victor Stinner6c2cdae2015-10-12 13:29:43 +020013872
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030013873 if (PyUnicode_GET_LENGTH(str) != writer->pos) {
13874 PyObject *str2;
13875 str2 = resize_compact(str, writer->pos);
13876 if (str2 == NULL) {
13877 Py_DECREF(str);
13878 return NULL;
Victor Stinner6c2cdae2015-10-12 13:29:43 +020013879 }
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030013880 str = str2;
Victor Stinner6c2cdae2015-10-12 13:29:43 +020013881 }
13882
Victor Stinner15a0bd32013-07-08 22:29:55 +020013883 assert(_PyUnicode_CheckConsistency(str, 1));
13884 return unicode_result_ready(str);
Victor Stinner202fdca2012-05-07 12:47:02 +020013885}
13886
Victor Stinnerd3f08822012-05-29 12:57:52 +020013887void
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013888_PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013889{
13890 Py_CLEAR(writer->buffer);
13891}
13892
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013893#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000013894
13895PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013896 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000013897\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013898Return a formatted version of S, using substitutions from args and kwargs.\n\
13899The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000013900
Eric Smith27bbca62010-11-04 17:06:58 +000013901PyDoc_STRVAR(format_map__doc__,
13902 "S.format_map(mapping) -> str\n\
13903\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013904Return a formatted version of S, using substitutions from mapping.\n\
13905The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000013906
INADA Naoki3ae20562017-01-16 20:41:20 +090013907/*[clinic input]
13908str.__format__ as unicode___format__
13909
13910 format_spec: unicode
13911 /
13912
13913Return a formatted version of the string as described by format_spec.
13914[clinic start generated code]*/
13915
Eric Smith4a7d76d2008-05-30 18:10:19 +000013916static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090013917unicode___format___impl(PyObject *self, PyObject *format_spec)
INADA Naoki15f94592017-01-16 21:49:13 +090013918/*[clinic end generated code: output=45fceaca6d2ba4c8 input=5e135645d167a214]*/
Eric Smith4a7d76d2008-05-30 18:10:19 +000013919{
Victor Stinnerd3f08822012-05-29 12:57:52 +020013920 _PyUnicodeWriter writer;
13921 int ret;
Eric Smith4a7d76d2008-05-30 18:10:19 +000013922
Victor Stinnerd3f08822012-05-29 12:57:52 +020013923 if (PyUnicode_READY(self) == -1)
13924 return NULL;
Victor Stinner8f674cc2013-04-17 23:02:17 +020013925 _PyUnicodeWriter_Init(&writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013926 ret = _PyUnicode_FormatAdvancedWriter(&writer,
13927 self, format_spec, 0,
13928 PyUnicode_GET_LENGTH(format_spec));
13929 if (ret == -1) {
13930 _PyUnicodeWriter_Dealloc(&writer);
13931 return NULL;
13932 }
13933 return _PyUnicodeWriter_Finish(&writer);
Eric Smith4a7d76d2008-05-30 18:10:19 +000013934}
13935
INADA Naoki3ae20562017-01-16 20:41:20 +090013936/*[clinic input]
13937str.__sizeof__ as unicode_sizeof
13938
13939Return the size of the string in memory, in bytes.
13940[clinic start generated code]*/
Eric Smith8c663262007-08-25 02:26:07 +000013941
13942static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090013943unicode_sizeof_impl(PyObject *self)
13944/*[clinic end generated code: output=6dbc2f5a408b6d4f input=6dd011c108e33fb0]*/
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013945{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013946 Py_ssize_t size;
13947
13948 /* If it's a compact object, account for base structure +
13949 character data. */
INADA Naoki3ae20562017-01-16 20:41:20 +090013950 if (PyUnicode_IS_COMPACT_ASCII(self))
13951 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(self) + 1;
13952 else if (PyUnicode_IS_COMPACT(self))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013953 size = sizeof(PyCompactUnicodeObject) +
INADA Naoki3ae20562017-01-16 20:41:20 +090013954 (PyUnicode_GET_LENGTH(self) + 1) * PyUnicode_KIND(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013955 else {
13956 /* If it is a two-block object, account for base object, and
13957 for character block if present. */
13958 size = sizeof(PyUnicodeObject);
INADA Naoki3ae20562017-01-16 20:41:20 +090013959 if (_PyUnicode_DATA_ANY(self))
13960 size += (PyUnicode_GET_LENGTH(self) + 1) *
13961 PyUnicode_KIND(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013962 }
13963 /* If the wstr pointer is present, account for it unless it is shared
Victor Stinnera3be6132011-10-03 02:16:37 +020013964 with the data pointer. Check if the data is not shared. */
INADA Naoki3ae20562017-01-16 20:41:20 +090013965 if (_PyUnicode_HAS_WSTR_MEMORY(self))
13966 size += (PyUnicode_WSTR_LENGTH(self) + 1) * sizeof(wchar_t);
13967 if (_PyUnicode_HAS_UTF8_MEMORY(self))
13968 size += PyUnicode_UTF8_LENGTH(self) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013969
13970 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013971}
13972
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013973static PyObject *
Siddhesh Poyarekar55edd0c2018-04-30 00:29:33 +053013974unicode_getnewargs(PyObject *v, PyObject *Py_UNUSED(ignored))
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013975{
Victor Stinnerbf6e5602011-12-12 01:53:47 +010013976 PyObject *copy = _PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013977 if (!copy)
13978 return NULL;
13979 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013980}
13981
Guido van Rossumd57fd912000-03-10 22:53:23 +000013982static PyMethodDef unicode_methods[] = {
INADA Naoki3ae20562017-01-16 20:41:20 +090013983 UNICODE_ENCODE_METHODDEF
13984 UNICODE_REPLACE_METHODDEF
13985 UNICODE_SPLIT_METHODDEF
13986 UNICODE_RSPLIT_METHODDEF
13987 UNICODE_JOIN_METHODDEF
13988 UNICODE_CAPITALIZE_METHODDEF
13989 UNICODE_CASEFOLD_METHODDEF
13990 UNICODE_TITLE_METHODDEF
13991 UNICODE_CENTER_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013992 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090013993 UNICODE_EXPANDTABS_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013994 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090013995 UNICODE_PARTITION_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013996 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090013997 UNICODE_LJUST_METHODDEF
13998 UNICODE_LOWER_METHODDEF
13999 UNICODE_LSTRIP_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000014000 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
14001 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090014002 UNICODE_RJUST_METHODDEF
14003 UNICODE_RSTRIP_METHODDEF
14004 UNICODE_RPARTITION_METHODDEF
14005 UNICODE_SPLITLINES_METHODDEF
14006 UNICODE_STRIP_METHODDEF
14007 UNICODE_SWAPCASE_METHODDEF
14008 UNICODE_TRANSLATE_METHODDEF
14009 UNICODE_UPPER_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000014010 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
14011 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
INADA Naokia49ac992018-01-27 14:06:21 +090014012 UNICODE_ISASCII_METHODDEF
INADA Naoki3ae20562017-01-16 20:41:20 +090014013 UNICODE_ISLOWER_METHODDEF
14014 UNICODE_ISUPPER_METHODDEF
14015 UNICODE_ISTITLE_METHODDEF
14016 UNICODE_ISSPACE_METHODDEF
14017 UNICODE_ISDECIMAL_METHODDEF
14018 UNICODE_ISDIGIT_METHODDEF
14019 UNICODE_ISNUMERIC_METHODDEF
14020 UNICODE_ISALPHA_METHODDEF
14021 UNICODE_ISALNUM_METHODDEF
14022 UNICODE_ISIDENTIFIER_METHODDEF
14023 UNICODE_ISPRINTABLE_METHODDEF
14024 UNICODE_ZFILL_METHODDEF
Serhiy Storchaka62be7422018-11-27 13:27:31 +020014025 {"format", (PyCFunction)(void(*)(void)) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +000014026 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090014027 UNICODE___FORMAT___METHODDEF
Larry Hastings31826802013-10-19 00:09:25 -070014028 UNICODE_MAKETRANS_METHODDEF
INADA Naoki3ae20562017-01-16 20:41:20 +090014029 UNICODE_SIZEOF_METHODDEF
Walter Dörwald068325e2002-04-15 13:36:47 +000014030#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000014031 /* These methods are just used for debugging the implementation. */
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000014032 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000014033#endif
14034
Siddhesh Poyarekar55edd0c2018-04-30 00:29:33 +053014035 {"__getnewargs__", unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000014036 {NULL, NULL}
14037};
14038
Neil Schemenauerce30bc92002-11-18 16:10:18 +000014039static PyObject *
14040unicode_mod(PyObject *v, PyObject *w)
14041{
Brian Curtindfc80e32011-08-10 20:28:54 -050014042 if (!PyUnicode_Check(v))
14043 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000014044 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000014045}
14046
14047static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014048 0, /*nb_add*/
14049 0, /*nb_subtract*/
14050 0, /*nb_multiply*/
14051 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +000014052};
14053
Guido van Rossumd57fd912000-03-10 22:53:23 +000014054static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014055 (lenfunc) unicode_length, /* sq_length */
14056 PyUnicode_Concat, /* sq_concat */
14057 (ssizeargfunc) unicode_repeat, /* sq_repeat */
14058 (ssizeargfunc) unicode_getitem, /* sq_item */
14059 0, /* sq_slice */
14060 0, /* sq_ass_item */
14061 0, /* sq_ass_slice */
14062 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +000014063};
14064
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014065static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014066unicode_subscript(PyObject* self, PyObject* item)
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014067{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014068 if (PyUnicode_READY(self) == -1)
14069 return NULL;
14070
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000014071 if (PyIndex_Check(item)) {
14072 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014073 if (i == -1 && PyErr_Occurred())
14074 return NULL;
14075 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014076 i += PyUnicode_GET_LENGTH(self);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014077 return unicode_getitem(self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014078 } else if (PySlice_Check(item)) {
Zackery Spytz14514d92019-05-17 01:13:03 -060014079 Py_ssize_t start, stop, step, slicelength, i;
14080 size_t cur;
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014081 PyObject *result;
14082 void *src_data, *dest_data;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014083 int src_kind, dest_kind;
Victor Stinnerc80d6d22011-10-05 14:13:28 +020014084 Py_UCS4 ch, max_char, kind_limit;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014085
Serhiy Storchakab879fe82017-04-08 09:53:51 +030014086 if (PySlice_Unpack(item, &start, &stop, &step) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014087 return NULL;
14088 }
Serhiy Storchakab879fe82017-04-08 09:53:51 +030014089 slicelength = PySlice_AdjustIndices(PyUnicode_GET_LENGTH(self),
14090 &start, &stop, step);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014091
14092 if (slicelength <= 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020014093 _Py_RETURN_UNICODE_EMPTY();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014094 } else if (start == 0 && step == 1 &&
Victor Stinnerc4b49542011-12-11 22:44:26 +010014095 slicelength == PyUnicode_GET_LENGTH(self)) {
14096 return unicode_result_unchanged(self);
Thomas Woutersed03b412007-08-28 21:37:11 +000014097 } else if (step == 1) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014098 return PyUnicode_Substring(self,
Victor Stinner12bab6d2011-10-01 01:53:49 +020014099 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014100 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014101 /* General case */
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014102 src_kind = PyUnicode_KIND(self);
14103 src_data = PyUnicode_DATA(self);
Victor Stinner55c99112011-10-13 01:17:06 +020014104 if (!PyUnicode_IS_ASCII(self)) {
14105 kind_limit = kind_maxchar_limit(src_kind);
14106 max_char = 0;
14107 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
14108 ch = PyUnicode_READ(src_kind, src_data, cur);
14109 if (ch > max_char) {
14110 max_char = ch;
14111 if (max_char >= kind_limit)
14112 break;
14113 }
Victor Stinnerc80d6d22011-10-05 14:13:28 +020014114 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014115 }
Victor Stinner55c99112011-10-13 01:17:06 +020014116 else
14117 max_char = 127;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014118 result = PyUnicode_New(slicelength, max_char);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014119 if (result == NULL)
14120 return NULL;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014121 dest_kind = PyUnicode_KIND(result);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014122 dest_data = PyUnicode_DATA(result);
14123
14124 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014125 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
14126 PyUnicode_WRITE(dest_kind, dest_data, i, ch);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014127 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020014128 assert(_PyUnicode_CheckConsistency(result, 1));
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014129 return result;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014130 } else {
14131 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
14132 return NULL;
14133 }
14134}
14135
14136static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014137 (lenfunc)unicode_length, /* mp_length */
14138 (binaryfunc)unicode_subscript, /* mp_subscript */
14139 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014140};
14141
Guido van Rossumd57fd912000-03-10 22:53:23 +000014142
Guido van Rossumd57fd912000-03-10 22:53:23 +000014143/* Helpers for PyUnicode_Format() */
14144
Victor Stinnera47082312012-10-04 02:19:54 +020014145struct unicode_formatter_t {
14146 PyObject *args;
14147 int args_owned;
14148 Py_ssize_t arglen, argidx;
14149 PyObject *dict;
14150
14151 enum PyUnicode_Kind fmtkind;
14152 Py_ssize_t fmtcnt, fmtpos;
14153 void *fmtdata;
14154 PyObject *fmtstr;
14155
14156 _PyUnicodeWriter writer;
14157};
14158
14159struct unicode_format_arg_t {
14160 Py_UCS4 ch;
14161 int flags;
14162 Py_ssize_t width;
14163 int prec;
14164 int sign;
14165};
14166
Guido van Rossumd57fd912000-03-10 22:53:23 +000014167static PyObject *
Victor Stinnera47082312012-10-04 02:19:54 +020014168unicode_format_getnextarg(struct unicode_formatter_t *ctx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014169{
Victor Stinnera47082312012-10-04 02:19:54 +020014170 Py_ssize_t argidx = ctx->argidx;
14171
14172 if (argidx < ctx->arglen) {
14173 ctx->argidx++;
14174 if (ctx->arglen < 0)
14175 return ctx->args;
Benjamin Peterson29060642009-01-31 22:14:21 +000014176 else
Victor Stinnera47082312012-10-04 02:19:54 +020014177 return PyTuple_GetItem(ctx->args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014178 }
14179 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000014180 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000014181 return NULL;
14182}
14183
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014184/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000014185
Victor Stinnera47082312012-10-04 02:19:54 +020014186/* Format a float into the writer if the writer is not NULL, or into *p_output
14187 otherwise.
14188
14189 Return 0 on success, raise an exception and return -1 on error. */
Victor Stinnerd3f08822012-05-29 12:57:52 +020014190static int
Victor Stinnera47082312012-10-04 02:19:54 +020014191formatfloat(PyObject *v, struct unicode_format_arg_t *arg,
14192 PyObject **p_output,
14193 _PyUnicodeWriter *writer)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014194{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014195 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014196 double x;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014197 Py_ssize_t len;
Victor Stinnera47082312012-10-04 02:19:54 +020014198 int prec;
14199 int dtoa_flags;
Tim Petersced69f82003-09-16 20:30:58 +000014200
Guido van Rossumd57fd912000-03-10 22:53:23 +000014201 x = PyFloat_AsDouble(v);
14202 if (x == -1.0 && PyErr_Occurred())
Victor Stinnerd3f08822012-05-29 12:57:52 +020014203 return -1;
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014204
Victor Stinnera47082312012-10-04 02:19:54 +020014205 prec = arg->prec;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014206 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000014207 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000014208
Victor Stinnera47082312012-10-04 02:19:54 +020014209 if (arg->flags & F_ALT)
14210 dtoa_flags = Py_DTSF_ALT;
14211 else
14212 dtoa_flags = 0;
14213 p = PyOS_double_to_string(x, arg->ch, prec, dtoa_flags, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014214 if (p == NULL)
Victor Stinnerd3f08822012-05-29 12:57:52 +020014215 return -1;
14216 len = strlen(p);
14217 if (writer) {
Victor Stinner4a587072013-11-19 12:54:53 +010014218 if (_PyUnicodeWriter_WriteASCIIString(writer, p, len) < 0) {
Christian Heimesf4f99392012-09-10 11:48:41 +020014219 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020014220 return -1;
Christian Heimesf4f99392012-09-10 11:48:41 +020014221 }
Victor Stinnerd3f08822012-05-29 12:57:52 +020014222 }
14223 else
14224 *p_output = _PyUnicode_FromASCII(p, len);
Eric Smith0923d1d2009-04-16 20:16:10 +000014225 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020014226 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014227}
14228
/* _PyUnicode_FormatLong() emulates the format codes d, i, u, o, x and X,
 * and the F_ALT flag, for Python's long (unbounded) ints.  It's not used
 * for Python's regular ints.
 * Return value:  a new PyUnicodeObject*, or NULL if error.
 *     The output string is of the form
 *         "-"? ("0x" | "0X")? digit+
 *     "0x"/"0X" are present only for x and X conversions, with alt
 *         nonzero.  The case of hex digits will be correct.
 *     There will be at least prec digits, zero-filled on the left if
 *         necessary to get that many.
 * val          object to be converted
 * alt          nonzero if the F_ALT flag was given (keep the base marker)
 * prec         minimum number of digits; 0-fill on left if needed
 * type         a character in [diuoxX]; i and u act the same as d
 *
 * CAUTION:  o, x and X conversions on regular ints can never
 * produce a '-' sign, but can for Python's unbounded ints.
 */
Ethan Furmanb95b5612015-01-23 20:05:18 -080014247PyObject *
14248_PyUnicode_FormatLong(PyObject *val, int alt, int prec, int type)
Tim Peters38fd5b62000-09-21 05:43:11 +000014249{
Victor Stinnerd0880d52012-04-27 23:40:13 +020014250 PyObject *result = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014251 char *buf;
Victor Stinnerd0880d52012-04-27 23:40:13 +020014252 Py_ssize_t i;
14253 int sign; /* 1 if '-', else 0 */
14254 int len; /* number of characters */
14255 Py_ssize_t llen;
14256 int numdigits; /* len == numnondigits + numdigits */
14257 int numnondigits = 0;
Tim Peters38fd5b62000-09-21 05:43:11 +000014258
Victor Stinnerd0880d52012-04-27 23:40:13 +020014259 /* Avoid exceeding SSIZE_T_MAX */
14260 if (prec > INT_MAX-3) {
14261 PyErr_SetString(PyExc_OverflowError,
14262 "precision too large");
Benjamin Peterson14339b62009-01-31 16:36:08 +000014263 return NULL;
Victor Stinnerd0880d52012-04-27 23:40:13 +020014264 }
14265
14266 assert(PyLong_Check(val));
14267
14268 switch (type) {
Victor Stinner621ef3d2012-10-02 00:33:47 +020014269 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070014270 Py_UNREACHABLE();
Victor Stinnerd0880d52012-04-27 23:40:13 +020014271 case 'd':
Victor Stinner621ef3d2012-10-02 00:33:47 +020014272 case 'i':
Victor Stinnerd0880d52012-04-27 23:40:13 +020014273 case 'u':
Ethan Furmanfb137212013-08-31 10:18:55 -070014274 /* int and int subclasses should print numerically when a numeric */
14275 /* format code is used (see issue18780) */
14276 result = PyNumber_ToBase(val, 10);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014277 break;
14278 case 'o':
14279 numnondigits = 2;
14280 result = PyNumber_ToBase(val, 8);
14281 break;
14282 case 'x':
14283 case 'X':
14284 numnondigits = 2;
14285 result = PyNumber_ToBase(val, 16);
14286 break;
Victor Stinnerd0880d52012-04-27 23:40:13 +020014287 }
14288 if (!result)
14289 return NULL;
14290
14291 assert(unicode_modifiable(result));
14292 assert(PyUnicode_IS_READY(result));
14293 assert(PyUnicode_IS_ASCII(result));
14294
14295 /* To modify the string in-place, there can only be one reference. */
14296 if (Py_REFCNT(result) != 1) {
Christian Heimesd47802e2013-06-29 21:33:36 +020014297 Py_DECREF(result);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014298 PyErr_BadInternalCall();
14299 return NULL;
14300 }
14301 buf = PyUnicode_DATA(result);
14302 llen = PyUnicode_GET_LENGTH(result);
14303 if (llen > INT_MAX) {
Christian Heimesd47802e2013-06-29 21:33:36 +020014304 Py_DECREF(result);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014305 PyErr_SetString(PyExc_ValueError,
Ethan Furmanb95b5612015-01-23 20:05:18 -080014306 "string too large in _PyUnicode_FormatLong");
Victor Stinnerd0880d52012-04-27 23:40:13 +020014307 return NULL;
14308 }
14309 len = (int)llen;
14310 sign = buf[0] == '-';
14311 numnondigits += sign;
14312 numdigits = len - numnondigits;
14313 assert(numdigits > 0);
14314
14315 /* Get rid of base marker unless F_ALT */
Ethan Furmanb95b5612015-01-23 20:05:18 -080014316 if (((alt) == 0 &&
Victor Stinnerd0880d52012-04-27 23:40:13 +020014317 (type == 'o' || type == 'x' || type == 'X'))) {
14318 assert(buf[sign] == '0');
14319 assert(buf[sign+1] == 'x' || buf[sign+1] == 'X' ||
14320 buf[sign+1] == 'o');
14321 numnondigits -= 2;
14322 buf += 2;
14323 len -= 2;
14324 if (sign)
14325 buf[0] = '-';
14326 assert(len == numnondigits + numdigits);
14327 assert(numdigits > 0);
14328 }
14329
14330 /* Fill with leading zeroes to meet minimum width. */
14331 if (prec > numdigits) {
14332 PyObject *r1 = PyBytes_FromStringAndSize(NULL,
14333 numnondigits + prec);
14334 char *b1;
14335 if (!r1) {
14336 Py_DECREF(result);
14337 return NULL;
14338 }
14339 b1 = PyBytes_AS_STRING(r1);
14340 for (i = 0; i < numnondigits; ++i)
14341 *b1++ = *buf++;
14342 for (i = 0; i < prec - numdigits; i++)
14343 *b1++ = '0';
14344 for (i = 0; i < numdigits; i++)
14345 *b1++ = *buf++;
14346 *b1 = '\0';
14347 Py_DECREF(result);
14348 result = r1;
14349 buf = PyBytes_AS_STRING(result);
14350 len = numnondigits + prec;
14351 }
14352
14353 /* Fix up case for hex conversions. */
14354 if (type == 'X') {
14355 /* Need to convert all lower case letters to upper case.
14356 and need to convert 0x to 0X (and -0x to -0X). */
14357 for (i = 0; i < len; i++)
14358 if (buf[i] >= 'a' && buf[i] <= 'x')
14359 buf[i] -= 'a'-'A';
14360 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014361 if (!PyUnicode_Check(result)
14362 || buf != PyUnicode_DATA(result)) {
Victor Stinnerd0880d52012-04-27 23:40:13 +020014363 PyObject *unicode;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014364 unicode = _PyUnicode_FromASCII(buf, len);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014365 Py_DECREF(result);
14366 result = unicode;
14367 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014368 else if (len != PyUnicode_GET_LENGTH(result)) {
14369 if (PyUnicode_Resize(&result, len) < 0)
14370 Py_CLEAR(result);
14371 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000014372 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000014373}
14374
Ethan Furmandf3ed242014-01-05 06:50:30 -080014375/* Format an integer or a float as an integer.
Victor Stinner621ef3d2012-10-02 00:33:47 +020014376 * Return 1 if the number has been formatted into the writer,
Victor Stinnera47082312012-10-04 02:19:54 +020014377 * 0 if the number has been formatted into *p_output
Victor Stinner621ef3d2012-10-02 00:33:47 +020014378 * -1 and raise an exception on error */
14379static int
Victor Stinnera47082312012-10-04 02:19:54 +020014380mainformatlong(PyObject *v,
14381 struct unicode_format_arg_t *arg,
14382 PyObject **p_output,
14383 _PyUnicodeWriter *writer)
Victor Stinner621ef3d2012-10-02 00:33:47 +020014384{
14385 PyObject *iobj, *res;
Victor Stinnera47082312012-10-04 02:19:54 +020014386 char type = (char)arg->ch;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014387
14388 if (!PyNumber_Check(v))
14389 goto wrongtype;
14390
Ethan Furman9ab74802014-03-21 06:38:46 -070014391 /* make sure number is a type of integer for o, x, and X */
Victor Stinner621ef3d2012-10-02 00:33:47 +020014392 if (!PyLong_Check(v)) {
Ethan Furmandf3ed242014-01-05 06:50:30 -080014393 if (type == 'o' || type == 'x' || type == 'X') {
14394 iobj = PyNumber_Index(v);
14395 if (iobj == NULL) {
Ethan Furman9ab74802014-03-21 06:38:46 -070014396 if (PyErr_ExceptionMatches(PyExc_TypeError))
14397 goto wrongtype;
Ethan Furman38d872e2014-03-19 08:38:52 -070014398 return -1;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014399 }
14400 }
14401 else {
14402 iobj = PyNumber_Long(v);
14403 if (iobj == NULL ) {
14404 if (PyErr_ExceptionMatches(PyExc_TypeError))
14405 goto wrongtype;
14406 return -1;
14407 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014408 }
14409 assert(PyLong_Check(iobj));
14410 }
14411 else {
14412 iobj = v;
14413 Py_INCREF(iobj);
14414 }
14415
14416 if (PyLong_CheckExact(v)
Victor Stinnera47082312012-10-04 02:19:54 +020014417 && arg->width == -1 && arg->prec == -1
14418 && !(arg->flags & (F_SIGN | F_BLANK))
14419 && type != 'X')
Victor Stinner621ef3d2012-10-02 00:33:47 +020014420 {
14421 /* Fast path */
Victor Stinnera47082312012-10-04 02:19:54 +020014422 int alternate = arg->flags & F_ALT;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014423 int base;
14424
Victor Stinnera47082312012-10-04 02:19:54 +020014425 switch(type)
Victor Stinner621ef3d2012-10-02 00:33:47 +020014426 {
14427 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070014428 Py_UNREACHABLE();
Victor Stinner621ef3d2012-10-02 00:33:47 +020014429 case 'd':
14430 case 'i':
14431 case 'u':
14432 base = 10;
14433 break;
14434 case 'o':
14435 base = 8;
14436 break;
14437 case 'x':
14438 case 'X':
14439 base = 16;
14440 break;
14441 }
14442
Victor Stinnerc89d28f2012-10-02 12:54:07 +020014443 if (_PyLong_FormatWriter(writer, v, base, alternate) == -1) {
14444 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014445 return -1;
Victor Stinnerc89d28f2012-10-02 12:54:07 +020014446 }
14447 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014448 return 1;
14449 }
14450
Ethan Furmanb95b5612015-01-23 20:05:18 -080014451 res = _PyUnicode_FormatLong(iobj, arg->flags & F_ALT, arg->prec, type);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014452 Py_DECREF(iobj);
14453 if (res == NULL)
14454 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014455 *p_output = res;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014456 return 0;
14457
14458wrongtype:
Ethan Furman9ab74802014-03-21 06:38:46 -070014459 switch(type)
14460 {
14461 case 'o':
14462 case 'x':
14463 case 'X':
14464 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020014465 "%%%c format: an integer is required, "
14466 "not %.200s",
14467 type, Py_TYPE(v)->tp_name);
Ethan Furman9ab74802014-03-21 06:38:46 -070014468 break;
14469 default:
14470 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020014471 "%%%c format: a number is required, "
14472 "not %.200s",
14473 type, Py_TYPE(v)->tp_name);
Ethan Furman9ab74802014-03-21 06:38:46 -070014474 break;
14475 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014476 return -1;
14477}
14478
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014479static Py_UCS4
14480formatchar(PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014481{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000014482 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000014483 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014484 if (PyUnicode_GET_LENGTH(v) == 1) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014485 return PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000014486 }
Benjamin Peterson29060642009-01-31 22:14:21 +000014487 goto onError;
14488 }
14489 else {
Ethan Furmandf3ed242014-01-05 06:50:30 -080014490 PyObject *iobj;
Benjamin Peterson29060642009-01-31 22:14:21 +000014491 long x;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014492 /* make sure number is a type of integer */
14493 if (!PyLong_Check(v)) {
14494 iobj = PyNumber_Index(v);
14495 if (iobj == NULL) {
Ethan Furman38d872e2014-03-19 08:38:52 -070014496 goto onError;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014497 }
Xiang Zhangea1cf872016-12-22 15:30:47 +080014498 x = PyLong_AsLong(iobj);
Ethan Furmandf3ed242014-01-05 06:50:30 -080014499 Py_DECREF(iobj);
14500 }
Xiang Zhangea1cf872016-12-22 15:30:47 +080014501 else {
14502 x = PyLong_AsLong(v);
14503 }
Benjamin Peterson29060642009-01-31 22:14:21 +000014504 if (x == -1 && PyErr_Occurred())
14505 goto onError;
14506
Victor Stinner8faf8212011-12-08 22:14:11 +010014507 if (x < 0 || x > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014508 PyErr_SetString(PyExc_OverflowError,
14509 "%c arg not in range(0x110000)");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014510 return (Py_UCS4) -1;
Benjamin Peterson29060642009-01-31 22:14:21 +000014511 }
14512
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014513 return (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014514 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000014515
Benjamin Peterson29060642009-01-31 22:14:21 +000014516 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000014517 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000014518 "%c requires int or char");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014519 return (Py_UCS4) -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014520}
14521
Victor Stinnera47082312012-10-04 02:19:54 +020014522/* Parse options of an argument: flags, width, precision.
14523 Handle also "%(name)" syntax.
14524
14525 Return 0 if the argument has been formatted into arg->str.
14526 Return 1 if the argument has been written into ctx->writer,
14527 Raise an exception and return -1 on error. */
14528static int
14529unicode_format_arg_parse(struct unicode_formatter_t *ctx,
14530 struct unicode_format_arg_t *arg)
14531{
14532#define FORMAT_READ(ctx) \
14533 PyUnicode_READ((ctx)->fmtkind, (ctx)->fmtdata, (ctx)->fmtpos)
14534
14535 PyObject *v;
14536
Victor Stinnera47082312012-10-04 02:19:54 +020014537 if (arg->ch == '(') {
14538 /* Get argument value from a dictionary. Example: "%(name)s". */
14539 Py_ssize_t keystart;
14540 Py_ssize_t keylen;
14541 PyObject *key;
14542 int pcount = 1;
14543
14544 if (ctx->dict == NULL) {
14545 PyErr_SetString(PyExc_TypeError,
14546 "format requires a mapping");
14547 return -1;
14548 }
14549 ++ctx->fmtpos;
14550 --ctx->fmtcnt;
14551 keystart = ctx->fmtpos;
14552 /* Skip over balanced parentheses */
14553 while (pcount > 0 && --ctx->fmtcnt >= 0) {
14554 arg->ch = FORMAT_READ(ctx);
14555 if (arg->ch == ')')
14556 --pcount;
14557 else if (arg->ch == '(')
14558 ++pcount;
14559 ctx->fmtpos++;
14560 }
14561 keylen = ctx->fmtpos - keystart - 1;
14562 if (ctx->fmtcnt < 0 || pcount > 0) {
14563 PyErr_SetString(PyExc_ValueError,
14564 "incomplete format key");
14565 return -1;
14566 }
14567 key = PyUnicode_Substring(ctx->fmtstr,
14568 keystart, keystart + keylen);
14569 if (key == NULL)
14570 return -1;
14571 if (ctx->args_owned) {
Victor Stinnera47082312012-10-04 02:19:54 +020014572 ctx->args_owned = 0;
Serhiy Storchaka191321d2015-12-27 15:41:34 +020014573 Py_DECREF(ctx->args);
Victor Stinnera47082312012-10-04 02:19:54 +020014574 }
14575 ctx->args = PyObject_GetItem(ctx->dict, key);
14576 Py_DECREF(key);
14577 if (ctx->args == NULL)
14578 return -1;
14579 ctx->args_owned = 1;
14580 ctx->arglen = -1;
14581 ctx->argidx = -2;
14582 }
14583
14584 /* Parse flags. Example: "%+i" => flags=F_SIGN. */
Victor Stinnera47082312012-10-04 02:19:54 +020014585 while (--ctx->fmtcnt >= 0) {
14586 arg->ch = FORMAT_READ(ctx);
14587 ctx->fmtpos++;
14588 switch (arg->ch) {
14589 case '-': arg->flags |= F_LJUST; continue;
14590 case '+': arg->flags |= F_SIGN; continue;
14591 case ' ': arg->flags |= F_BLANK; continue;
14592 case '#': arg->flags |= F_ALT; continue;
14593 case '0': arg->flags |= F_ZERO; continue;
14594 }
14595 break;
14596 }
14597
14598 /* Parse width. Example: "%10s" => width=10 */
Victor Stinnera47082312012-10-04 02:19:54 +020014599 if (arg->ch == '*') {
14600 v = unicode_format_getnextarg(ctx);
14601 if (v == NULL)
14602 return -1;
14603 if (!PyLong_Check(v)) {
14604 PyErr_SetString(PyExc_TypeError,
14605 "* wants int");
14606 return -1;
14607 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020014608 arg->width = PyLong_AsSsize_t(v);
Victor Stinnera47082312012-10-04 02:19:54 +020014609 if (arg->width == -1 && PyErr_Occurred())
14610 return -1;
14611 if (arg->width < 0) {
14612 arg->flags |= F_LJUST;
14613 arg->width = -arg->width;
14614 }
14615 if (--ctx->fmtcnt >= 0) {
14616 arg->ch = FORMAT_READ(ctx);
14617 ctx->fmtpos++;
14618 }
14619 }
14620 else if (arg->ch >= '0' && arg->ch <= '9') {
14621 arg->width = arg->ch - '0';
14622 while (--ctx->fmtcnt >= 0) {
14623 arg->ch = FORMAT_READ(ctx);
14624 ctx->fmtpos++;
14625 if (arg->ch < '0' || arg->ch > '9')
14626 break;
14627 /* Since arg->ch is unsigned, the RHS would end up as unsigned,
14628 mixing signed and unsigned comparison. Since arg->ch is between
14629 '0' and '9', casting to int is safe. */
14630 if (arg->width > (PY_SSIZE_T_MAX - ((int)arg->ch - '0')) / 10) {
14631 PyErr_SetString(PyExc_ValueError,
14632 "width too big");
14633 return -1;
14634 }
14635 arg->width = arg->width*10 + (arg->ch - '0');
14636 }
14637 }
14638
14639 /* Parse precision. Example: "%.3f" => prec=3 */
Victor Stinnera47082312012-10-04 02:19:54 +020014640 if (arg->ch == '.') {
14641 arg->prec = 0;
14642 if (--ctx->fmtcnt >= 0) {
14643 arg->ch = FORMAT_READ(ctx);
14644 ctx->fmtpos++;
14645 }
14646 if (arg->ch == '*') {
14647 v = unicode_format_getnextarg(ctx);
14648 if (v == NULL)
14649 return -1;
14650 if (!PyLong_Check(v)) {
14651 PyErr_SetString(PyExc_TypeError,
14652 "* wants int");
14653 return -1;
14654 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020014655 arg->prec = _PyLong_AsInt(v);
Victor Stinnera47082312012-10-04 02:19:54 +020014656 if (arg->prec == -1 && PyErr_Occurred())
14657 return -1;
14658 if (arg->prec < 0)
14659 arg->prec = 0;
14660 if (--ctx->fmtcnt >= 0) {
14661 arg->ch = FORMAT_READ(ctx);
14662 ctx->fmtpos++;
14663 }
14664 }
14665 else if (arg->ch >= '0' && arg->ch <= '9') {
14666 arg->prec = arg->ch - '0';
14667 while (--ctx->fmtcnt >= 0) {
14668 arg->ch = FORMAT_READ(ctx);
14669 ctx->fmtpos++;
14670 if (arg->ch < '0' || arg->ch > '9')
14671 break;
14672 if (arg->prec > (INT_MAX - ((int)arg->ch - '0')) / 10) {
14673 PyErr_SetString(PyExc_ValueError,
Victor Stinner3921e902012-10-06 23:05:00 +020014674 "precision too big");
Victor Stinnera47082312012-10-04 02:19:54 +020014675 return -1;
14676 }
14677 arg->prec = arg->prec*10 + (arg->ch - '0');
14678 }
14679 }
14680 }
14681
14682 /* Ignore "h", "l" and "L" format prefix (ex: "%hi" or "%ls") */
14683 if (ctx->fmtcnt >= 0) {
14684 if (arg->ch == 'h' || arg->ch == 'l' || arg->ch == 'L') {
14685 if (--ctx->fmtcnt >= 0) {
14686 arg->ch = FORMAT_READ(ctx);
14687 ctx->fmtpos++;
14688 }
14689 }
14690 }
14691 if (ctx->fmtcnt < 0) {
14692 PyErr_SetString(PyExc_ValueError,
14693 "incomplete format");
14694 return -1;
14695 }
14696 return 0;
14697
14698#undef FORMAT_READ
14699}
14700
14701/* Format one argument. Supported conversion specifiers:
14702
14703 - "s", "r", "a": any type
Ethan Furmandf3ed242014-01-05 06:50:30 -080014704 - "i", "d", "u": int or float
14705 - "o", "x", "X": int
Victor Stinnera47082312012-10-04 02:19:54 +020014706 - "e", "E", "f", "F", "g", "G": float
14707 - "c": int or str (1 character)
14708
Victor Stinner8dbd4212012-12-04 09:30:24 +010014709 When possible, the output is written directly into the Unicode writer
14710 (ctx->writer). A string is created when padding is required.
14711
Victor Stinnera47082312012-10-04 02:19:54 +020014712 Return 0 if the argument has been formatted into *p_str,
14713 1 if the argument has been written into ctx->writer,
Victor Stinner8dbd4212012-12-04 09:30:24 +010014714 -1 on error. */
Victor Stinnera47082312012-10-04 02:19:54 +020014715static int
14716unicode_format_arg_format(struct unicode_formatter_t *ctx,
14717 struct unicode_format_arg_t *arg,
14718 PyObject **p_str)
14719{
14720 PyObject *v;
14721 _PyUnicodeWriter *writer = &ctx->writer;
14722
14723 if (ctx->fmtcnt == 0)
14724 ctx->writer.overallocate = 0;
14725
Victor Stinnera47082312012-10-04 02:19:54 +020014726 v = unicode_format_getnextarg(ctx);
14727 if (v == NULL)
14728 return -1;
14729
Victor Stinnera47082312012-10-04 02:19:54 +020014730
14731 switch (arg->ch) {
Victor Stinnera47082312012-10-04 02:19:54 +020014732 case 's':
14733 case 'r':
14734 case 'a':
14735 if (PyLong_CheckExact(v) && arg->width == -1 && arg->prec == -1) {
14736 /* Fast path */
14737 if (_PyLong_FormatWriter(writer, v, 10, arg->flags & F_ALT) == -1)
14738 return -1;
14739 return 1;
14740 }
14741
14742 if (PyUnicode_CheckExact(v) && arg->ch == 's') {
14743 *p_str = v;
14744 Py_INCREF(*p_str);
14745 }
14746 else {
14747 if (arg->ch == 's')
14748 *p_str = PyObject_Str(v);
14749 else if (arg->ch == 'r')
14750 *p_str = PyObject_Repr(v);
14751 else
14752 *p_str = PyObject_ASCII(v);
14753 }
14754 break;
14755
14756 case 'i':
14757 case 'd':
14758 case 'u':
14759 case 'o':
14760 case 'x':
14761 case 'X':
14762 {
14763 int ret = mainformatlong(v, arg, p_str, writer);
14764 if (ret != 0)
14765 return ret;
14766 arg->sign = 1;
14767 break;
14768 }
14769
14770 case 'e':
14771 case 'E':
14772 case 'f':
14773 case 'F':
14774 case 'g':
14775 case 'G':
14776 if (arg->width == -1 && arg->prec == -1
14777 && !(arg->flags & (F_SIGN | F_BLANK)))
14778 {
14779 /* Fast path */
14780 if (formatfloat(v, arg, NULL, writer) == -1)
14781 return -1;
14782 return 1;
14783 }
14784
14785 arg->sign = 1;
14786 if (formatfloat(v, arg, p_str, NULL) == -1)
14787 return -1;
14788 break;
14789
14790 case 'c':
14791 {
14792 Py_UCS4 ch = formatchar(v);
14793 if (ch == (Py_UCS4) -1)
14794 return -1;
14795 if (arg->width == -1 && arg->prec == -1) {
14796 /* Fast path */
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020014797 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0)
Victor Stinnera47082312012-10-04 02:19:54 +020014798 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014799 return 1;
14800 }
14801 *p_str = PyUnicode_FromOrdinal(ch);
14802 break;
14803 }
14804
14805 default:
14806 PyErr_Format(PyExc_ValueError,
14807 "unsupported format character '%c' (0x%x) "
Victor Stinnera33bce02014-07-04 22:47:46 +020014808 "at index %zd",
Victor Stinnera47082312012-10-04 02:19:54 +020014809 (31<=arg->ch && arg->ch<=126) ? (char)arg->ch : '?',
14810 (int)arg->ch,
14811 ctx->fmtpos - 1);
14812 return -1;
14813 }
14814 if (*p_str == NULL)
14815 return -1;
14816 assert (PyUnicode_Check(*p_str));
14817 return 0;
14818}
14819
14820static int
14821unicode_format_arg_output(struct unicode_formatter_t *ctx,
14822 struct unicode_format_arg_t *arg,
14823 PyObject *str)
14824{
14825 Py_ssize_t len;
14826 enum PyUnicode_Kind kind;
14827 void *pbuf;
14828 Py_ssize_t pindex;
14829 Py_UCS4 signchar;
14830 Py_ssize_t buflen;
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014831 Py_UCS4 maxchar;
Victor Stinnera47082312012-10-04 02:19:54 +020014832 Py_ssize_t sublen;
14833 _PyUnicodeWriter *writer = &ctx->writer;
14834 Py_UCS4 fill;
14835
14836 fill = ' ';
14837 if (arg->sign && arg->flags & F_ZERO)
14838 fill = '0';
14839
14840 if (PyUnicode_READY(str) == -1)
14841 return -1;
14842
14843 len = PyUnicode_GET_LENGTH(str);
14844 if ((arg->width == -1 || arg->width <= len)
14845 && (arg->prec == -1 || arg->prec >= len)
14846 && !(arg->flags & (F_SIGN | F_BLANK)))
14847 {
14848 /* Fast path */
14849 if (_PyUnicodeWriter_WriteStr(writer, str) == -1)
14850 return -1;
14851 return 0;
14852 }
14853
14854 /* Truncate the string for "s", "r" and "a" formats
14855 if the precision is set */
14856 if (arg->ch == 's' || arg->ch == 'r' || arg->ch == 'a') {
14857 if (arg->prec >= 0 && len > arg->prec)
14858 len = arg->prec;
14859 }
14860
14861 /* Adjust sign and width */
14862 kind = PyUnicode_KIND(str);
14863 pbuf = PyUnicode_DATA(str);
14864 pindex = 0;
14865 signchar = '\0';
14866 if (arg->sign) {
14867 Py_UCS4 ch = PyUnicode_READ(kind, pbuf, pindex);
14868 if (ch == '-' || ch == '+') {
14869 signchar = ch;
14870 len--;
14871 pindex++;
14872 }
14873 else if (arg->flags & F_SIGN)
14874 signchar = '+';
14875 else if (arg->flags & F_BLANK)
14876 signchar = ' ';
14877 else
14878 arg->sign = 0;
14879 }
14880 if (arg->width < len)
14881 arg->width = len;
14882
14883 /* Prepare the writer */
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014884 maxchar = writer->maxchar;
Victor Stinnera47082312012-10-04 02:19:54 +020014885 if (!(arg->flags & F_LJUST)) {
14886 if (arg->sign) {
14887 if ((arg->width-1) > len)
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014888 maxchar = Py_MAX(maxchar, fill);
Victor Stinnera47082312012-10-04 02:19:54 +020014889 }
14890 else {
14891 if (arg->width > len)
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014892 maxchar = Py_MAX(maxchar, fill);
Victor Stinnera47082312012-10-04 02:19:54 +020014893 }
14894 }
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014895 if (PyUnicode_MAX_CHAR_VALUE(str) > maxchar) {
14896 Py_UCS4 strmaxchar = _PyUnicode_FindMaxChar(str, 0, pindex+len);
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014897 maxchar = Py_MAX(maxchar, strmaxchar);
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014898 }
14899
Victor Stinnera47082312012-10-04 02:19:54 +020014900 buflen = arg->width;
14901 if (arg->sign && len == arg->width)
14902 buflen++;
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014903 if (_PyUnicodeWriter_Prepare(writer, buflen, maxchar) == -1)
Victor Stinnera47082312012-10-04 02:19:54 +020014904 return -1;
14905
14906 /* Write the sign if needed */
14907 if (arg->sign) {
14908 if (fill != ' ') {
14909 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
14910 writer->pos += 1;
14911 }
14912 if (arg->width > len)
14913 arg->width--;
14914 }
14915
14916 /* Write the numeric prefix for "x", "X" and "o" formats
14917 if the alternate form is used.
14918 For example, write "0x" for the "%#x" format. */
14919 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
14920 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
14921 assert(PyUnicode_READ(kind, pbuf, pindex + 1) == arg->ch);
14922 if (fill != ' ') {
14923 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
14924 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
14925 writer->pos += 2;
14926 pindex += 2;
14927 }
14928 arg->width -= 2;
14929 if (arg->width < 0)
14930 arg->width = 0;
14931 len -= 2;
14932 }
14933
14934 /* Pad left with the fill character if needed */
14935 if (arg->width > len && !(arg->flags & F_LJUST)) {
14936 sublen = arg->width - len;
Victor Stinner59423e32018-11-26 13:40:01 +010014937 unicode_fill(writer->kind, writer->data, fill, writer->pos, sublen);
Victor Stinnera47082312012-10-04 02:19:54 +020014938 writer->pos += sublen;
14939 arg->width = len;
14940 }
14941
14942 /* If padding with spaces: write sign if needed and/or numeric prefix if
14943 the alternate form is used */
14944 if (fill == ' ') {
14945 if (arg->sign) {
14946 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
14947 writer->pos += 1;
14948 }
14949 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
14950 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
14951 assert(PyUnicode_READ(kind, pbuf, pindex+1) == arg->ch);
14952 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
14953 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
14954 writer->pos += 2;
14955 pindex += 2;
14956 }
14957 }
14958
14959 /* Write characters */
14960 if (len) {
14961 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
14962 str, pindex, len);
14963 writer->pos += len;
14964 }
14965
14966 /* Pad right with the fill character if needed */
14967 if (arg->width > len) {
14968 sublen = arg->width - len;
Victor Stinner59423e32018-11-26 13:40:01 +010014969 unicode_fill(writer->kind, writer->data, ' ', writer->pos, sublen);
Victor Stinnera47082312012-10-04 02:19:54 +020014970 writer->pos += sublen;
14971 }
14972 return 0;
14973}
14974
14975/* Helper of PyUnicode_Format(): format one arg.
14976 Return 0 on success, raise an exception and return -1 on error. */
14977static int
14978unicode_format_arg(struct unicode_formatter_t *ctx)
14979{
14980 struct unicode_format_arg_t arg;
14981 PyObject *str;
14982 int ret;
14983
Victor Stinner8dbd4212012-12-04 09:30:24 +010014984 arg.ch = PyUnicode_READ(ctx->fmtkind, ctx->fmtdata, ctx->fmtpos);
Serhiy Storchaka9f8ad3f2017-03-08 05:51:19 +020014985 if (arg.ch == '%') {
14986 ctx->fmtpos++;
14987 ctx->fmtcnt--;
14988 if (_PyUnicodeWriter_WriteCharInline(&ctx->writer, '%') < 0)
14989 return -1;
14990 return 0;
14991 }
Victor Stinner8dbd4212012-12-04 09:30:24 +010014992 arg.flags = 0;
14993 arg.width = -1;
14994 arg.prec = -1;
14995 arg.sign = 0;
14996 str = NULL;
14997
Victor Stinnera47082312012-10-04 02:19:54 +020014998 ret = unicode_format_arg_parse(ctx, &arg);
14999 if (ret == -1)
15000 return -1;
15001
15002 ret = unicode_format_arg_format(ctx, &arg, &str);
15003 if (ret == -1)
15004 return -1;
15005
15006 if (ret != 1) {
15007 ret = unicode_format_arg_output(ctx, &arg, str);
15008 Py_DECREF(str);
15009 if (ret == -1)
15010 return -1;
15011 }
15012
Serhiy Storchaka9f8ad3f2017-03-08 05:51:19 +020015013 if (ctx->dict && (ctx->argidx < ctx->arglen)) {
Victor Stinnera47082312012-10-04 02:19:54 +020015014 PyErr_SetString(PyExc_TypeError,
15015 "not all arguments converted during string formatting");
15016 return -1;
15017 }
15018 return 0;
15019}
15020
Alexander Belopolsky40018472011-02-26 01:02:56 +000015021PyObject *
15022PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000015023{
Victor Stinnera47082312012-10-04 02:19:54 +020015024 struct unicode_formatter_t ctx;
Tim Petersced69f82003-09-16 20:30:58 +000015025
Guido van Rossumd57fd912000-03-10 22:53:23 +000015026 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000015027 PyErr_BadInternalCall();
15028 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015029 }
Victor Stinnera47082312012-10-04 02:19:54 +020015030
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030015031 if (ensure_unicode(format) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000015032 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030015033
15034 ctx.fmtstr = format;
Victor Stinnera47082312012-10-04 02:19:54 +020015035 ctx.fmtdata = PyUnicode_DATA(ctx.fmtstr);
15036 ctx.fmtkind = PyUnicode_KIND(ctx.fmtstr);
15037 ctx.fmtcnt = PyUnicode_GET_LENGTH(ctx.fmtstr);
15038 ctx.fmtpos = 0;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020015039
Victor Stinner8f674cc2013-04-17 23:02:17 +020015040 _PyUnicodeWriter_Init(&ctx.writer);
15041 ctx.writer.min_length = ctx.fmtcnt + 100;
15042 ctx.writer.overallocate = 1;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020015043
Guido van Rossumd57fd912000-03-10 22:53:23 +000015044 if (PyTuple_Check(args)) {
Victor Stinnera47082312012-10-04 02:19:54 +020015045 ctx.arglen = PyTuple_Size(args);
15046 ctx.argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015047 }
15048 else {
Victor Stinnera47082312012-10-04 02:19:54 +020015049 ctx.arglen = -1;
15050 ctx.argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015051 }
Victor Stinnera47082312012-10-04 02:19:54 +020015052 ctx.args_owned = 0;
Benjamin Peterson28a6cfa2012-08-28 17:55:35 -040015053 if (PyMapping_Check(args) && !PyTuple_Check(args) && !PyUnicode_Check(args))
Victor Stinnera47082312012-10-04 02:19:54 +020015054 ctx.dict = args;
15055 else
15056 ctx.dict = NULL;
15057 ctx.args = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015058
Victor Stinnera47082312012-10-04 02:19:54 +020015059 while (--ctx.fmtcnt >= 0) {
15060 if (PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
Victor Stinnercfc4c132013-04-03 01:48:39 +020015061 Py_ssize_t nonfmtpos;
Victor Stinnera47082312012-10-04 02:19:54 +020015062
15063 nonfmtpos = ctx.fmtpos++;
15064 while (ctx.fmtcnt >= 0 &&
15065 PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
15066 ctx.fmtpos++;
15067 ctx.fmtcnt--;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015068 }
Victor Stinnera47082312012-10-04 02:19:54 +020015069 if (ctx.fmtcnt < 0) {
15070 ctx.fmtpos--;
15071 ctx.writer.overallocate = 0;
Victor Stinnera0494432012-10-03 23:03:46 +020015072 }
Victor Stinneree4544c2012-05-09 22:24:08 +020015073
Victor Stinnercfc4c132013-04-03 01:48:39 +020015074 if (_PyUnicodeWriter_WriteSubstring(&ctx.writer, ctx.fmtstr,
15075 nonfmtpos, ctx.fmtpos) < 0)
15076 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015077 }
15078 else {
Victor Stinnera47082312012-10-04 02:19:54 +020015079 ctx.fmtpos++;
15080 if (unicode_format_arg(&ctx) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000015081 goto onError;
Victor Stinnera47082312012-10-04 02:19:54 +020015082 }
15083 }
Victor Stinneraff3cc62012-04-30 05:19:21 +020015084
Victor Stinnera47082312012-10-04 02:19:54 +020015085 if (ctx.argidx < ctx.arglen && !ctx.dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000015086 PyErr_SetString(PyExc_TypeError,
15087 "not all arguments converted during string formatting");
15088 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015089 }
15090
Victor Stinnera47082312012-10-04 02:19:54 +020015091 if (ctx.args_owned) {
15092 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000015093 }
Victor Stinnera47082312012-10-04 02:19:54 +020015094 return _PyUnicodeWriter_Finish(&ctx.writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +000015095
Benjamin Peterson29060642009-01-31 22:14:21 +000015096 onError:
Victor Stinnera47082312012-10-04 02:19:54 +020015097 _PyUnicodeWriter_Dealloc(&ctx.writer);
15098 if (ctx.args_owned) {
15099 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000015100 }
15101 return NULL;
15102}
15103
Jeremy Hylton938ace62002-07-17 16:30:39 +000015104static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000015105unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
15106
Tim Peters6d6c1a32001-08-02 04:15:00 +000015107static PyObject *
15108unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
15109{
Benjamin Peterson29060642009-01-31 22:14:21 +000015110 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015111 static char *kwlist[] = {"object", "encoding", "errors", 0};
15112 char *encoding = NULL;
15113 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000015114
Benjamin Peterson14339b62009-01-31 16:36:08 +000015115 if (type != &PyUnicode_Type)
15116 return unicode_subtype_new(type, args, kwds);
15117 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000015118 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000015119 return NULL;
15120 if (x == NULL)
Serhiy Storchaka678db842013-01-26 12:16:36 +020015121 _Py_RETURN_UNICODE_EMPTY();
Benjamin Peterson14339b62009-01-31 16:36:08 +000015122 if (encoding == NULL && errors == NULL)
15123 return PyObject_Str(x);
15124 else
Benjamin Peterson29060642009-01-31 22:14:21 +000015125 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000015126}
15127
Guido van Rossume023fe02001-08-30 03:12:59 +000015128static PyObject *
15129unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
15130{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015131 PyObject *unicode, *self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015132 Py_ssize_t length, char_size;
15133 int share_wstr, share_utf8;
15134 unsigned int kind;
15135 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000015136
Benjamin Peterson14339b62009-01-31 16:36:08 +000015137 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015138
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015139 unicode = unicode_new(&PyUnicode_Type, args, kwds);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015140 if (unicode == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000015141 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020015142 assert(_PyUnicode_CHECK(unicode));
Benjamin Petersonbac79492012-01-14 13:34:47 -050015143 if (PyUnicode_READY(unicode) == -1) {
Benjamin Peterson22a29702012-01-02 09:00:30 -060015144 Py_DECREF(unicode);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015145 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060015146 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015147
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015148 self = type->tp_alloc(type, 0);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015149 if (self == NULL) {
15150 Py_DECREF(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015151 return NULL;
15152 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015153 kind = PyUnicode_KIND(unicode);
15154 length = PyUnicode_GET_LENGTH(unicode);
15155
15156 _PyUnicode_LENGTH(self) = length;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020015157#ifdef Py_DEBUG
15158 _PyUnicode_HASH(self) = -1;
15159#else
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015160 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020015161#endif
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015162 _PyUnicode_STATE(self).interned = 0;
15163 _PyUnicode_STATE(self).kind = kind;
15164 _PyUnicode_STATE(self).compact = 0;
Victor Stinner3cf46372011-10-03 14:42:15 +020015165 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015166 _PyUnicode_STATE(self).ready = 1;
15167 _PyUnicode_WSTR(self) = NULL;
15168 _PyUnicode_UTF8_LENGTH(self) = 0;
15169 _PyUnicode_UTF8(self) = NULL;
15170 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020015171 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015172
15173 share_utf8 = 0;
15174 share_wstr = 0;
15175 if (kind == PyUnicode_1BYTE_KIND) {
15176 char_size = 1;
15177 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
15178 share_utf8 = 1;
15179 }
15180 else if (kind == PyUnicode_2BYTE_KIND) {
15181 char_size = 2;
15182 if (sizeof(wchar_t) == 2)
15183 share_wstr = 1;
15184 }
15185 else {
15186 assert(kind == PyUnicode_4BYTE_KIND);
15187 char_size = 4;
15188 if (sizeof(wchar_t) == 4)
15189 share_wstr = 1;
15190 }
15191
15192 /* Ensure we won't overflow the length. */
15193 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
15194 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015195 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015196 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015197 data = PyObject_MALLOC((length + 1) * char_size);
15198 if (data == NULL) {
15199 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015200 goto onError;
15201 }
15202
Victor Stinnerc3c74152011-10-02 20:39:55 +020015203 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015204 if (share_utf8) {
15205 _PyUnicode_UTF8_LENGTH(self) = length;
15206 _PyUnicode_UTF8(self) = data;
15207 }
15208 if (share_wstr) {
15209 _PyUnicode_WSTR_LENGTH(self) = length;
15210 _PyUnicode_WSTR(self) = (wchar_t *)data;
15211 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015212
Christian Heimesf051e432016-09-13 20:22:02 +020015213 memcpy(data, PyUnicode_DATA(unicode),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020015214 kind * (length + 1));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020015215 assert(_PyUnicode_CheckConsistency(self, 1));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020015216#ifdef Py_DEBUG
15217 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
15218#endif
Victor Stinnerdd18d3a2011-10-22 11:08:10 +020015219 Py_DECREF(unicode);
Victor Stinner7931d9a2011-11-04 00:22:48 +010015220 return self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015221
15222onError:
15223 Py_DECREF(unicode);
15224 Py_DECREF(self);
15225 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000015226}
15227
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000015228PyDoc_STRVAR(unicode_doc,
Chris Jerdonek83fe2e12012-10-07 14:48:36 -070015229"str(object='') -> str\n\
15230str(bytes_or_buffer[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +000015231\n\
Nick Coghlan573b1fd2012-08-16 14:13:07 +100015232Create a new string object from the given object. If encoding or\n\
15233errors is specified, then the object must expose a data buffer\n\
15234that will be decoded using the given encoding and error handler.\n\
15235Otherwise, returns the result of object.__str__() (if defined)\n\
15236or repr(object).\n\
15237encoding defaults to sys.getdefaultencoding().\n\
15238errors defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000015239
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015240static PyObject *unicode_iter(PyObject *seq);
15241
Guido van Rossumd57fd912000-03-10 22:53:23 +000015242PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +000015243 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Bupfc93bd42018-06-19 03:59:55 -050015244 "str", /* tp_name */
15245 sizeof(PyUnicodeObject), /* tp_basicsize */
15246 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +000015247 /* Slots */
Bupfc93bd42018-06-19 03:59:55 -050015248 (destructor)unicode_dealloc, /* tp_dealloc */
Jeroen Demeyer530f5062019-05-31 04:13:39 +020015249 0, /* tp_vectorcall_offset */
Bupfc93bd42018-06-19 03:59:55 -050015250 0, /* tp_getattr */
15251 0, /* tp_setattr */
Jeroen Demeyer530f5062019-05-31 04:13:39 +020015252 0, /* tp_as_async */
Bupfc93bd42018-06-19 03:59:55 -050015253 unicode_repr, /* tp_repr */
15254 &unicode_as_number, /* tp_as_number */
15255 &unicode_as_sequence, /* tp_as_sequence */
15256 &unicode_as_mapping, /* tp_as_mapping */
15257 (hashfunc) unicode_hash, /* tp_hash*/
15258 0, /* tp_call*/
15259 (reprfunc) unicode_str, /* tp_str */
15260 PyObject_GenericGetAttr, /* tp_getattro */
15261 0, /* tp_setattro */
15262 0, /* tp_as_buffer */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015263 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Bupfc93bd42018-06-19 03:59:55 -050015264 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
15265 unicode_doc, /* tp_doc */
15266 0, /* tp_traverse */
15267 0, /* tp_clear */
15268 PyUnicode_RichCompare, /* tp_richcompare */
15269 0, /* tp_weaklistoffset */
15270 unicode_iter, /* tp_iter */
15271 0, /* tp_iternext */
15272 unicode_methods, /* tp_methods */
15273 0, /* tp_members */
15274 0, /* tp_getset */
15275 &PyBaseObject_Type, /* tp_base */
15276 0, /* tp_dict */
15277 0, /* tp_descr_get */
15278 0, /* tp_descr_set */
15279 0, /* tp_dictoffset */
15280 0, /* tp_init */
15281 0, /* tp_alloc */
15282 unicode_new, /* tp_new */
15283 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +000015284};
15285
15286/* Initialize the Unicode implementation */
15287
Victor Stinner331a6a52019-05-27 16:39:22 +020015288PyStatus
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015289_PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000015290{
Thomas Wouters477c8d52006-05-27 19:21:47 +000015291 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015292 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000015293 0x000A, /* LINE FEED */
15294 0x000D, /* CARRIAGE RETURN */
15295 0x001C, /* FILE SEPARATOR */
15296 0x001D, /* GROUP SEPARATOR */
15297 0x001E, /* RECORD SEPARATOR */
15298 0x0085, /* NEXT LINE */
15299 0x2028, /* LINE SEPARATOR */
15300 0x2029, /* PARAGRAPH SEPARATOR */
15301 };
15302
Fred Drakee4315f52000-05-09 19:53:39 +000015303 /* Init the implementation */
Serhiy Storchaka678db842013-01-26 12:16:36 +020015304 _Py_INCREF_UNICODE_EMPTY();
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015305 if (!unicode_empty) {
Victor Stinner331a6a52019-05-27 16:39:22 +020015306 return _PyStatus_ERR("Can't create empty string");
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015307 }
Serhiy Storchaka678db842013-01-26 12:16:36 +020015308 Py_DECREF(unicode_empty);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000015309
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015310 if (PyType_Ready(&PyUnicode_Type) < 0) {
Victor Stinner331a6a52019-05-27 16:39:22 +020015311 return _PyStatus_ERR("Can't initialize unicode type");
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015312 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000015313
15314 /* initialize the linebreak bloom filter */
15315 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015316 PyUnicode_2BYTE_KIND, linebreak,
Victor Stinner63941882011-09-29 00:42:28 +020015317 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000015318
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015319 if (PyType_Ready(&EncodingMapType) < 0) {
Victor Stinner331a6a52019-05-27 16:39:22 +020015320 return _PyStatus_ERR("Can't initialize encoding map type");
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015321 }
15322 if (PyType_Ready(&PyFieldNameIter_Type) < 0) {
Victor Stinner331a6a52019-05-27 16:39:22 +020015323 return _PyStatus_ERR("Can't initialize field name iterator type");
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015324 }
15325 if (PyType_Ready(&PyFormatterIter_Type) < 0) {
Victor Stinner331a6a52019-05-27 16:39:22 +020015326 return _PyStatus_ERR("Can't initialize formatter iter type");
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015327 }
Victor Stinner331a6a52019-05-27 16:39:22 +020015328 return _PyStatus_OK();
Guido van Rossumd57fd912000-03-10 22:53:23 +000015329}
15330
15331/* Finalize the Unicode implementation */
15332
/* Nothing is released by this implementation; the function always
   returns 0. */
int
PyUnicode_ClearFreeList(void)
{
    const int freed = 0;
    return freed;
}
15338
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000015339
Walter Dörwald16807132007-05-25 13:52:07 +000015340void
15341PyUnicode_InternInPlace(PyObject **p)
15342{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020015343 PyObject *s = *p;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015344 PyObject *t;
Victor Stinner4fae54c2011-10-03 02:01:52 +020015345#ifdef Py_DEBUG
15346 assert(s != NULL);
15347 assert(_PyUnicode_CHECK(s));
15348#else
Benjamin Peterson14339b62009-01-31 16:36:08 +000015349 if (s == NULL || !PyUnicode_Check(s))
Victor Stinner4fae54c2011-10-03 02:01:52 +020015350 return;
15351#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +000015352 /* If it's a subclass, we don't really know what putting
15353 it in the interned dict might do. */
15354 if (!PyUnicode_CheckExact(s))
15355 return;
15356 if (PyUnicode_CHECK_INTERNED(s))
15357 return;
15358 if (interned == NULL) {
15359 interned = PyDict_New();
15360 if (interned == NULL) {
15361 PyErr_Clear(); /* Don't leave an exception */
15362 return;
15363 }
15364 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000015365 Py_ALLOW_RECURSION
Berker Peksagced8d4c2016-07-25 04:40:39 +030015366 t = PyDict_SetDefault(interned, s, s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015367 Py_END_ALLOW_RECURSION
Berker Peksagced8d4c2016-07-25 04:40:39 +030015368 if (t == NULL) {
15369 PyErr_Clear();
15370 return;
15371 }
15372 if (t != s) {
Victor Stinnerf0335102013-04-14 19:13:03 +020015373 Py_INCREF(t);
Serhiy Storchaka57a01d32016-04-10 18:05:40 +030015374 Py_SETREF(*p, t);
Victor Stinnerf0335102013-04-14 19:13:03 +020015375 return;
15376 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000015377 /* The two references in interned are not counted by refcnt.
15378 The deallocator will take care of this */
15379 Py_REFCNT(s) -= 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015380 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000015381}
15382
15383void
15384PyUnicode_InternImmortal(PyObject **p)
15385{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015386 PyUnicode_InternInPlace(p);
15387 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Victor Stinneraf9e4b82011-10-23 20:07:00 +020015388 _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015389 Py_INCREF(*p);
15390 }
Walter Dörwald16807132007-05-25 13:52:07 +000015391}
15392
15393PyObject *
15394PyUnicode_InternFromString(const char *cp)
15395{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015396 PyObject *s = PyUnicode_FromString(cp);
15397 if (s == NULL)
15398 return NULL;
15399 PyUnicode_InternInPlace(&s);
15400 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000015401}
15402
Victor Stinnerfecc4f22019-03-19 14:20:29 +010015403
#if defined(WITH_VALGRIND) || defined(__INSURE__)
/* Undo interning for the benefit of leak detectors.

   Interned strings are not forcibly deallocated; each one gets back the
   references that interning removed from its count and is marked as not
   interned, then the interned dict is cleared and released. */
static void
unicode_release_interned(void)
{
    Py_ssize_t immortal_size = 0, mortal_size = 0;

    if (interned == NULL || !PyDict_Check(interned)) {
        return;
    }
    PyObject *keys = PyDict_Keys(interned);
    if (keys == NULL || !PyList_Check(keys)) {
        PyErr_Clear();
        return;
    }

    Py_ssize_t n = PyList_GET_SIZE(keys);
#ifdef INTERNED_STATS
    fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
            n);
#endif
    for (Py_ssize_t i = 0; i < n; i++) {
        PyObject *s = PyList_GET_ITEM(keys, i);
        if (PyUnicode_READY(s) == -1) {
            Py_UNREACHABLE();
        }
        switch (PyUnicode_CHECK_INTERNED(s)) {
        case SSTATE_NOT_INTERNED:
            /* XXX Shouldn't happen */
            break;
        case SSTATE_INTERNED_IMMORTAL:
            Py_REFCNT(s) += 1;
            immortal_size += PyUnicode_GET_LENGTH(s);
            break;
        case SSTATE_INTERNED_MORTAL:
            Py_REFCNT(s) += 2;
            mortal_size += PyUnicode_GET_LENGTH(s);
            break;
        default:
            Py_FatalError("Inconsistent interned string state.");
        }
        _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
    }
#ifdef INTERNED_STATS
    fprintf(stderr, "total size of all interned strings: "
            "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
            "mortal/immortal\n", mortal_size, immortal_size);
#endif
    Py_DECREF(keys);
    PyDict_Clear(interned);
    Py_CLEAR(interned);
}
#endif
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015463
15464
15465/********************* Unicode Iterator **************************/
15466
15467typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015468 PyObject_HEAD
15469 Py_ssize_t it_index;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015470 PyObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015471} unicodeiterobject;
15472
15473static void
15474unicodeiter_dealloc(unicodeiterobject *it)
15475{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015476 _PyObject_GC_UNTRACK(it);
15477 Py_XDECREF(it->it_seq);
15478 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015479}
15480
15481static int
15482unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
15483{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015484 Py_VISIT(it->it_seq);
15485 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015486}
15487
15488static PyObject *
15489unicodeiter_next(unicodeiterobject *it)
15490{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015491 PyObject *seq, *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015492
Benjamin Peterson14339b62009-01-31 16:36:08 +000015493 assert(it != NULL);
15494 seq = it->it_seq;
15495 if (seq == NULL)
15496 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020015497 assert(_PyUnicode_CHECK(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015498
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015499 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
15500 int kind = PyUnicode_KIND(seq);
15501 void *data = PyUnicode_DATA(seq);
15502 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
15503 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015504 if (item != NULL)
15505 ++it->it_index;
15506 return item;
15507 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015508
Benjamin Peterson14339b62009-01-31 16:36:08 +000015509 it->it_seq = NULL;
Serhiy Storchakafbb1c5e2016-03-30 20:40:02 +030015510 Py_DECREF(seq);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015511 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015512}
15513
15514static PyObject *
Siddhesh Poyarekar55edd0c2018-04-30 00:29:33 +053015515unicodeiter_len(unicodeiterobject *it, PyObject *Py_UNUSED(ignored))
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015516{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015517 Py_ssize_t len = 0;
15518 if (it->it_seq)
Victor Stinnerc4f281e2011-10-11 22:11:42 +020015519 len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015520 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015521}
15522
15523PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
15524
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015525static PyObject *
Siddhesh Poyarekar55edd0c2018-04-30 00:29:33 +053015526unicodeiter_reduce(unicodeiterobject *it, PyObject *Py_UNUSED(ignored))
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015527{
Serhiy Storchakabb86bf42018-12-11 08:28:18 +020015528 _Py_IDENTIFIER(iter);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015529 if (it->it_seq != NULL) {
Serhiy Storchakabb86bf42018-12-11 08:28:18 +020015530 return Py_BuildValue("N(O)n", _PyEval_GetBuiltinId(&PyId_iter),
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015531 it->it_seq, it->it_index);
15532 } else {
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +020015533 PyObject *u = (PyObject *)_PyUnicode_New(0);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015534 if (u == NULL)
15535 return NULL;
Serhiy Storchakabb86bf42018-12-11 08:28:18 +020015536 return Py_BuildValue("N(N)", _PyEval_GetBuiltinId(&PyId_iter), u);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015537 }
15538}
15539
15540PyDoc_STRVAR(reduce_doc, "Return state information for pickling.");
15541
15542static PyObject *
15543unicodeiter_setstate(unicodeiterobject *it, PyObject *state)
15544{
15545 Py_ssize_t index = PyLong_AsSsize_t(state);
15546 if (index == -1 && PyErr_Occurred())
15547 return NULL;
Kristján Valur Jónsson25dded02014-03-05 13:47:57 +000015548 if (it->it_seq != NULL) {
15549 if (index < 0)
15550 index = 0;
15551 else if (index > PyUnicode_GET_LENGTH(it->it_seq))
15552 index = PyUnicode_GET_LENGTH(it->it_seq); /* iterator truncated */
15553 it->it_index = index;
15554 }
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015555 Py_RETURN_NONE;
15556}
15557
15558PyDoc_STRVAR(setstate_doc, "Set state information for unpickling.");
15559
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015560static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015561 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000015562 length_hint_doc},
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015563 {"__reduce__", (PyCFunction)unicodeiter_reduce, METH_NOARGS,
15564 reduce_doc},
15565 {"__setstate__", (PyCFunction)unicodeiter_setstate, METH_O,
15566 setstate_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000015567 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015568};
15569
15570PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015571 PyVarObject_HEAD_INIT(&PyType_Type, 0)
15572 "str_iterator", /* tp_name */
15573 sizeof(unicodeiterobject), /* tp_basicsize */
15574 0, /* tp_itemsize */
15575 /* methods */
15576 (destructor)unicodeiter_dealloc, /* tp_dealloc */
Jeroen Demeyer530f5062019-05-31 04:13:39 +020015577 0, /* tp_vectorcall_offset */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015578 0, /* tp_getattr */
15579 0, /* tp_setattr */
Jeroen Demeyer530f5062019-05-31 04:13:39 +020015580 0, /* tp_as_async */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015581 0, /* tp_repr */
15582 0, /* tp_as_number */
15583 0, /* tp_as_sequence */
15584 0, /* tp_as_mapping */
15585 0, /* tp_hash */
15586 0, /* tp_call */
15587 0, /* tp_str */
15588 PyObject_GenericGetAttr, /* tp_getattro */
15589 0, /* tp_setattro */
15590 0, /* tp_as_buffer */
15591 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
15592 0, /* tp_doc */
15593 (traverseproc)unicodeiter_traverse, /* tp_traverse */
15594 0, /* tp_clear */
15595 0, /* tp_richcompare */
15596 0, /* tp_weaklistoffset */
15597 PyObject_SelfIter, /* tp_iter */
15598 (iternextfunc)unicodeiter_next, /* tp_iternext */
15599 unicodeiter_methods, /* tp_methods */
15600 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015601};
15602
15603static PyObject *
15604unicode_iter(PyObject *seq)
15605{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015606 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015607
Benjamin Peterson14339b62009-01-31 16:36:08 +000015608 if (!PyUnicode_Check(seq)) {
15609 PyErr_BadInternalCall();
15610 return NULL;
15611 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015612 if (PyUnicode_READY(seq) == -1)
15613 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015614 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
15615 if (it == NULL)
15616 return NULL;
15617 it->it_index = 0;
15618 Py_INCREF(seq);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015619 it->it_seq = seq;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015620 _PyObject_GC_TRACK(it);
15621 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015622}
15623
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015624
15625size_t
15626Py_UNICODE_strlen(const Py_UNICODE *u)
15627{
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +020015628 return wcslen(u);
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015629}
15630
15631Py_UNICODE*
15632Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
15633{
15634 Py_UNICODE *u = s1;
15635 while ((*u++ = *s2++));
15636 return s1;
15637}
15638
15639Py_UNICODE*
15640Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
15641{
15642 Py_UNICODE *u = s1;
15643 while ((*u++ = *s2++))
15644 if (n-- == 0)
15645 break;
15646 return s1;
15647}
15648
15649Py_UNICODE*
15650Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
15651{
15652 Py_UNICODE *u1 = s1;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +020015653 u1 += wcslen(u1);
15654 while ((*u1++ = *s2++));
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015655 return s1;
15656}
15657
15658int
15659Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
15660{
15661 while (*s1 && *s2 && *s1 == *s2)
15662 s1++, s2++;
15663 if (*s1 && *s2)
15664 return (*s1 < *s2) ? -1 : +1;
15665 if (*s1)
15666 return 1;
15667 if (*s2)
15668 return -1;
15669 return 0;
15670}
15671
15672int
15673Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
15674{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020015675 Py_UNICODE u1, u2;
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015676 for (; n != 0; n--) {
15677 u1 = *s1;
15678 u2 = *s2;
15679 if (u1 != u2)
15680 return (u1 < u2) ? -1 : +1;
15681 if (u1 == '\0')
15682 return 0;
15683 s1++;
15684 s2++;
15685 }
15686 return 0;
15687}
15688
15689Py_UNICODE*
15690Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
15691{
15692 const Py_UNICODE *p;
15693 for (p = s; *p; p++)
15694 if (*p == c)
15695 return (Py_UNICODE*)p;
15696 return NULL;
15697}
15698
15699Py_UNICODE*
15700Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
15701{
15702 const Py_UNICODE *p;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +020015703 p = s + wcslen(s);
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015704 while (p != s) {
15705 p--;
15706 if (*p == c)
15707 return (Py_UNICODE*)p;
15708 }
15709 return NULL;
15710}
Victor Stinner331ea922010-08-10 16:37:20 +000015711
Victor Stinner71133ff2010-09-01 23:43:53 +000015712Py_UNICODE*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015713PyUnicode_AsUnicodeCopy(PyObject *unicode)
Victor Stinner71133ff2010-09-01 23:43:53 +000015714{
Victor Stinner577db2c2011-10-11 22:12:48 +020015715 Py_UNICODE *u, *copy;
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015716 Py_ssize_t len, size;
Victor Stinner71133ff2010-09-01 23:43:53 +000015717
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015718 if (!PyUnicode_Check(unicode)) {
15719 PyErr_BadArgument();
15720 return NULL;
15721 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015722 u = PyUnicode_AsUnicodeAndSize(unicode, &len);
Victor Stinner577db2c2011-10-11 22:12:48 +020015723 if (u == NULL)
15724 return NULL;
Victor Stinner71133ff2010-09-01 23:43:53 +000015725 /* Ensure we won't overflow the size. */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -070015726 if (len > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
Victor Stinner71133ff2010-09-01 23:43:53 +000015727 PyErr_NoMemory();
15728 return NULL;
15729 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015730 size = len + 1; /* copy the null character */
Victor Stinner71133ff2010-09-01 23:43:53 +000015731 size *= sizeof(Py_UNICODE);
15732 copy = PyMem_Malloc(size);
15733 if (copy == NULL) {
15734 PyErr_NoMemory();
15735 return NULL;
15736 }
Victor Stinner577db2c2011-10-11 22:12:48 +020015737 memcpy(copy, u, size);
Victor Stinner71133ff2010-09-01 23:43:53 +000015738 return copy;
15739}
Martin v. Löwis5b222132007-06-10 09:51:05 +000015740
Victor Stinnerfecc4f22019-03-19 14:20:29 +010015741
Victor Stinner709d23d2019-05-02 14:56:30 -040015742static int
15743encode_wstr_utf8(wchar_t *wstr, char **str, const char *name)
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015744{
Victor Stinner709d23d2019-05-02 14:56:30 -040015745 int res;
15746 res = _Py_EncodeUTF8Ex(wstr, str, NULL, NULL, 1, _Py_ERROR_STRICT);
15747 if (res == -2) {
15748 PyErr_Format(PyExc_RuntimeWarning, "cannot decode %s", name);
15749 return -1;
15750 }
15751 if (res < 0) {
15752 PyErr_NoMemory();
15753 return -1;
15754 }
15755 return 0;
15756}
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015757
Victor Stinner709d23d2019-05-02 14:56:30 -040015758
15759static int
15760config_get_codec_name(wchar_t **config_encoding)
15761{
15762 char *encoding;
15763 if (encode_wstr_utf8(*config_encoding, &encoding, "stdio_encoding") < 0) {
15764 return -1;
15765 }
15766
15767 PyObject *name_obj = NULL;
15768 PyObject *codec = _PyCodec_Lookup(encoding);
15769 PyMem_RawFree(encoding);
15770
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015771 if (!codec)
15772 goto error;
15773
15774 name_obj = PyObject_GetAttrString(codec, "name");
15775 Py_CLEAR(codec);
15776 if (!name_obj) {
15777 goto error;
15778 }
15779
Victor Stinner709d23d2019-05-02 14:56:30 -040015780 wchar_t *wname = PyUnicode_AsWideCharString(name_obj, NULL);
15781 Py_DECREF(name_obj);
15782 if (wname == NULL) {
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015783 goto error;
15784 }
15785
Victor Stinner709d23d2019-05-02 14:56:30 -040015786 wchar_t *raw_wname = _PyMem_RawWcsdup(wname);
15787 if (raw_wname == NULL) {
15788 PyMem_Free(wname);
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015789 PyErr_NoMemory();
Victor Stinner709d23d2019-05-02 14:56:30 -040015790 goto error;
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015791 }
Victor Stinner709d23d2019-05-02 14:56:30 -040015792
15793 PyMem_RawFree(*config_encoding);
15794 *config_encoding = raw_wname;
15795
15796 PyMem_Free(wname);
15797 return 0;
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015798
15799error:
15800 Py_XDECREF(codec);
15801 Py_XDECREF(name_obj);
Victor Stinner709d23d2019-05-02 14:56:30 -040015802 return -1;
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015803}
15804
15805
Victor Stinner331a6a52019-05-27 16:39:22 +020015806static PyStatus
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015807init_stdio_encoding(PyInterpreterState *interp)
15808{
Victor Stinner709d23d2019-05-02 14:56:30 -040015809 /* Update the stdio encoding to the normalized Python codec name. */
Victor Stinner331a6a52019-05-27 16:39:22 +020015810 PyConfig *config = &interp->config;
Victor Stinner709d23d2019-05-02 14:56:30 -040015811 if (config_get_codec_name(&config->stdio_encoding) < 0) {
Victor Stinner331a6a52019-05-27 16:39:22 +020015812 return _PyStatus_ERR("failed to get the Python codec name "
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015813 "of the stdio encoding");
15814 }
Victor Stinner331a6a52019-05-27 16:39:22 +020015815 return _PyStatus_OK();
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015816}
15817
15818
Victor Stinner709d23d2019-05-02 14:56:30 -040015819static int
15820init_fs_codec(PyInterpreterState *interp)
15821{
Victor Stinner331a6a52019-05-27 16:39:22 +020015822 PyConfig *config = &interp->config;
Victor Stinner709d23d2019-05-02 14:56:30 -040015823
15824 _Py_error_handler error_handler;
15825 error_handler = get_error_handler_wide(config->filesystem_errors);
15826 if (error_handler == _Py_ERROR_UNKNOWN) {
15827 PyErr_SetString(PyExc_RuntimeError, "unknow filesystem error handler");
15828 return -1;
15829 }
15830
15831 char *encoding, *errors;
15832 if (encode_wstr_utf8(config->filesystem_encoding,
15833 &encoding,
15834 "filesystem_encoding") < 0) {
15835 return -1;
15836 }
15837
15838 if (encode_wstr_utf8(config->filesystem_errors,
15839 &errors,
15840 "filesystem_errors") < 0) {
15841 PyMem_RawFree(encoding);
15842 return -1;
15843 }
15844
15845 PyMem_RawFree(interp->fs_codec.encoding);
15846 interp->fs_codec.encoding = encoding;
15847 PyMem_RawFree(interp->fs_codec.errors);
15848 interp->fs_codec.errors = errors;
15849 interp->fs_codec.error_handler = error_handler;
15850
15851 /* At this point, PyUnicode_EncodeFSDefault() and
15852 PyUnicode_DecodeFSDefault() can now use the Python codec rather than
15853 the C implementation of the filesystem encoding. */
15854
15855 /* Set Py_FileSystemDefaultEncoding and Py_FileSystemDefaultEncodeErrors
15856 global configuration variables. */
15857 if (_Py_SetFileSystemEncoding(interp->fs_codec.encoding,
15858 interp->fs_codec.errors) < 0) {
15859 PyErr_NoMemory();
15860 return -1;
15861 }
15862 return 0;
15863}
15864
15865
Victor Stinner331a6a52019-05-27 16:39:22 +020015866static PyStatus
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015867init_fs_encoding(PyInterpreterState *interp)
15868{
Victor Stinner709d23d2019-05-02 14:56:30 -040015869 /* Update the filesystem encoding to the normalized Python codec name.
15870 For example, replace "ANSI_X3.4-1968" (locale encoding) with "ascii"
15871 (Python codec name). */
Victor Stinner331a6a52019-05-27 16:39:22 +020015872 PyConfig *config = &interp->config;
Victor Stinner709d23d2019-05-02 14:56:30 -040015873 if (config_get_codec_name(&config->filesystem_encoding) < 0) {
Victor Stinner331a6a52019-05-27 16:39:22 +020015874 return _PyStatus_ERR("failed to get the Python codec "
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015875 "of the filesystem encoding");
15876 }
15877
Victor Stinner709d23d2019-05-02 14:56:30 -040015878 if (init_fs_codec(interp) < 0) {
Victor Stinner331a6a52019-05-27 16:39:22 +020015879 return _PyStatus_ERR("cannot initialize filesystem codec");
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015880 }
Victor Stinner331a6a52019-05-27 16:39:22 +020015881 return _PyStatus_OK();
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015882}
15883
15884
Victor Stinner331a6a52019-05-27 16:39:22 +020015885PyStatus
Victor Stinnerb45d2592019-06-20 00:05:23 +020015886_PyUnicode_InitEncodings(PyThreadState *tstate)
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015887{
Victor Stinnerb45d2592019-06-20 00:05:23 +020015888 PyInterpreterState *interp = tstate->interp;
15889
Victor Stinner331a6a52019-05-27 16:39:22 +020015890 PyStatus status = init_fs_encoding(interp);
15891 if (_PyStatus_EXCEPTION(status)) {
15892 return status;
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015893 }
15894
15895 return init_stdio_encoding(interp);
15896}
15897
15898
#ifdef MS_WINDOWS
/* Switch the filesystem encoding to mbcs/replace (PEP 529) and
   re-initialize the filesystem codec accordingly.

   Return the result of init_fs_codec() on success.  Return -1 with
   MemoryError set when either configuration string cannot be allocated;
   in that case the configuration is left untouched. */
int
_PyUnicode_EnableLegacyWindowsFSEncoding(void)
{
    PyInterpreterState *interp = _PyInterpreterState_GET_UNSAFE();
    PyConfig *config = &interp->config;

    /* Allocate both replacement strings up front so the configuration is
       only modified when nothing can fail anymore. */
    wchar_t *new_encoding = _PyMem_RawWcsdup(L"mbcs");
    wchar_t *new_errors = _PyMem_RawWcsdup(L"replace");

    if (new_encoding != NULL && new_errors != NULL) {
        PyMem_RawFree(config->filesystem_encoding);
        config->filesystem_encoding = new_encoding;
        PyMem_RawFree(config->filesystem_errors);
        config->filesystem_errors = new_errors;

        return init_fs_codec(interp);
    }

    /* At least one allocation failed: release whichever one succeeded. */
    PyMem_RawFree(new_encoding);
    PyMem_RawFree(new_errors);
    PyErr_NoMemory();
    return -1;
}
#endif
15924
15925
Victor Stinnerfecc4f22019-03-19 14:20:29 +010015926void
15927_PyUnicode_Fini(void)
15928{
15929#if defined(WITH_VALGRIND) || defined(__INSURE__)
15930 /* Insure++ is a memory analysis tool that aids in discovering
15931 * memory leaks and other memory problems. On Python exit, the
15932 * interned string dictionaries are flagged as being in use at exit
15933 * (which it is). Under normal circumstances, this is fine because
15934 * the memory will be automatically reclaimed by the system. Under
15935 * memory debugging, it's a huge source of useless noise, so we
15936 * trade off slower shutdown for less distraction in the memory
15937 * reports. -baw
15938 */
15939 unicode_release_interned();
15940#endif /* __INSURE__ */
15941
15942 Py_CLEAR(unicode_empty);
15943
15944 for (Py_ssize_t i = 0; i < 256; i++) {
15945 Py_CLEAR(unicode_latin1[i]);
15946 }
15947 _PyUnicode_ClearStaticStrings();
15948 (void)PyUnicode_ClearFreeList();
Victor Stinner709d23d2019-05-02 14:56:30 -040015949
15950 PyInterpreterState *interp = _PyInterpreterState_GET_UNSAFE();
15951 PyMem_RawFree(interp->fs_codec.encoding);
15952 interp->fs_codec.encoding = NULL;
15953 PyMem_RawFree(interp->fs_codec.errors);
15954 interp->fs_codec.errors = NULL;
Victor Stinnerfecc4f22019-03-19 14:20:29 +010015955}
15956
15957
Georg Brandl66c221e2010-10-14 07:04:07 +000015958/* A _string module, to export formatter_parser and formatter_field_name_split
15959 to the string.Formatter class implemented in Python. */
15960
15961static PyMethodDef _string_methods[] = {
15962 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
15963 METH_O, PyDoc_STR("split the argument as a field name")},
15964 {"formatter_parser", (PyCFunction) formatter_parser,
15965 METH_O, PyDoc_STR("parse the argument as a format string")},
15966 {NULL, NULL}
15967};
15968
15969static struct PyModuleDef _string_module = {
15970 PyModuleDef_HEAD_INIT,
15971 "_string",
15972 PyDoc_STR("string helper module"),
15973 0,
15974 _string_methods,
15975 NULL,
15976 NULL,
15977 NULL,
15978 NULL
15979};
15980
15981PyMODINIT_FUNC
15982PyInit__string(void)
15983{
15984 return PyModule_Create(&_string_module);
15985}
15986
15987
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000015988#ifdef __cplusplus
15989}
15990#endif