blob: aa933773233b587c12a0609495365165c6bc94fb [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Benjamin Peterson31616ea2011-10-01 00:11:09 -04004modified by Marc-Andre Lemburg <mal@lemburg.com>.
Guido van Rossumd57fd912000-03-10 22:53:23 +00005
Thomas Wouters477c8d52006-05-27 19:21:47 +00006Major speed upgrades to the method implementations at the Reykjavik
7NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8
Guido van Rossum16b1ad92000-08-03 16:24:25 +00009Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000010
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000011--------------------------------------------------------------------
12The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000013
Benjamin Peterson29060642009-01-31 22:14:21 +000014 Copyright (c) 1999 by Secret Labs AB
15 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000016
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000017By obtaining, using, and/or copying this software and/or its
18associated documentation, you agree that you have read, understood,
19and will comply with the following terms and conditions:
20
21Permission to use, copy, modify, and distribute this software and its
22associated documentation for any purpose and without fee is hereby
23granted, provided that the above copyright notice appears in all
24copies, and that both that copyright notice and this permission notice
25appear in supporting documentation, and that the name of Secret Labs
26AB or the author not be used in advertising or publicity pertaining to
27distribution of the software without specific, written prior
28permission.
29
30SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37--------------------------------------------------------------------
38
39*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000040
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Victor Stinner331a6a52019-05-27 16:39:22 +020043#include "pycore_initconfig.h"
Victor Stinner9fc57a32018-11-07 00:44:03 +010044#include "pycore_fileutils.h"
Victor Stinnerbcda8f12018-11-21 22:27:47 +010045#include "pycore_object.h"
Victor Stinner43fc3bb2019-05-02 11:54:20 -040046#include "pycore_pylifecycle.h"
Victor Stinner621cebe2018-11-12 16:53:38 +010047#include "pycore_pystate.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000048#include "ucnhash.h"
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050049#include "bytes_methods.h"
Raymond Hettingerac2ef652015-07-04 16:04:44 -070050#include "stringlib/eq.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000051
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000052#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000053#include <windows.h>
54#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000055
/* Uncomment to display statistics on interned strings at exit when
   using Valgrind or Insure++. */
58/* #define INTERNED_STATS 1 */
59
60
Larry Hastings61272b72014-01-07 12:41:53 -080061/*[clinic input]
INADA Naoki15f94592017-01-16 21:49:13 +090062class str "PyObject *" "&PyUnicode_Type"
Larry Hastings61272b72014-01-07 12:41:53 -080063[clinic start generated code]*/
INADA Naoki3ae20562017-01-16 20:41:20 +090064/*[clinic end generated code: output=da39a3ee5e6b4b0d input=4884c934de622cf6]*/
65
66/*[python input]
67class Py_UCS4_converter(CConverter):
68 type = 'Py_UCS4'
69 converter = 'convert_uc'
70
71 def converter_init(self):
72 if self.default is not unspecified:
73 self.c_default = ascii(self.default)
74 if len(self.c_default) > 4 or self.c_default[0] != "'":
75 self.c_default = hex(ord(self.default))
76
77[python start generated code]*/
78/*[python end generated code: output=da39a3ee5e6b4b0d input=88f5dd06cd8e7a61]*/
Larry Hastings44e2eaa2013-11-23 15:37:55 -080079
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000080/* --- Globals ------------------------------------------------------------
81
Serhiy Storchaka05997252013-01-26 12:14:02 +020082NOTE: In the interpreter's initialization phase, some globals are currently
83 initialized dynamically as needed. In the process Unicode objects may
84 be created before the Unicode type is ready.
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000085
86*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000087
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000088
89#ifdef __cplusplus
90extern "C" {
91#endif
92
Victor Stinner8faf8212011-12-08 22:14:11 +010093/* Maximum code point of Unicode 6.0: 0x10ffff (1,114,111) */
94#define MAX_UNICODE 0x10ffff
95
/* Type check used by the assertions in this file.  Release builds only
   test the type; debug builds go through _PyUnicode_CheckConsistency()
   with content checking disabled (second argument 0). */
#ifndef Py_DEBUG
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#else
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
#endif
Victor Stinnerfb5f5f22011-09-28 21:39:49 +0200101
/* Raw field accessors.  The macros without assertions expand to plain
   member accesses, so they can also be used as assignment targets. */

/* UTF-8 buffer pointer stored in the compact header (no checks). */
#define _PyUnicode_UTF8(op) (((PyCompactUnicodeObject*)(op))->utf8)

/* Checked UTF-8 pointer: for a compact ASCII string this is the memory
   immediately following the PyASCIIObject header, otherwise the stored
   utf8 field. */
#define PyUnicode_UTF8(op)                                  \
    (assert(_PyUnicode_CHECK(op)),                          \
     assert(PyUnicode_IS_READY(op)),                        \
     (PyUnicode_IS_COMPACT_ASCII(op)                        \
        ? ((char*)((PyASCIIObject*)(op) + 1))               \
        : _PyUnicode_UTF8(op)))

/* UTF-8 length stored in the compact header (no checks). */
#define _PyUnicode_UTF8_LENGTH(op) (((PyCompactUnicodeObject*)(op))->utf8_length)

/* Checked UTF-8 length: a compact ASCII string reuses its own length
   field, any other string uses the stored utf8_length. */
#define PyUnicode_UTF8_LENGTH(op)                           \
    (assert(_PyUnicode_CHECK(op)),                          \
     assert(PyUnicode_IS_READY(op)),                        \
     (PyUnicode_IS_COMPACT_ASCII(op)                        \
        ? ((PyASCIIObject*)(op))->length                    \
        : _PyUnicode_UTF8_LENGTH(op)))

/* wchar_t representation and its length (no checks). */
#define _PyUnicode_WSTR(op) (((PyASCIIObject*)(op))->wstr)
#define _PyUnicode_WSTR_LENGTH(op) (((PyCompactUnicodeObject*)(op))->wstr_length)

/* Length, state bit-field struct and cached hash (no checks). */
#define _PyUnicode_LENGTH(op) (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op) (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op) (((PyASCIIObject *)(op))->hash)

/* Kind and length, guarded only by the type assertion. */
#define _PyUnicode_KIND(op)                                 \
    (assert(_PyUnicode_CHECK(op)),                          \
     ((PyASCIIObject *)(op))->state.kind)
#define _PyUnicode_GET_LENGTH(op)                           \
    (assert(_PyUnicode_CHECK(op)),                          \
     ((PyASCIIObject *)(op))->length)

/* Character data pointer of a PyUnicodeObject (no checks). */
#define _PyUnicode_DATA_ANY(op) (((PyUnicodeObject*)(op))->data.any)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200136
/* File-local replacement for PyUnicode_READY(): evaluates to 0 when the
   string is already ready, otherwise to the result of _PyUnicode_Ready().
   The type is asserted through _PyUnicode_CHECK(). */
#undef PyUnicode_READY
#define PyUnicode_READY(op)                                         \
    (assert(_PyUnicode_CHECK(op)),                                  \
     (PyUnicode_IS_READY(op) ? 0 : _PyUnicode_Ready(op)))
Victor Stinner910337b2011-10-03 03:20:16 +0200143
/* True if the UTF-8 pointer of a non-compact-ASCII string is the same
   pointer as its character data (PyUnicode_DATA). */
#define _PyUnicode_SHARE_UTF8(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)),           \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))

/* True if the wstr pointer is the same pointer as the character data.
   The macro must only refer to its own argument: it previously expanded
   _PyUnicode_WSTR(unicode), which silently picked up whatever variable
   named "unicode" existed at the call site (or failed to compile when
   none did). */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
151
/* Non-zero when the object has its own UTF-8 memory block: it is not a
   compact ASCII string, its utf8 pointer is set, and that pointer is not
   the character data itself. */
#define _PyUnicode_HAS_UTF8_MEMORY(op)                      \
    (!PyUnicode_IS_COMPACT_ASCII(op) &&                     \
     _PyUnicode_UTF8(op) != NULL &&                         \
     _PyUnicode_UTF8(op) != PyUnicode_DATA(op))

/* Non-zero when the object has its own wstr memory block: the wstr
   pointer is set and either the string is not ready yet or the pointer
   is not the character data itself. */
#define _PyUnicode_HAS_WSTR_MEMORY(op)                      \
    (_PyUnicode_WSTR(op) != NULL &&                         \
     (!PyUnicode_IS_READY(op) ||                            \
      _PyUnicode_WSTR(op) != PyUnicode_DATA(op)))
165
/* Copy the characters in [begin, end) to the buffer at `to`, casting each
   one from from_type to to_type.  from_type and to_type must be type
   names; begin and end are treated as "from_type *" and to as
   "to_type *".  The bulk of the range is copied four elements per
   iteration, the remaining 0-3 elements one at a time. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to)    \
    do {                                                                \
        to_type *_cb_dst = (to_type *)(to);                             \
        const from_type *_cb_src = (from_type *)(begin);                \
        const from_type *_cb_stop = (from_type *)(end);                 \
        Py_ssize_t _cb_count = _cb_stop - _cb_src;                      \
        const from_type *_cb_block_stop =                               \
            _cb_src + _Py_SIZE_ROUND_DOWN(_cb_count, 4);                \
        for (; _cb_src < _cb_block_stop; _cb_src += 4, _cb_dst += 4) {  \
            _cb_dst[0] = (to_type) _cb_src[0];                          \
            _cb_dst[1] = (to_type) _cb_src[1];                          \
            _cb_dst[2] = (to_type) _cb_src[2];                          \
            _cb_dst[3] = (to_type) _cb_src[3];                          \
        }                                                               \
        for (; _cb_src < _cb_stop; _cb_src++, _cb_dst++) {              \
            *_cb_dst = (to_type) *_cb_src;                              \
        }                                                               \
    } while (0)
Victor Stinner829c0ad2011-10-03 01:08:02 +0200189
/* Platform-specific over-allocation factor.  The value 2 is paired with
   "50%" and 4 with "25%", so it is presumably used as a divisor
   (extra = size / factor) -- the users are outside this part of the
   file. */
#ifdef MS_WINDOWS
   /* On Windows, over-allocating by 50% works best. */
#  define OVERALLOCATE_FACTOR 2
#else
   /* On Linux, over-allocating by 25% works best. */
#  define OVERALLOCATE_FACTOR 4
#endif
197
/* This dictionary holds all interned unicode strings.  Note that references
   to strings in this dictionary are *not* counted in the string's ob_refcnt.
   When the interned string reaches a refcnt of 0 the string deallocation
   function will delete the reference from this dictionary.

   Another way to look at this is to say that the actual reference
   count of a string is:  s->ob_refcnt + (s->state ? 2 : 0)
*/
Serhiy Storchaka05997252013-01-26 12:14:02 +0200206static PyObject *interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +0000207
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000208/* The empty Unicode object is shared to improve performance. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200209static PyObject *unicode_empty = NULL;
Serhiy Storchaka05997252013-01-26 12:14:02 +0200210
/* Take a new reference to the shared empty string, creating it with
   PyUnicode_New(0, 0) on first use.  If that creation fails,
   unicode_empty stays NULL and no reference is taken. */
#define _Py_INCREF_UNICODE_EMPTY()                                  \
    do {                                                            \
        if (unicode_empty == NULL) {                                \
            unicode_empty = PyUnicode_New(0, 0);                    \
            if (unicode_empty != NULL) {                            \
                Py_INCREF(unicode_empty);                           \
                assert(_PyUnicode_CheckConsistency(unicode_empty, 1)); \
            }                                                       \
        }                                                           \
        else {                                                      \
            Py_INCREF(unicode_empty);                               \
        }                                                           \
    } while (0)

/* Return a new reference to the shared empty string from the enclosing
   function; the returned value is NULL if the string could not be
   created. */
#define _Py_RETURN_UNICODE_EMPTY()                                  \
    do {                                                            \
        _Py_INCREF_UNICODE_EMPTY();                                 \
        return unicode_empty;                                       \
    } while (0)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000229
Victor Stinner59423e32018-11-26 13:40:01 +0100230static inline void
231unicode_fill(enum PyUnicode_Kind kind, void *data, Py_UCS4 value,
232 Py_ssize_t start, Py_ssize_t length)
233{
234 assert(0 <= start);
235 assert(kind != PyUnicode_WCHAR_KIND);
236 switch (kind) {
237 case PyUnicode_1BYTE_KIND: {
Victor Stinner163403a2018-11-27 12:41:17 +0100238 assert(value <= 0xff);
Victor Stinner59423e32018-11-26 13:40:01 +0100239 Py_UCS1 ch = (unsigned char)value;
240 Py_UCS1 *to = (Py_UCS1 *)data + start;
241 memset(to, ch, length);
242 break;
243 }
244 case PyUnicode_2BYTE_KIND: {
Victor Stinner163403a2018-11-27 12:41:17 +0100245 assert(value <= 0xffff);
Victor Stinner59423e32018-11-26 13:40:01 +0100246 Py_UCS2 ch = (Py_UCS2)value;
247 Py_UCS2 *to = (Py_UCS2 *)data + start;
248 const Py_UCS2 *end = to + length;
249 for (; to < end; ++to) *to = ch;
250 break;
251 }
252 case PyUnicode_4BYTE_KIND: {
Victor Stinner163403a2018-11-27 12:41:17 +0100253 assert(value <= MAX_UNICODE);
Victor Stinner59423e32018-11-26 13:40:01 +0100254 Py_UCS4 ch = value;
255 Py_UCS4 * to = (Py_UCS4 *)data + start;
256 const Py_UCS4 *end = to + length;
257 for (; to < end; ++to) *to = ch;
258 break;
259 }
260 default: Py_UNREACHABLE();
261 }
262}
263
264
Victor Stinner8a1a6cf2013-04-14 02:35:33 +0200265/* Forward declaration */
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700266static inline int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +0200267_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch);
Inada Naoki770847a2019-06-24 12:30:24 +0900268static inline void
269_PyUnicodeWriter_InitWithBuffer(_PyUnicodeWriter *writer, PyObject *buffer);
Victor Stinner709d23d2019-05-02 14:56:30 -0400270static PyObject *
271unicode_encode_utf8(PyObject *unicode, _Py_error_handler error_handler,
272 const char *errors);
273static PyObject *
274unicode_decode_utf8(const char *s, Py_ssize_t size,
275 _Py_error_handler error_handler, const char *errors,
276 Py_ssize_t *consumed);
Victor Stinner8a1a6cf2013-04-14 02:35:33 +0200277
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200278/* List of static strings. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200279static _Py_Identifier *static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200280
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000281/* Single character Unicode strings in the Latin-1 range are being
282 shared as well. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200283static PyObject *unicode_latin1[256] = {NULL};
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000284
/* Fast detection of the most frequent whitespace characters: a table
   indexed by ASCII code point (0x00-0x7F) holding 1 for whitespace.

   Flagged entries:
     0x09 CHARACTER TABULATION   0x0A LINE FEED
     0x0B LINE TABULATION        0x0C FORM FEED
     0x0D CARRIAGE RETURN
     0x1C FILE SEPARATOR         0x1D GROUP SEPARATOR
     0x1E RECORD SEPARATOR       0x1F UNIT SEPARATOR
     0x20 SPACE */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00-0x0F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10-0x1F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20-0x2F */ 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30-0x3F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x40-0x4F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50-0x5F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60-0x6F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70-0x7F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
315
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200316/* forward */
Victor Stinnerfe226c02011-10-03 03:52:20 +0200317static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200318static PyObject* get_latin1_char(unsigned char ch);
Victor Stinner488fa492011-12-12 00:01:39 +0100319static int unicode_modifiable(PyObject *unicode);
320
Victor Stinnerfe226c02011-10-03 03:52:20 +0200321
Alexander Belopolsky40018472011-02-26 01:02:56 +0000322static PyObject *
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100323_PyUnicode_FromUCS1(const Py_UCS1 *s, Py_ssize_t size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200324static PyObject *
325_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
326static PyObject *
327_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
328
329static PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +0000330unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000331 PyObject **errorHandler,const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +0100332 PyObject *unicode, PyObject **exceptionObject,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000333 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
334
Alexander Belopolsky40018472011-02-26 01:02:56 +0000335static void
336raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300337 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +0100338 PyObject *unicode,
339 Py_ssize_t startpos, Py_ssize_t endpos,
340 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000341
/* Fast detection of line breaks: a table indexed by ASCII code point
   (0x00-0x7F) holding 1 for line-break characters.

   Flagged entries:
     0x0A LINE FEED              0x0B LINE TABULATION
     0x0C FORM FEED              0x0D CARRIAGE RETURN
     0x1C FILE SEPARATOR         0x1D GROUP SEPARATOR
     0x1E RECORD SEPARATOR */
static const unsigned char ascii_linebreak[] = {
    /* 0x00-0x0F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0,
    /* 0x10-0x1F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20-0x2F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30-0x3F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x40-0x4F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50-0x5F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60-0x6F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70-0x7F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
369
INADA Naoki3ae20562017-01-16 20:41:20 +0900370static int convert_uc(PyObject *obj, void *addr);
371
Serhiy Storchaka1009bf12015-04-03 23:53:51 +0300372#include "clinic/unicodeobject.c.h"
373
Victor Stinner3d4226a2018-08-29 22:21:32 +0200374_Py_error_handler
375_Py_GetErrorHandler(const char *errors)
Victor Stinner50149202015-09-22 00:26:54 +0200376{
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200377 if (errors == NULL || strcmp(errors, "strict") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200378 return _Py_ERROR_STRICT;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200379 }
380 if (strcmp(errors, "surrogateescape") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200381 return _Py_ERROR_SURROGATEESCAPE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200382 }
383 if (strcmp(errors, "replace") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200384 return _Py_ERROR_REPLACE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200385 }
386 if (strcmp(errors, "ignore") == 0) {
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200387 return _Py_ERROR_IGNORE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200388 }
389 if (strcmp(errors, "backslashreplace") == 0) {
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200390 return _Py_ERROR_BACKSLASHREPLACE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200391 }
392 if (strcmp(errors, "surrogatepass") == 0) {
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200393 return _Py_ERROR_SURROGATEPASS;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200394 }
395 if (strcmp(errors, "xmlcharrefreplace") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200396 return _Py_ERROR_XMLCHARREFREPLACE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200397 }
Victor Stinner50149202015-09-22 00:26:54 +0200398 return _Py_ERROR_OTHER;
399}
400
Victor Stinner709d23d2019-05-02 14:56:30 -0400401
402static _Py_error_handler
403get_error_handler_wide(const wchar_t *errors)
404{
405 if (errors == NULL || wcscmp(errors, L"strict") == 0) {
406 return _Py_ERROR_STRICT;
407 }
408 if (wcscmp(errors, L"surrogateescape") == 0) {
409 return _Py_ERROR_SURROGATEESCAPE;
410 }
411 if (wcscmp(errors, L"replace") == 0) {
412 return _Py_ERROR_REPLACE;
413 }
414 if (wcscmp(errors, L"ignore") == 0) {
415 return _Py_ERROR_IGNORE;
416 }
417 if (wcscmp(errors, L"backslashreplace") == 0) {
418 return _Py_ERROR_BACKSLASHREPLACE;
419 }
420 if (wcscmp(errors, L"surrogatepass") == 0) {
421 return _Py_ERROR_SURROGATEPASS;
422 }
423 if (wcscmp(errors, L"xmlcharrefreplace") == 0) {
424 return _Py_ERROR_XMLCHARREFREPLACE;
425 }
426 return _Py_ERROR_OTHER;
427}
428
429
Victor Stinner22eb6892019-06-26 00:51:05 +0200430static inline int
431unicode_check_encoding_errors(const char *encoding, const char *errors)
432{
433 if (encoding == NULL && errors == NULL) {
434 return 0;
435 }
436
437 PyInterpreterState *interp = _PyInterpreterState_GET_UNSAFE();
438#ifndef Py_DEBUG
439 /* In release mode, only check in development mode (-X dev) */
440 if (!interp->config.dev_mode) {
441 return 0;
442 }
443#else
444 /* Always check in debug mode */
445#endif
446
447 /* Avoid calling _PyCodec_Lookup() and PyCodec_LookupError() before the
448 codec registry is ready: before_PyUnicode_InitEncodings() is called. */
449 if (!interp->fs_codec.encoding) {
450 return 0;
451 }
452
453 if (encoding != NULL) {
454 PyObject *handler = _PyCodec_Lookup(encoding);
455 if (handler == NULL) {
456 return -1;
457 }
458 Py_DECREF(handler);
459 }
460
461 if (errors != NULL) {
462 PyObject *handler = PyCodec_LookupError(errors);
463 if (handler == NULL) {
464 return -1;
465 }
466 Py_DECREF(handler);
467 }
468 return 0;
469}
470
471
Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300472/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
473 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000474Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000475PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000476{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000477#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000478 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000479#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000480 /* This is actually an illegal character, so it should
481 not be passed to unichr. */
482 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000483#endif
484}
485
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +0200486int
Victor Stinner7931d9a2011-11-04 00:22:48 +0100487_PyUnicode_CheckConsistency(PyObject *op, int check_content)
Victor Stinner910337b2011-10-03 03:20:16 +0200488{
489 PyASCIIObject *ascii;
490 unsigned int kind;
491
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200492 _PyObject_ASSERT(op, PyUnicode_Check(op));
Victor Stinner910337b2011-10-03 03:20:16 +0200493
494 ascii = (PyASCIIObject *)op;
495 kind = ascii->state.kind;
496
Victor Stinnera3b334d2011-10-03 13:53:37 +0200497 if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200498 _PyObject_ASSERT(op, kind == PyUnicode_1BYTE_KIND);
499 _PyObject_ASSERT(op, ascii->state.ready == 1);
Victor Stinner910337b2011-10-03 03:20:16 +0200500 }
Victor Stinnera41463c2011-10-04 01:05:08 +0200501 else {
Victor Stinner85041a52011-10-03 14:42:39 +0200502 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
Victor Stinner7f11ad42011-10-04 00:00:20 +0200503 void *data;
Victor Stinner910337b2011-10-03 03:20:16 +0200504
Victor Stinnera41463c2011-10-04 01:05:08 +0200505 if (ascii->state.compact == 1) {
506 data = compact + 1;
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200507 _PyObject_ASSERT(op, kind == PyUnicode_1BYTE_KIND
508 || kind == PyUnicode_2BYTE_KIND
509 || kind == PyUnicode_4BYTE_KIND);
510 _PyObject_ASSERT(op, ascii->state.ascii == 0);
511 _PyObject_ASSERT(op, ascii->state.ready == 1);
512 _PyObject_ASSERT(op, compact->utf8 != data);
Victor Stinnere30c0a12011-11-04 20:54:05 +0100513 }
514 else {
Victor Stinnera41463c2011-10-04 01:05:08 +0200515 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
516
517 data = unicode->data.any;
518 if (kind == PyUnicode_WCHAR_KIND) {
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200519 _PyObject_ASSERT(op, ascii->length == 0);
520 _PyObject_ASSERT(op, ascii->hash == -1);
521 _PyObject_ASSERT(op, ascii->state.compact == 0);
522 _PyObject_ASSERT(op, ascii->state.ascii == 0);
523 _PyObject_ASSERT(op, ascii->state.ready == 0);
524 _PyObject_ASSERT(op, ascii->state.interned == SSTATE_NOT_INTERNED);
525 _PyObject_ASSERT(op, ascii->wstr != NULL);
526 _PyObject_ASSERT(op, data == NULL);
527 _PyObject_ASSERT(op, compact->utf8 == NULL);
Victor Stinnera41463c2011-10-04 01:05:08 +0200528 }
529 else {
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200530 _PyObject_ASSERT(op, kind == PyUnicode_1BYTE_KIND
531 || kind == PyUnicode_2BYTE_KIND
532 || kind == PyUnicode_4BYTE_KIND);
533 _PyObject_ASSERT(op, ascii->state.compact == 0);
534 _PyObject_ASSERT(op, ascii->state.ready == 1);
535 _PyObject_ASSERT(op, data != NULL);
Victor Stinnera41463c2011-10-04 01:05:08 +0200536 if (ascii->state.ascii) {
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200537 _PyObject_ASSERT(op, compact->utf8 == data);
538 _PyObject_ASSERT(op, compact->utf8_length == ascii->length);
Victor Stinnera41463c2011-10-04 01:05:08 +0200539 }
540 else
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200541 _PyObject_ASSERT(op, compact->utf8 != data);
Victor Stinnera41463c2011-10-04 01:05:08 +0200542 }
543 }
544 if (kind != PyUnicode_WCHAR_KIND) {
Victor Stinner7f11ad42011-10-04 00:00:20 +0200545 if (
546#if SIZEOF_WCHAR_T == 2
547 kind == PyUnicode_2BYTE_KIND
548#else
549 kind == PyUnicode_4BYTE_KIND
550#endif
551 )
Victor Stinnera41463c2011-10-04 01:05:08 +0200552 {
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200553 _PyObject_ASSERT(op, ascii->wstr == data);
554 _PyObject_ASSERT(op, compact->wstr_length == ascii->length);
Victor Stinnera41463c2011-10-04 01:05:08 +0200555 } else
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200556 _PyObject_ASSERT(op, ascii->wstr != data);
Victor Stinner910337b2011-10-03 03:20:16 +0200557 }
Victor Stinnera41463c2011-10-04 01:05:08 +0200558
559 if (compact->utf8 == NULL)
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200560 _PyObject_ASSERT(op, compact->utf8_length == 0);
Victor Stinnera41463c2011-10-04 01:05:08 +0200561 if (ascii->wstr == NULL)
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200562 _PyObject_ASSERT(op, compact->wstr_length == 0);
Victor Stinner910337b2011-10-03 03:20:16 +0200563 }
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200564
565 /* check that the best kind is used: O(n) operation */
566 if (check_content && kind != PyUnicode_WCHAR_KIND) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200567 Py_ssize_t i;
568 Py_UCS4 maxchar = 0;
Victor Stinner718fbf02012-04-26 00:39:37 +0200569 void *data;
570 Py_UCS4 ch;
571
572 data = PyUnicode_DATA(ascii);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200573 for (i=0; i < ascii->length; i++)
574 {
Victor Stinner718fbf02012-04-26 00:39:37 +0200575 ch = PyUnicode_READ(kind, data, i);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200576 if (ch > maxchar)
577 maxchar = ch;
578 }
579 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinner77faf692011-11-20 18:56:05 +0100580 if (ascii->state.ascii == 0) {
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200581 _PyObject_ASSERT(op, maxchar >= 128);
582 _PyObject_ASSERT(op, maxchar <= 255);
Victor Stinner77faf692011-11-20 18:56:05 +0100583 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200584 else
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200585 _PyObject_ASSERT(op, maxchar < 128);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200586 }
Victor Stinner77faf692011-11-20 18:56:05 +0100587 else if (kind == PyUnicode_2BYTE_KIND) {
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200588 _PyObject_ASSERT(op, maxchar >= 0x100);
589 _PyObject_ASSERT(op, maxchar <= 0xFFFF);
Victor Stinner77faf692011-11-20 18:56:05 +0100590 }
591 else {
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200592 _PyObject_ASSERT(op, maxchar >= 0x10000);
593 _PyObject_ASSERT(op, maxchar <= MAX_UNICODE);
Victor Stinner77faf692011-11-20 18:56:05 +0100594 }
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200595 _PyObject_ASSERT(op, PyUnicode_READ(kind, data, ascii->length) == 0);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200596 }
Benjamin Petersonccc51c12011-10-03 19:34:12 -0400597 return 1;
598}
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200599
Victor Stinner910337b2011-10-03 03:20:16 +0200600
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100601static PyObject*
602unicode_result_wchar(PyObject *unicode)
603{
604#ifndef Py_DEBUG
605 Py_ssize_t len;
606
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100607 len = _PyUnicode_WSTR_LENGTH(unicode);
608 if (len == 0) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100609 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200610 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100611 }
612
613 if (len == 1) {
614 wchar_t ch = _PyUnicode_WSTR(unicode)[0];
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100615 if ((Py_UCS4)ch < 256) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100616 PyObject *latin1_char = get_latin1_char((unsigned char)ch);
617 Py_DECREF(unicode);
618 return latin1_char;
619 }
620 }
621
622 if (_PyUnicode_Ready(unicode) < 0) {
Victor Stinneraa771272012-10-04 02:32:58 +0200623 Py_DECREF(unicode);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100624 return NULL;
625 }
626#else
Victor Stinneraa771272012-10-04 02:32:58 +0200627 assert(Py_REFCNT(unicode) == 1);
628
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100629 /* don't make the result ready in debug mode to ensure that the caller
630 makes the string ready before using it */
631 assert(_PyUnicode_CheckConsistency(unicode, 1));
632#endif
633 return unicode;
634}
635
636static PyObject*
637unicode_result_ready(PyObject *unicode)
638{
639 Py_ssize_t length;
640
641 length = PyUnicode_GET_LENGTH(unicode);
642 if (length == 0) {
643 if (unicode != unicode_empty) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100644 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200645 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100646 }
647 return unicode_empty;
648 }
649
650 if (length == 1) {
Victor Stinner69ed0f42013-04-09 21:48:24 +0200651 void *data = PyUnicode_DATA(unicode);
652 int kind = PyUnicode_KIND(unicode);
653 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100654 if (ch < 256) {
655 PyObject *latin1_char = unicode_latin1[ch];
656 if (latin1_char != NULL) {
657 if (unicode != latin1_char) {
658 Py_INCREF(latin1_char);
659 Py_DECREF(unicode);
660 }
661 return latin1_char;
662 }
663 else {
664 assert(_PyUnicode_CheckConsistency(unicode, 1));
665 Py_INCREF(unicode);
666 unicode_latin1[ch] = unicode;
667 return unicode;
668 }
669 }
670 }
671
672 assert(_PyUnicode_CheckConsistency(unicode, 1));
673 return unicode;
674}
675
676static PyObject*
677unicode_result(PyObject *unicode)
678{
679 assert(_PyUnicode_CHECK(unicode));
680 if (PyUnicode_IS_READY(unicode))
681 return unicode_result_ready(unicode);
682 else
683 return unicode_result_wchar(unicode);
684}
685
Victor Stinnerc4b49542011-12-11 22:44:26 +0100686static PyObject*
687unicode_result_unchanged(PyObject *unicode)
688{
689 if (PyUnicode_CheckExact(unicode)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -0500690 if (PyUnicode_READY(unicode) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +0100691 return NULL;
692 Py_INCREF(unicode);
693 return unicode;
694 }
695 else
696 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +0100697 return _PyUnicode_Copy(unicode);
Victor Stinnerc4b49542011-12-11 22:44:26 +0100698}
699
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200700/* Implementation of the "backslashreplace" error handler for 8-bit encodings:
701 ASCII, Latin1, UTF-8, etc. */
702static char*
Victor Stinnerad771582015-10-09 12:38:53 +0200703backslashreplace(_PyBytesWriter *writer, char *str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200704 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
705{
Victor Stinnerad771582015-10-09 12:38:53 +0200706 Py_ssize_t size, i;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200707 Py_UCS4 ch;
708 enum PyUnicode_Kind kind;
709 void *data;
710
711 assert(PyUnicode_IS_READY(unicode));
712 kind = PyUnicode_KIND(unicode);
713 data = PyUnicode_DATA(unicode);
714
715 size = 0;
716 /* determine replacement size */
717 for (i = collstart; i < collend; ++i) {
718 Py_ssize_t incr;
719
720 ch = PyUnicode_READ(kind, data, i);
721 if (ch < 0x100)
722 incr = 2+2;
723 else if (ch < 0x10000)
724 incr = 2+4;
725 else {
726 assert(ch <= MAX_UNICODE);
Victor Stinner3fa36ff2015-10-09 03:37:11 +0200727 incr = 2+8;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200728 }
729 if (size > PY_SSIZE_T_MAX - incr) {
730 PyErr_SetString(PyExc_OverflowError,
731 "encoded result is too long for a Python string");
732 return NULL;
733 }
734 size += incr;
735 }
736
Victor Stinnerad771582015-10-09 12:38:53 +0200737 str = _PyBytesWriter_Prepare(writer, str, size);
738 if (str == NULL)
739 return NULL;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200740
741 /* generate replacement */
742 for (i = collstart; i < collend; ++i) {
743 ch = PyUnicode_READ(kind, data, i);
Victor Stinner797485e2015-10-09 03:17:30 +0200744 *str++ = '\\';
745 if (ch >= 0x00010000) {
746 *str++ = 'U';
747 *str++ = Py_hexdigits[(ch>>28)&0xf];
748 *str++ = Py_hexdigits[(ch>>24)&0xf];
749 *str++ = Py_hexdigits[(ch>>20)&0xf];
750 *str++ = Py_hexdigits[(ch>>16)&0xf];
751 *str++ = Py_hexdigits[(ch>>12)&0xf];
752 *str++ = Py_hexdigits[(ch>>8)&0xf];
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200753 }
Victor Stinner797485e2015-10-09 03:17:30 +0200754 else if (ch >= 0x100) {
755 *str++ = 'u';
756 *str++ = Py_hexdigits[(ch>>12)&0xf];
757 *str++ = Py_hexdigits[(ch>>8)&0xf];
758 }
759 else
760 *str++ = 'x';
761 *str++ = Py_hexdigits[(ch>>4)&0xf];
762 *str++ = Py_hexdigits[ch&0xf];
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200763 }
764 return str;
765}
766
767/* Implementation of the "xmlcharrefreplace" error handler for 8-bit encodings:
768 ASCII, Latin1, UTF-8, etc. */
769static char*
Victor Stinnerad771582015-10-09 12:38:53 +0200770xmlcharrefreplace(_PyBytesWriter *writer, char *str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200771 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
772{
Victor Stinnerad771582015-10-09 12:38:53 +0200773 Py_ssize_t size, i;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200774 Py_UCS4 ch;
775 enum PyUnicode_Kind kind;
776 void *data;
777
778 assert(PyUnicode_IS_READY(unicode));
779 kind = PyUnicode_KIND(unicode);
780 data = PyUnicode_DATA(unicode);
781
782 size = 0;
783 /* determine replacement size */
784 for (i = collstart; i < collend; ++i) {
785 Py_ssize_t incr;
786
787 ch = PyUnicode_READ(kind, data, i);
788 if (ch < 10)
789 incr = 2+1+1;
790 else if (ch < 100)
791 incr = 2+2+1;
792 else if (ch < 1000)
793 incr = 2+3+1;
794 else if (ch < 10000)
795 incr = 2+4+1;
796 else if (ch < 100000)
797 incr = 2+5+1;
798 else if (ch < 1000000)
799 incr = 2+6+1;
800 else {
801 assert(ch <= MAX_UNICODE);
802 incr = 2+7+1;
803 }
804 if (size > PY_SSIZE_T_MAX - incr) {
805 PyErr_SetString(PyExc_OverflowError,
806 "encoded result is too long for a Python string");
807 return NULL;
808 }
809 size += incr;
810 }
811
Victor Stinnerad771582015-10-09 12:38:53 +0200812 str = _PyBytesWriter_Prepare(writer, str, size);
813 if (str == NULL)
814 return NULL;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200815
816 /* generate replacement */
817 for (i = collstart; i < collend; ++i) {
818 str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
819 }
820 return str;
821}
822
Thomas Wouters477c8d52006-05-27 19:21:47 +0000823/* --- Bloom Filters ----------------------------------------------------- */
824
/* Stuff to implement simple "bloom filters" for Unicode characters.
   To keep things simple, we use a single bitmask, using the low bits of
   each Unicode character (ch & (BLOOM_WIDTH - 1)) as the bit index. */
828
829/* the linebreak mask is set up by Unicode_Init below */
830
Antoine Pitrouf068f942010-01-13 14:19:12 +0000831#if LONG_BIT >= 128
832#define BLOOM_WIDTH 128
833#elif LONG_BIT >= 64
834#define BLOOM_WIDTH 64
835#elif LONG_BIT >= 32
836#define BLOOM_WIDTH 32
837#else
838#error "LONG_BIT is smaller than 32"
839#endif
840
Thomas Wouters477c8d52006-05-27 19:21:47 +0000841#define BLOOM_MASK unsigned long
842
Serhiy Storchaka05997252013-01-26 12:14:02 +0200843static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000844
Antoine Pitrouf068f942010-01-13 14:19:12 +0000845#define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000846
Benjamin Peterson29060642009-01-31 22:14:21 +0000847#define BLOOM_LINEBREAK(ch) \
848 ((ch) < 128U ? ascii_linebreak[(ch)] : \
849 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000850
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700851static inline BLOOM_MASK
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200852make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000853{
Victor Stinnera85af502013-04-09 21:53:54 +0200854#define BLOOM_UPDATE(TYPE, MASK, PTR, LEN) \
855 do { \
856 TYPE *data = (TYPE *)PTR; \
857 TYPE *end = data + LEN; \
858 Py_UCS4 ch; \
859 for (; data != end; data++) { \
860 ch = *data; \
861 MASK |= (1UL << (ch & (BLOOM_WIDTH - 1))); \
862 } \
863 break; \
864 } while (0)
865
Thomas Wouters477c8d52006-05-27 19:21:47 +0000866 /* calculate simple bloom-style bitmask for a given unicode string */
867
Antoine Pitrouf068f942010-01-13 14:19:12 +0000868 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000869
870 mask = 0;
Victor Stinnera85af502013-04-09 21:53:54 +0200871 switch (kind) {
872 case PyUnicode_1BYTE_KIND:
873 BLOOM_UPDATE(Py_UCS1, mask, ptr, len);
874 break;
875 case PyUnicode_2BYTE_KIND:
876 BLOOM_UPDATE(Py_UCS2, mask, ptr, len);
877 break;
878 case PyUnicode_4BYTE_KIND:
879 BLOOM_UPDATE(Py_UCS4, mask, ptr, len);
880 break;
881 default:
Barry Warsawb2e57942017-09-14 18:13:16 -0700882 Py_UNREACHABLE();
Victor Stinnera85af502013-04-09 21:53:54 +0200883 }
Thomas Wouters477c8d52006-05-27 19:21:47 +0000884 return mask;
Victor Stinnera85af502013-04-09 21:53:54 +0200885
886#undef BLOOM_UPDATE
Thomas Wouters477c8d52006-05-27 19:21:47 +0000887}
888
Serhiy Storchaka21a663e2016-04-13 15:37:23 +0300889static int
890ensure_unicode(PyObject *obj)
891{
892 if (!PyUnicode_Check(obj)) {
893 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +0200894 "must be str, not %.100s",
895 Py_TYPE(obj)->tp_name);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +0300896 return -1;
897 }
898 return PyUnicode_READY(obj);
899}
900
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200901/* Compilation of templated routines */
902
903#include "stringlib/asciilib.h"
904#include "stringlib/fastsearch.h"
905#include "stringlib/partition.h"
906#include "stringlib/split.h"
907#include "stringlib/count.h"
908#include "stringlib/find.h"
909#include "stringlib/find_max_char.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200910#include "stringlib/undef.h"
911
912#include "stringlib/ucs1lib.h"
913#include "stringlib/fastsearch.h"
914#include "stringlib/partition.h"
915#include "stringlib/split.h"
916#include "stringlib/count.h"
917#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300918#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200919#include "stringlib/find_max_char.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200920#include "stringlib/undef.h"
921
922#include "stringlib/ucs2lib.h"
923#include "stringlib/fastsearch.h"
924#include "stringlib/partition.h"
925#include "stringlib/split.h"
926#include "stringlib/count.h"
927#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300928#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200929#include "stringlib/find_max_char.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200930#include "stringlib/undef.h"
931
932#include "stringlib/ucs4lib.h"
933#include "stringlib/fastsearch.h"
934#include "stringlib/partition.h"
935#include "stringlib/split.h"
936#include "stringlib/count.h"
937#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300938#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200939#include "stringlib/find_max_char.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200940#include "stringlib/undef.h"
941
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200942#include "stringlib/unicodedefs.h"
943#include "stringlib/fastsearch.h"
944#include "stringlib/count.h"
945#include "stringlib/find.h"
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100946#include "stringlib/undef.h"
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200947
Guido van Rossumd57fd912000-03-10 22:53:23 +0000948/* --- Unicode Object ----------------------------------------------------- */
949
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700950static inline Py_ssize_t
951findchar(const void *s, int kind,
952 Py_ssize_t size, Py_UCS4 ch,
953 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200954{
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200955 switch (kind) {
956 case PyUnicode_1BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200957 if ((Py_UCS1) ch != ch)
958 return -1;
959 if (direction > 0)
960 return ucs1lib_find_char((Py_UCS1 *) s, size, (Py_UCS1) ch);
961 else
962 return ucs1lib_rfind_char((Py_UCS1 *) s, size, (Py_UCS1) ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200963 case PyUnicode_2BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200964 if ((Py_UCS2) ch != ch)
965 return -1;
966 if (direction > 0)
967 return ucs2lib_find_char((Py_UCS2 *) s, size, (Py_UCS2) ch);
968 else
969 return ucs2lib_rfind_char((Py_UCS2 *) s, size, (Py_UCS2) ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200970 case PyUnicode_4BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200971 if (direction > 0)
972 return ucs4lib_find_char((Py_UCS4 *) s, size, ch);
973 else
974 return ucs4lib_rfind_char((Py_UCS4 *) s, size, ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200975 default:
Barry Warsawb2e57942017-09-14 18:13:16 -0700976 Py_UNREACHABLE();
Victor Stinner9e7a1bc2011-10-13 00:18:12 +0200977 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200978}
979
#ifdef Py_DEBUG
/* Debug helper: overwrite the characters added by a resize with 0xff bytes
   so that code reading them before they are initialized is caught early.

   _PyUnicode_CheckConsistency(str, 1) detects invalid characters, at least for
   ASCII and UCS-4 strings. U+00FF is invalid in ASCII and U+FFFFFFFF is an
   invalid character in Unicode 6.0.

   `old_length` is the length before the resize; nothing is written when the
   string did not grow. */
static void
unicode_fill_invalid(PyObject *unicode, Py_ssize_t old_length)
{
    int char_size = PyUnicode_KIND(unicode);
    Py_UCS1 *bytes = PyUnicode_1BYTE_DATA(unicode);
    Py_ssize_t new_length = _PyUnicode_LENGTH(unicode);

    if (new_length > old_length) {
        memset(bytes + old_length * char_size, 0xff,
               (new_length - old_length) * char_size);
    }
}
#endif
998
Victor Stinnerfe226c02011-10-03 03:52:20 +0200999static PyObject*
1000resize_compact(PyObject *unicode, Py_ssize_t length)
1001{
1002 Py_ssize_t char_size;
1003 Py_ssize_t struct_size;
1004 Py_ssize_t new_size;
1005 int share_wstr;
Victor Stinner84def372011-12-11 20:04:56 +01001006 PyObject *new_unicode;
Victor Stinnerafffce42012-10-03 23:03:17 +02001007#ifdef Py_DEBUG
1008 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
1009#endif
1010
Victor Stinner79891572012-05-03 13:43:07 +02001011 assert(unicode_modifiable(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001012 assert(PyUnicode_IS_READY(unicode));
Victor Stinner488fa492011-12-12 00:01:39 +01001013 assert(PyUnicode_IS_COMPACT(unicode));
1014
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001015 char_size = PyUnicode_KIND(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +01001016 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerfe226c02011-10-03 03:52:20 +02001017 struct_size = sizeof(PyASCIIObject);
1018 else
1019 struct_size = sizeof(PyCompactUnicodeObject);
Victor Stinnerc379ead2011-10-03 12:52:27 +02001020 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001021
Victor Stinnerfe226c02011-10-03 03:52:20 +02001022 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
1023 PyErr_NoMemory();
1024 return NULL;
1025 }
1026 new_size = (struct_size + (length + 1) * char_size);
1027
Serhiy Storchaka7aa69082015-12-03 01:02:03 +02001028 if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) {
1029 PyObject_DEL(_PyUnicode_UTF8(unicode));
1030 _PyUnicode_UTF8(unicode) = NULL;
1031 _PyUnicode_UTF8_LENGTH(unicode) = 0;
1032 }
Victor Stinner84def372011-12-11 20:04:56 +01001033 _Py_DEC_REFTOTAL;
1034 _Py_ForgetReference(unicode);
1035
Serhiy Storchaka20b39b22014-09-28 11:27:24 +03001036 new_unicode = (PyObject *)PyObject_REALLOC(unicode, new_size);
Victor Stinner84def372011-12-11 20:04:56 +01001037 if (new_unicode == NULL) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001038 _Py_NewReference(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001039 PyErr_NoMemory();
1040 return NULL;
1041 }
Victor Stinner84def372011-12-11 20:04:56 +01001042 unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001043 _Py_NewReference(unicode);
Victor Stinner84def372011-12-11 20:04:56 +01001044
Victor Stinnerfe226c02011-10-03 03:52:20 +02001045 _PyUnicode_LENGTH(unicode) = length;
Victor Stinnerc379ead2011-10-03 12:52:27 +02001046 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001047 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +01001048 if (!PyUnicode_IS_ASCII(unicode))
Victor Stinnerc379ead2011-10-03 12:52:27 +02001049 _PyUnicode_WSTR_LENGTH(unicode) = length;
1050 }
Victor Stinnerbbbac2e2013-02-07 23:12:46 +01001051 else if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) {
1052 PyObject_DEL(_PyUnicode_WSTR(unicode));
1053 _PyUnicode_WSTR(unicode) = NULL;
Victor Stinner5bc03a62016-01-27 16:56:53 +01001054 if (!PyUnicode_IS_ASCII(unicode))
1055 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Victor Stinnerbbbac2e2013-02-07 23:12:46 +01001056 }
Victor Stinnerafffce42012-10-03 23:03:17 +02001057#ifdef Py_DEBUG
1058 unicode_fill_invalid(unicode, old_length);
1059#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001060 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
1061 length, 0);
Victor Stinner79891572012-05-03 13:43:07 +02001062 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001063 return unicode;
1064}
1065
Alexander Belopolsky40018472011-02-26 01:02:56 +00001066static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001067resize_inplace(PyObject *unicode, Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001068{
Victor Stinner95663112011-10-04 01:03:50 +02001069 wchar_t *wstr;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001070 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001071 assert(!PyUnicode_IS_COMPACT(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001072 assert(Py_REFCNT(unicode) == 1);
Tim Petersced69f82003-09-16 20:30:58 +00001073
Victor Stinnerfe226c02011-10-03 03:52:20 +02001074 if (PyUnicode_IS_READY(unicode)) {
1075 Py_ssize_t char_size;
Victor Stinner1c8d0c72011-10-03 12:11:00 +02001076 int share_wstr, share_utf8;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001077 void *data;
Victor Stinnerafffce42012-10-03 23:03:17 +02001078#ifdef Py_DEBUG
1079 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
1080#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001081
1082 data = _PyUnicode_DATA_ANY(unicode);
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001083 char_size = PyUnicode_KIND(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +02001084 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
1085 share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001086
1087 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
1088 PyErr_NoMemory();
1089 return -1;
1090 }
1091 new_size = (length + 1) * char_size;
1092
Victor Stinner7a9105a2011-12-12 00:13:42 +01001093 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
1094 {
1095 PyObject_DEL(_PyUnicode_UTF8(unicode));
1096 _PyUnicode_UTF8(unicode) = NULL;
1097 _PyUnicode_UTF8_LENGTH(unicode) = 0;
1098 }
1099
Victor Stinnerfe226c02011-10-03 03:52:20 +02001100 data = (PyObject *)PyObject_REALLOC(data, new_size);
1101 if (data == NULL) {
1102 PyErr_NoMemory();
1103 return -1;
1104 }
1105 _PyUnicode_DATA_ANY(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +02001106 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001107 _PyUnicode_WSTR(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +02001108 _PyUnicode_WSTR_LENGTH(unicode) = length;
1109 }
1110 if (share_utf8) {
Victor Stinner1c8d0c72011-10-03 12:11:00 +02001111 _PyUnicode_UTF8(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +02001112 _PyUnicode_UTF8_LENGTH(unicode) = length;
1113 }
Victor Stinnerfe226c02011-10-03 03:52:20 +02001114 _PyUnicode_LENGTH(unicode) = length;
1115 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
Victor Stinnerafffce42012-10-03 23:03:17 +02001116#ifdef Py_DEBUG
1117 unicode_fill_invalid(unicode, old_length);
1118#endif
Victor Stinner95663112011-10-04 01:03:50 +02001119 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001120 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001121 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001122 }
Victor Stinnerfe226c02011-10-03 03:52:20 +02001123 }
Victor Stinner95663112011-10-04 01:03:50 +02001124 assert(_PyUnicode_WSTR(unicode) != NULL);
1125
1126 /* check for integer overflow */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -07001127 if (length > PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) - 1) {
Victor Stinner95663112011-10-04 01:03:50 +02001128 PyErr_NoMemory();
1129 return -1;
1130 }
Victor Stinner7a9105a2011-12-12 00:13:42 +01001131 new_size = sizeof(wchar_t) * (length + 1);
Victor Stinner95663112011-10-04 01:03:50 +02001132 wstr = _PyUnicode_WSTR(unicode);
Victor Stinner7a9105a2011-12-12 00:13:42 +01001133 wstr = PyObject_REALLOC(wstr, new_size);
Victor Stinner95663112011-10-04 01:03:50 +02001134 if (!wstr) {
1135 PyErr_NoMemory();
1136 return -1;
1137 }
1138 _PyUnicode_WSTR(unicode) = wstr;
1139 _PyUnicode_WSTR(unicode)[length] = 0;
1140 _PyUnicode_WSTR_LENGTH(unicode) = length;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001141 assert(_PyUnicode_CheckConsistency(unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001142 return 0;
1143}
1144
Victor Stinnerfe226c02011-10-03 03:52:20 +02001145static PyObject*
1146resize_copy(PyObject *unicode, Py_ssize_t length)
1147{
1148 Py_ssize_t copy_length;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001149 if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001150 PyObject *copy;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001151
Serhiy Storchaka2e58f1a2016-10-09 23:44:48 +03001152 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001153
1154 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
1155 if (copy == NULL)
1156 return NULL;
1157
1158 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
Victor Stinnerd3f08822012-05-29 12:57:52 +02001159 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, copy_length);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001160 return copy;
Victor Stinner8cfcbed2011-10-03 23:19:21 +02001161 }
1162 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001163 PyObject *w;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001164
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001165 w = (PyObject*)_PyUnicode_New(length);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001166 if (w == NULL)
1167 return NULL;
1168 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
1169 copy_length = Py_MIN(copy_length, length);
Christian Heimesf051e432016-09-13 20:22:02 +02001170 memcpy(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
Victor Stinnerc6cf1ba2012-10-23 02:54:47 +02001171 copy_length * sizeof(wchar_t));
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001172 return w;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001173 }
1174}
1175
Guido van Rossumd57fd912000-03-10 22:53:23 +00001176/* We allocate one more byte to make sure the string is
Martin v. Löwis47383402007-08-15 07:32:56 +00001177 Ux0000 terminated; some code (e.g. new_identifier)
1178 relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +00001179
1180 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +00001181 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +00001182
1183*/
1184
Alexander Belopolsky40018472011-02-26 01:02:56 +00001185static PyUnicodeObject *
1186_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001187{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001188 PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001189 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001190
Thomas Wouters477c8d52006-05-27 19:21:47 +00001191 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001192 if (length == 0 && unicode_empty != NULL) {
1193 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001194 return (PyUnicodeObject*)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001195 }
1196
Neal Norwitz3ce5d922008-08-24 07:08:55 +00001197 /* Ensure we won't overflow the size. */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -07001198 if (length > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00001199 return (PyUnicodeObject *)PyErr_NoMemory();
1200 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001201 if (length < 0) {
1202 PyErr_SetString(PyExc_SystemError,
1203 "Negative size passed to _PyUnicode_New");
1204 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001205 }
1206
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001207 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
1208 if (unicode == NULL)
1209 return NULL;
1210 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
Victor Stinner68b674c2013-10-29 19:31:43 +01001211
1212 _PyUnicode_WSTR_LENGTH(unicode) = length;
1213 _PyUnicode_HASH(unicode) = -1;
1214 _PyUnicode_STATE(unicode).interned = 0;
1215 _PyUnicode_STATE(unicode).kind = 0;
1216 _PyUnicode_STATE(unicode).compact = 0;
1217 _PyUnicode_STATE(unicode).ready = 0;
1218 _PyUnicode_STATE(unicode).ascii = 0;
1219 _PyUnicode_DATA_ANY(unicode) = NULL;
1220 _PyUnicode_LENGTH(unicode) = 0;
1221 _PyUnicode_UTF8(unicode) = NULL;
1222 _PyUnicode_UTF8_LENGTH(unicode) = 0;
1223
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001224 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
1225 if (!_PyUnicode_WSTR(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001226 Py_DECREF(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00001227 PyErr_NoMemory();
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001228 return NULL;
Guido van Rossum3c1bb802000-04-27 20:13:50 +00001229 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001230
Jeremy Hyltond8082792003-09-16 19:41:39 +00001231 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +00001232 * the caller fails before initializing str -- unicode_resize()
1233 * reads str[0], and the Keep-Alive optimization can keep memory
1234 * allocated for str alive across a call to unicode_dealloc(unicode).
1235 * We don't want unicode_resize to read uninitialized memory in
1236 * that case.
1237 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001238 _PyUnicode_WSTR(unicode)[0] = 0;
1239 _PyUnicode_WSTR(unicode)[length] = 0;
Victor Stinner68b674c2013-10-29 19:31:43 +01001240
Victor Stinner7931d9a2011-11-04 00:22:48 +01001241 assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001242 return unicode;
1243}
1244
Victor Stinnerf42dc442011-10-02 23:33:16 +02001245static const char*
1246unicode_kind_name(PyObject *unicode)
1247{
Victor Stinner42dfd712011-10-03 14:41:45 +02001248 /* don't check consistency: unicode_kind_name() is called from
1249 _PyUnicode_Dump() */
Victor Stinnerf42dc442011-10-02 23:33:16 +02001250 if (!PyUnicode_IS_COMPACT(unicode))
1251 {
1252 if (!PyUnicode_IS_READY(unicode))
1253 return "wstr";
Benjamin Petersonead6b532011-12-20 17:23:42 -06001254 switch (PyUnicode_KIND(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001255 {
1256 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001257 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001258 return "legacy ascii";
1259 else
1260 return "legacy latin1";
1261 case PyUnicode_2BYTE_KIND:
1262 return "legacy UCS2";
1263 case PyUnicode_4BYTE_KIND:
1264 return "legacy UCS4";
1265 default:
1266 return "<legacy invalid kind>";
1267 }
1268 }
1269 assert(PyUnicode_IS_READY(unicode));
Benjamin Petersonead6b532011-12-20 17:23:42 -06001270 switch (PyUnicode_KIND(unicode)) {
Victor Stinnerf42dc442011-10-02 23:33:16 +02001271 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001272 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001273 return "ascii";
1274 else
Victor Stinnera3b334d2011-10-03 13:53:37 +02001275 return "latin1";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001276 case PyUnicode_2BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001277 return "UCS2";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001278 case PyUnicode_4BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001279 return "UCS4";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001280 default:
1281 return "<invalid compact kind>";
1282 }
1283}
1284
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001285#ifdef Py_DEBUG
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001286/* Functions wrapping macros for use in debugger */
Victor Stinnera42de742018-11-22 10:25:22 +01001287char *_PyUnicode_utf8(void *unicode_raw){
1288 PyObject *unicode = _PyObject_CAST(unicode_raw);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001289 return PyUnicode_UTF8(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001290}
1291
Victor Stinnera42de742018-11-22 10:25:22 +01001292void *_PyUnicode_compact_data(void *unicode_raw) {
1293 PyObject *unicode = _PyObject_CAST(unicode_raw);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001294 return _PyUnicode_COMPACT_DATA(unicode);
1295}
Victor Stinnera42de742018-11-22 10:25:22 +01001296void *_PyUnicode_data(void *unicode_raw) {
1297 PyObject *unicode = _PyObject_CAST(unicode_raw);
Zackery Spytz1a2252e2019-05-06 10:56:51 -06001298 printf("obj %p\n", (void*)unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001299 printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
1300 printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
1301 printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
1302 printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
1303 printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
1304 return PyUnicode_DATA(unicode);
1305}
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001306
1307void
1308_PyUnicode_Dump(PyObject *op)
1309{
1310 PyASCIIObject *ascii = (PyASCIIObject *)op;
Victor Stinnera849a4b2011-10-03 12:12:11 +02001311 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
1312 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
1313 void *data;
Victor Stinner0d60e872011-10-23 19:47:19 +02001314
Victor Stinnera849a4b2011-10-03 12:12:11 +02001315 if (ascii->state.compact)
Victor Stinner0d60e872011-10-23 19:47:19 +02001316 {
1317 if (ascii->state.ascii)
1318 data = (ascii + 1);
1319 else
1320 data = (compact + 1);
1321 }
Victor Stinnera849a4b2011-10-03 12:12:11 +02001322 else
1323 data = unicode->data.any;
Victor Stinner293f3f52014-07-01 08:57:10 +02001324 printf("%s: len=%" PY_FORMAT_SIZE_T "u, ",
1325 unicode_kind_name(op), ascii->length);
Victor Stinner0d60e872011-10-23 19:47:19 +02001326
Victor Stinnera849a4b2011-10-03 12:12:11 +02001327 if (ascii->wstr == data)
1328 printf("shared ");
Zackery Spytz1a2252e2019-05-06 10:56:51 -06001329 printf("wstr=%p", (void *)ascii->wstr);
Victor Stinner0d60e872011-10-23 19:47:19 +02001330
Victor Stinnera3b334d2011-10-03 13:53:37 +02001331 if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
Victor Stinner293f3f52014-07-01 08:57:10 +02001332 printf(" (%" PY_FORMAT_SIZE_T "u), ", compact->wstr_length);
Victor Stinnera849a4b2011-10-03 12:12:11 +02001333 if (!ascii->state.compact && compact->utf8 == unicode->data.any)
1334 printf("shared ");
Victor Stinner293f3f52014-07-01 08:57:10 +02001335 printf("utf8=%p (%" PY_FORMAT_SIZE_T "u)",
Zackery Spytz1a2252e2019-05-06 10:56:51 -06001336 (void *)compact->utf8, compact->utf8_length);
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001337 }
Victor Stinnera849a4b2011-10-03 12:12:11 +02001338 printf(", data=%p\n", data);
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001339}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001340#endif
1341
1342PyObject *
1343PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
1344{
1345 PyObject *obj;
1346 PyCompactUnicodeObject *unicode;
1347 void *data;
Victor Stinner8f825062012-04-27 13:55:39 +02001348 enum PyUnicode_Kind kind;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001349 int is_sharing, is_ascii;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001350 Py_ssize_t char_size;
1351 Py_ssize_t struct_size;
1352
1353 /* Optimization for empty strings */
1354 if (size == 0 && unicode_empty != NULL) {
1355 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001356 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001357 }
1358
Victor Stinner9e9d6892011-10-04 01:02:02 +02001359 is_ascii = 0;
1360 is_sharing = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001361 struct_size = sizeof(PyCompactUnicodeObject);
1362 if (maxchar < 128) {
Victor Stinner8f825062012-04-27 13:55:39 +02001363 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001364 char_size = 1;
1365 is_ascii = 1;
1366 struct_size = sizeof(PyASCIIObject);
1367 }
1368 else if (maxchar < 256) {
Victor Stinner8f825062012-04-27 13:55:39 +02001369 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001370 char_size = 1;
1371 }
1372 else if (maxchar < 65536) {
Victor Stinner8f825062012-04-27 13:55:39 +02001373 kind = PyUnicode_2BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001374 char_size = 2;
1375 if (sizeof(wchar_t) == 2)
1376 is_sharing = 1;
1377 }
1378 else {
Victor Stinnerc9590ad2012-03-04 01:34:37 +01001379 if (maxchar > MAX_UNICODE) {
1380 PyErr_SetString(PyExc_SystemError,
1381 "invalid maximum character passed to PyUnicode_New");
1382 return NULL;
1383 }
Victor Stinner8f825062012-04-27 13:55:39 +02001384 kind = PyUnicode_4BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001385 char_size = 4;
1386 if (sizeof(wchar_t) == 4)
1387 is_sharing = 1;
1388 }
1389
1390 /* Ensure we won't overflow the size. */
1391 if (size < 0) {
1392 PyErr_SetString(PyExc_SystemError,
1393 "Negative size passed to PyUnicode_New");
1394 return NULL;
1395 }
1396 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1397 return PyErr_NoMemory();
1398
1399 /* Duplicated allocation code from _PyObject_New() instead of a call to
1400 * PyObject_New() so we are able to allocate space for the object and
1401 * it's data buffer.
1402 */
1403 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
1404 if (obj == NULL)
1405 return PyErr_NoMemory();
1406 obj = PyObject_INIT(obj, &PyUnicode_Type);
1407 if (obj == NULL)
1408 return NULL;
1409
1410 unicode = (PyCompactUnicodeObject *)obj;
1411 if (is_ascii)
1412 data = ((PyASCIIObject*)obj) + 1;
1413 else
1414 data = unicode + 1;
1415 _PyUnicode_LENGTH(unicode) = size;
1416 _PyUnicode_HASH(unicode) = -1;
1417 _PyUnicode_STATE(unicode).interned = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001418 _PyUnicode_STATE(unicode).kind = kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001419 _PyUnicode_STATE(unicode).compact = 1;
1420 _PyUnicode_STATE(unicode).ready = 1;
1421 _PyUnicode_STATE(unicode).ascii = is_ascii;
1422 if (is_ascii) {
1423 ((char*)data)[size] = 0;
1424 _PyUnicode_WSTR(unicode) = NULL;
1425 }
Victor Stinner8f825062012-04-27 13:55:39 +02001426 else if (kind == PyUnicode_1BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001427 ((char*)data)[size] = 0;
1428 _PyUnicode_WSTR(unicode) = NULL;
1429 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001430 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001431 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001432 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001433 else {
1434 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001435 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001436 if (kind == PyUnicode_2BYTE_KIND)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001437 ((Py_UCS2*)data)[size] = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001438 else /* kind == PyUnicode_4BYTE_KIND */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001439 ((Py_UCS4*)data)[size] = 0;
1440 if (is_sharing) {
1441 _PyUnicode_WSTR_LENGTH(unicode) = size;
1442 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
1443 }
1444 else {
1445 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1446 _PyUnicode_WSTR(unicode) = NULL;
1447 }
1448 }
Victor Stinner8f825062012-04-27 13:55:39 +02001449#ifdef Py_DEBUG
Victor Stinnerafffce42012-10-03 23:03:17 +02001450 unicode_fill_invalid((PyObject*)unicode, 0);
Victor Stinner8f825062012-04-27 13:55:39 +02001451#endif
Victor Stinner7931d9a2011-11-04 00:22:48 +01001452 assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001453 return obj;
1454}
1455
#if SIZEOF_WCHAR_T == 2
/* Decode the 16-bit wchar_t range [begin, end) into the UCS4 buffer of
   `unicode`.  Every high surrogate directly followed by a low surrogate is
   combined into one code point; any other unit, including a lone surrogate,
   is copied as is.  All other width conversions are done with macros.

   `unicode` must be a 4-byte kind string whose length equals the number of
   decoded code points (checked by assertions only).  This function does not
   write the terminating null character; the buffer is expected to have room
   for it. */
static void
unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
                              PyObject *unicode)
{
    const wchar_t *src = begin;
    Py_UCS4 *dst;

    assert(unicode != NULL);
    assert(_PyUnicode_CHECK(unicode));
    assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
    dst = PyUnicode_4BYTE_DATA(unicode);

    while (src < end) {
        int is_pair;

        assert(dst < (PyUnicode_4BYTE_DATA(unicode) +
                      _PyUnicode_GET_LENGTH(unicode)));
        is_pair = (Py_UNICODE_IS_HIGH_SURROGATE(src[0])
                   && (src + 1) < end
                   && Py_UNICODE_IS_LOW_SURROGATE(src[1]));
        if (is_pair) {
            *dst = Py_UNICODE_JOIN_SURROGATES(src[0], src[1]);
            src += 2;
        }
        else {
            *dst = *src;
            src += 1;
        }
        dst++;
    }
    /* The output must have been filled exactly. */
    assert(dst == (PyUnicode_4BYTE_DATA(unicode) +
                   _PyUnicode_GET_LENGTH(unicode)));
}
#endif
1495
Victor Stinnercd9950f2011-10-02 00:34:53 +02001496static int
Victor Stinner488fa492011-12-12 00:01:39 +01001497unicode_check_modifiable(PyObject *unicode)
Victor Stinnercd9950f2011-10-02 00:34:53 +02001498{
Victor Stinner488fa492011-12-12 00:01:39 +01001499 if (!unicode_modifiable(unicode)) {
Victor Stinner01698042011-10-04 00:04:26 +02001500 PyErr_SetString(PyExc_SystemError,
Victor Stinner488fa492011-12-12 00:01:39 +01001501 "Cannot modify a string currently used");
Victor Stinnercd9950f2011-10-02 00:34:53 +02001502 return -1;
1503 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02001504 return 0;
1505}
1506
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001507static int
1508_copy_characters(PyObject *to, Py_ssize_t to_start,
1509 PyObject *from, Py_ssize_t from_start,
1510 Py_ssize_t how_many, int check_maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001511{
Victor Stinnera0702ab2011-09-29 14:14:38 +02001512 unsigned int from_kind, to_kind;
1513 void *from_data, *to_data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001514
Victor Stinneree4544c2012-05-09 22:24:08 +02001515 assert(0 <= how_many);
1516 assert(0 <= from_start);
1517 assert(0 <= to_start);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001518 assert(PyUnicode_Check(from));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001519 assert(PyUnicode_IS_READY(from));
Victor Stinneree4544c2012-05-09 22:24:08 +02001520 assert(from_start + how_many <= PyUnicode_GET_LENGTH(from));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001521
Victor Stinnerd3f08822012-05-29 12:57:52 +02001522 assert(PyUnicode_Check(to));
1523 assert(PyUnicode_IS_READY(to));
1524 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1525
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001526 if (how_many == 0)
1527 return 0;
1528
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001529 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001530 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001531 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001532 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001533
Victor Stinnerf1852262012-06-16 16:38:26 +02001534#ifdef Py_DEBUG
1535 if (!check_maxchar
1536 && PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to))
1537 {
1538 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1539 Py_UCS4 ch;
1540 Py_ssize_t i;
1541 for (i=0; i < how_many; i++) {
1542 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1543 assert(ch <= to_maxchar);
1544 }
1545 }
1546#endif
1547
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001548 if (from_kind == to_kind) {
Victor Stinnerf1852262012-06-16 16:38:26 +02001549 if (check_maxchar
1550 && !PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))
1551 {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001552 /* Writing Latin-1 characters into an ASCII string requires to
1553 check that all written characters are pure ASCII */
Victor Stinnerf1852262012-06-16 16:38:26 +02001554 Py_UCS4 max_char;
1555 max_char = ucs1lib_find_max_char(from_data,
1556 (Py_UCS1*)from_data + how_many);
1557 if (max_char >= 128)
1558 return -1;
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001559 }
Christian Heimesf051e432016-09-13 20:22:02 +02001560 memcpy((char*)to_data + to_kind * to_start,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001561 (char*)from_data + from_kind * from_start,
1562 to_kind * how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001563 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001564 else if (from_kind == PyUnicode_1BYTE_KIND
1565 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001566 {
1567 _PyUnicode_CONVERT_BYTES(
1568 Py_UCS1, Py_UCS2,
1569 PyUnicode_1BYTE_DATA(from) + from_start,
1570 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1571 PyUnicode_2BYTE_DATA(to) + to_start
1572 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001573 }
Victor Stinner157f83f2011-09-28 21:41:31 +02001574 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001575 && to_kind == PyUnicode_4BYTE_KIND)
1576 {
1577 _PyUnicode_CONVERT_BYTES(
1578 Py_UCS1, Py_UCS4,
1579 PyUnicode_1BYTE_DATA(from) + from_start,
1580 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1581 PyUnicode_4BYTE_DATA(to) + to_start
1582 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001583 }
1584 else if (from_kind == PyUnicode_2BYTE_KIND
1585 && to_kind == PyUnicode_4BYTE_KIND)
1586 {
1587 _PyUnicode_CONVERT_BYTES(
1588 Py_UCS2, Py_UCS4,
1589 PyUnicode_2BYTE_DATA(from) + from_start,
1590 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1591 PyUnicode_4BYTE_DATA(to) + to_start
1592 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001593 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001594 else {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001595 assert (PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to));
1596
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001597 if (!check_maxchar) {
1598 if (from_kind == PyUnicode_2BYTE_KIND
1599 && to_kind == PyUnicode_1BYTE_KIND)
1600 {
1601 _PyUnicode_CONVERT_BYTES(
1602 Py_UCS2, Py_UCS1,
1603 PyUnicode_2BYTE_DATA(from) + from_start,
1604 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1605 PyUnicode_1BYTE_DATA(to) + to_start
1606 );
1607 }
1608 else if (from_kind == PyUnicode_4BYTE_KIND
1609 && to_kind == PyUnicode_1BYTE_KIND)
1610 {
1611 _PyUnicode_CONVERT_BYTES(
1612 Py_UCS4, Py_UCS1,
1613 PyUnicode_4BYTE_DATA(from) + from_start,
1614 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1615 PyUnicode_1BYTE_DATA(to) + to_start
1616 );
1617 }
1618 else if (from_kind == PyUnicode_4BYTE_KIND
1619 && to_kind == PyUnicode_2BYTE_KIND)
1620 {
1621 _PyUnicode_CONVERT_BYTES(
1622 Py_UCS4, Py_UCS2,
1623 PyUnicode_4BYTE_DATA(from) + from_start,
1624 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1625 PyUnicode_2BYTE_DATA(to) + to_start
1626 );
1627 }
1628 else {
Barry Warsawb2e57942017-09-14 18:13:16 -07001629 Py_UNREACHABLE();
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001630 }
1631 }
Victor Stinnerf1852262012-06-16 16:38:26 +02001632 else {
Victor Stinnera0702ab2011-09-29 14:14:38 +02001633 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001634 Py_UCS4 ch;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001635 Py_ssize_t i;
1636
Victor Stinnera0702ab2011-09-29 14:14:38 +02001637 for (i=0; i < how_many; i++) {
1638 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001639 if (ch > to_maxchar)
1640 return -1;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001641 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1642 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001643 }
1644 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001645 return 0;
1646}
1647
Victor Stinnerd3f08822012-05-29 12:57:52 +02001648void
1649_PyUnicode_FastCopyCharacters(
1650 PyObject *to, Py_ssize_t to_start,
1651 PyObject *from, Py_ssize_t from_start, Py_ssize_t how_many)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001652{
1653 (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1654}
1655
1656Py_ssize_t
1657PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1658 PyObject *from, Py_ssize_t from_start,
1659 Py_ssize_t how_many)
1660{
1661 int err;
1662
1663 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1664 PyErr_BadInternalCall();
1665 return -1;
1666 }
1667
Benjamin Petersonbac79492012-01-14 13:34:47 -05001668 if (PyUnicode_READY(from) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001669 return -1;
Benjamin Petersonbac79492012-01-14 13:34:47 -05001670 if (PyUnicode_READY(to) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001671 return -1;
1672
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001673 if ((size_t)from_start > (size_t)PyUnicode_GET_LENGTH(from)) {
Victor Stinnerd3f08822012-05-29 12:57:52 +02001674 PyErr_SetString(PyExc_IndexError, "string index out of range");
1675 return -1;
1676 }
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001677 if ((size_t)to_start > (size_t)PyUnicode_GET_LENGTH(to)) {
Victor Stinnerd3f08822012-05-29 12:57:52 +02001678 PyErr_SetString(PyExc_IndexError, "string index out of range");
1679 return -1;
1680 }
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001681 if (how_many < 0) {
1682 PyErr_SetString(PyExc_SystemError, "how_many cannot be negative");
1683 return -1;
1684 }
1685 how_many = Py_MIN(PyUnicode_GET_LENGTH(from)-from_start, how_many);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001686 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1687 PyErr_Format(PyExc_SystemError,
Victor Stinnera33bce02014-07-04 22:47:46 +02001688 "Cannot write %zi characters at %zi "
1689 "in a string of %zi characters",
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001690 how_many, to_start, PyUnicode_GET_LENGTH(to));
1691 return -1;
1692 }
1693
1694 if (how_many == 0)
1695 return 0;
1696
Victor Stinner488fa492011-12-12 00:01:39 +01001697 if (unicode_check_modifiable(to))
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001698 return -1;
1699
1700 err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1701 if (err) {
1702 PyErr_Format(PyExc_SystemError,
1703 "Cannot copy %s characters "
1704 "into a string of %s characters",
1705 unicode_kind_name(from),
1706 unicode_kind_name(to));
1707 return -1;
1708 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001709 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001710}
1711
Victor Stinner17222162011-09-28 22:15:37 +02001712/* Find the maximum code point and count the number of surrogate pairs so a
1713 correct string length can be computed before converting a string to UCS4.
1714 This function counts single surrogates as a character and not as a pair.
1715
1716 Return 0 on success, or -1 on error. */
1717static int
1718find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1719 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001720{
1721 const wchar_t *iter;
Victor Stinner8faf8212011-12-08 22:14:11 +01001722 Py_UCS4 ch;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001723
Victor Stinnerc53be962011-10-02 21:33:54 +02001724 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001725 *num_surrogates = 0;
1726 *maxchar = 0;
1727
1728 for (iter = begin; iter < end; ) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001729#if SIZEOF_WCHAR_T == 2
Victor Stinnercf77da92013-03-06 01:09:24 +01001730 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1731 && (iter+1) < end
1732 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
1733 {
1734 ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
1735 ++(*num_surrogates);
1736 iter += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001737 }
1738 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001739#endif
Victor Stinner8faf8212011-12-08 22:14:11 +01001740 {
1741 ch = *iter;
1742 iter++;
1743 }
1744 if (ch > *maxchar) {
1745 *maxchar = ch;
1746 if (*maxchar > MAX_UNICODE) {
1747 PyErr_Format(PyExc_ValueError,
1748 "character U+%x is not in range [U+0000; U+10ffff]",
1749 ch);
1750 return -1;
1751 }
1752 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001753 }
1754 return 0;
1755}
1756
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001757int
1758_PyUnicode_Ready(PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001759{
1760 wchar_t *end;
1761 Py_UCS4 maxchar = 0;
1762 Py_ssize_t num_surrogates;
1763#if SIZEOF_WCHAR_T == 2
1764 Py_ssize_t length_wo_surrogates;
1765#endif
1766
Georg Brandl7597add2011-10-05 16:36:47 +02001767 /* _PyUnicode_Ready() is only intended for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +02001768 strings were created using _PyObject_New() and where no canonical
1769 representation (the str field) has been set yet aka strings
1770 which are not yet ready. */
Victor Stinner910337b2011-10-03 03:20:16 +02001771 assert(_PyUnicode_CHECK(unicode));
1772 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001773 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerc3c74152011-10-02 20:39:55 +02001774 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001775 assert(_PyUnicode_UTF8(unicode) == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +02001776 /* Actually, it should neither be interned nor be anything else: */
1777 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001778
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001779 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +02001780 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +02001781 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001782 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001783
1784 if (maxchar < 256) {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001785 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1786 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001787 PyErr_NoMemory();
1788 return -1;
1789 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001790 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001791 _PyUnicode_WSTR(unicode), end,
1792 PyUnicode_1BYTE_DATA(unicode));
1793 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1794 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1795 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1796 if (maxchar < 128) {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001797 _PyUnicode_STATE(unicode).ascii = 1;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001798 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001799 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001800 }
1801 else {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001802 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001803 _PyUnicode_UTF8(unicode) = NULL;
1804 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001805 }
1806 PyObject_FREE(_PyUnicode_WSTR(unicode));
1807 _PyUnicode_WSTR(unicode) = NULL;
1808 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1809 }
1810 /* In this case we might have to convert down from 4-byte native
1811 wchar_t to 2-byte unicode. */
1812 else if (maxchar < 65536) {
1813 assert(num_surrogates == 0 &&
1814 "FindMaxCharAndNumSurrogatePairs() messed up");
1815
Victor Stinner506f5922011-09-28 22:34:18 +02001816#if SIZEOF_WCHAR_T == 2
1817 /* We can share representations and are done. */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001818 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Victor Stinner506f5922011-09-28 22:34:18 +02001819 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1820 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1821 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001822 _PyUnicode_UTF8(unicode) = NULL;
1823 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001824#else
1825 /* sizeof(wchar_t) == 4 */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001826 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
Victor Stinner506f5922011-09-28 22:34:18 +02001827 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
Victor Stinnerc3c74152011-10-02 20:39:55 +02001828 if (!_PyUnicode_DATA_ANY(unicode)) {
Victor Stinner506f5922011-09-28 22:34:18 +02001829 PyErr_NoMemory();
1830 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001831 }
Victor Stinner506f5922011-09-28 22:34:18 +02001832 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1833 _PyUnicode_WSTR(unicode), end,
1834 PyUnicode_2BYTE_DATA(unicode));
1835 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1836 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1837 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001838 _PyUnicode_UTF8(unicode) = NULL;
1839 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001840 PyObject_FREE(_PyUnicode_WSTR(unicode));
1841 _PyUnicode_WSTR(unicode) = NULL;
1842 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1843#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001844 }
1845 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
1846 else {
1847#if SIZEOF_WCHAR_T == 2
1848 /* in case the native representation is 2-bytes, we need to allocate a
1849 new normalized 4-byte version. */
1850 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
Serhiy Storchakae55181f2015-02-20 21:34:06 +02001851 if (length_wo_surrogates > PY_SSIZE_T_MAX / 4 - 1) {
1852 PyErr_NoMemory();
1853 return -1;
1854 }
Victor Stinnerc3c74152011-10-02 20:39:55 +02001855 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1856 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001857 PyErr_NoMemory();
1858 return -1;
1859 }
1860 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1861 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001862 _PyUnicode_UTF8(unicode) = NULL;
1863 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner126c5592011-10-03 04:17:10 +02001864 /* unicode_convert_wchar_to_ucs4() requires a ready string */
1865 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerc53be962011-10-02 21:33:54 +02001866 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001867 PyObject_FREE(_PyUnicode_WSTR(unicode));
1868 _PyUnicode_WSTR(unicode) = NULL;
1869 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1870#else
1871 assert(num_surrogates == 0);
1872
Victor Stinnerc3c74152011-10-02 20:39:55 +02001873 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001874 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001875 _PyUnicode_UTF8(unicode) = NULL;
1876 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001877 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1878#endif
1879 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1880 }
1881 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001882 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001883 return 0;
1884}
1885
Alexander Belopolsky40018472011-02-26 01:02:56 +00001886static void
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001887unicode_dealloc(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001888{
Walter Dörwald16807132007-05-25 13:52:07 +00001889 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001890 case SSTATE_NOT_INTERNED:
1891 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001892
Benjamin Peterson29060642009-01-31 22:14:21 +00001893 case SSTATE_INTERNED_MORTAL:
1894 /* revive dead object temporarily for DelItem */
1895 Py_REFCNT(unicode) = 3;
Victor Stinner7931d9a2011-11-04 00:22:48 +01001896 if (PyDict_DelItem(interned, unicode) != 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00001897 Py_FatalError(
1898 "deletion of interned string failed");
1899 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001900
Benjamin Peterson29060642009-01-31 22:14:21 +00001901 case SSTATE_INTERNED_IMMORTAL:
1902 Py_FatalError("Immortal interned string died.");
Stefan Krahf432a322017-08-21 13:09:59 +02001903 /* fall through */
Walter Dörwald16807132007-05-25 13:52:07 +00001904
Benjamin Peterson29060642009-01-31 22:14:21 +00001905 default:
1906 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +00001907 }
1908
Victor Stinner03490912011-10-03 23:45:12 +02001909 if (_PyUnicode_HAS_WSTR_MEMORY(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001910 PyObject_DEL(_PyUnicode_WSTR(unicode));
Victor Stinner829c0ad2011-10-03 01:08:02 +02001911 if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001912 PyObject_DEL(_PyUnicode_UTF8(unicode));
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001913 if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode))
1914 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001915
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001916 Py_TYPE(unicode)->tp_free(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001917}
1918
#ifdef Py_DEBUG
/* Debug helper: return 1 if `unicode` is the shared empty string or one of
   the cached single-character strings of unicode_latin1[], 0 otherwise. */
static int
unicode_is_singleton(PyObject *unicode)
{
    PyASCIIObject *ascii = (PyASCIIObject *)unicode;
    Py_UCS4 ch;

    if (unicode == unicode_empty)
        return 1;

    /* Only a string of exactly one character that already has a canonical
       representation can live in the latin1 cache. */
    if (ascii->state.kind == PyUnicode_WCHAR_KIND || ascii->length != 1)
        return 0;

    ch = PyUnicode_READ_CHAR(unicode, 0);
    return (ch < 256 && unicode_latin1[ch] == unicode);
}
#endif
1935
Alexander Belopolsky40018472011-02-26 01:02:56 +00001936static int
Victor Stinner488fa492011-12-12 00:01:39 +01001937unicode_modifiable(PyObject *unicode)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001938{
Victor Stinner488fa492011-12-12 00:01:39 +01001939 assert(_PyUnicode_CHECK(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001940 if (Py_REFCNT(unicode) != 1)
1941 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001942 if (_PyUnicode_HASH(unicode) != -1)
1943 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001944 if (PyUnicode_CHECK_INTERNED(unicode))
1945 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001946 if (!PyUnicode_CheckExact(unicode))
1947 return 0;
Victor Stinner77bb47b2011-10-03 20:06:05 +02001948#ifdef Py_DEBUG
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001949 /* singleton refcount is greater than 1 */
1950 assert(!unicode_is_singleton(unicode));
Victor Stinner77bb47b2011-10-03 20:06:05 +02001951#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001952 return 1;
1953}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001954
Victor Stinnerfe226c02011-10-03 03:52:20 +02001955static int
1956unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1957{
1958 PyObject *unicode;
1959 Py_ssize_t old_length;
1960
1961 assert(p_unicode != NULL);
1962 unicode = *p_unicode;
1963
1964 assert(unicode != NULL);
1965 assert(PyUnicode_Check(unicode));
1966 assert(0 <= length);
1967
Victor Stinner910337b2011-10-03 03:20:16 +02001968 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001969 old_length = PyUnicode_WSTR_LENGTH(unicode);
1970 else
1971 old_length = PyUnicode_GET_LENGTH(unicode);
1972 if (old_length == length)
1973 return 0;
1974
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001975 if (length == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +02001976 _Py_INCREF_UNICODE_EMPTY();
1977 if (!unicode_empty)
Benjamin Peterson29060642009-01-31 22:14:21 +00001978 return -1;
Serhiy Storchaka57a01d32016-04-10 18:05:40 +03001979 Py_SETREF(*p_unicode, unicode_empty);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001980 return 0;
1981 }
1982
Victor Stinner488fa492011-12-12 00:01:39 +01001983 if (!unicode_modifiable(unicode)) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001984 PyObject *copy = resize_copy(unicode, length);
1985 if (copy == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001986 return -1;
Serhiy Storchaka57a01d32016-04-10 18:05:40 +03001987 Py_SETREF(*p_unicode, copy);
Benjamin Peterson29060642009-01-31 22:14:21 +00001988 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001989 }
1990
Victor Stinnerfe226c02011-10-03 03:52:20 +02001991 if (PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001992 PyObject *new_unicode = resize_compact(unicode, length);
1993 if (new_unicode == NULL)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001994 return -1;
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001995 *p_unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001996 return 0;
Benjamin Peterson4bfce8f2011-10-03 19:35:07 -04001997 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001998 return resize_inplace(unicode, length);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001999}
2000
Alexander Belopolsky40018472011-02-26 01:02:56 +00002001int
Victor Stinnerfe226c02011-10-03 03:52:20 +02002002PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00002003{
Victor Stinnerfe226c02011-10-03 03:52:20 +02002004 PyObject *unicode;
2005 if (p_unicode == NULL) {
2006 PyErr_BadInternalCall();
2007 return -1;
2008 }
2009 unicode = *p_unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01002010 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
Victor Stinnerfe226c02011-10-03 03:52:20 +02002011 {
2012 PyErr_BadInternalCall();
2013 return -1;
2014 }
2015 return unicode_resize(p_unicode, length);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00002016}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002017
Serhiy Storchakad65c9492015-11-02 14:10:23 +02002018/* Copy an ASCII or latin1 char* string into a Python Unicode string.
Victor Stinnerc5166102012-02-22 13:55:02 +01002019
Victor Stinnerb429d3b2012-02-22 21:22:20 +01002020 WARNING: The function doesn't copy the terminating null character and
2021 doesn't check the maximum character (may write a latin1 character in an
2022 ASCII string). */
Victor Stinner184252a2012-06-16 02:57:41 +02002023static void
2024unicode_write_cstr(PyObject *unicode, Py_ssize_t index,
2025 const char *str, Py_ssize_t len)
Victor Stinnerc5166102012-02-22 13:55:02 +01002026{
2027 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
2028 void *data = PyUnicode_DATA(unicode);
Victor Stinner184252a2012-06-16 02:57:41 +02002029 const char *end = str + len;
Victor Stinnerc5166102012-02-22 13:55:02 +01002030
2031 switch (kind) {
2032 case PyUnicode_1BYTE_KIND: {
Victor Stinnerc5166102012-02-22 13:55:02 +01002033 assert(index + len <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner8c6db452012-10-06 00:40:45 +02002034#ifdef Py_DEBUG
2035 if (PyUnicode_IS_ASCII(unicode)) {
2036 Py_UCS4 maxchar = ucs1lib_find_max_char(
2037 (const Py_UCS1*)str,
2038 (const Py_UCS1*)str + len);
2039 assert(maxchar < 128);
2040 }
2041#endif
Antoine Pitrouba6bafc2012-02-22 16:41:50 +01002042 memcpy((char *) data + index, str, len);
Victor Stinner184252a2012-06-16 02:57:41 +02002043 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01002044 }
2045 case PyUnicode_2BYTE_KIND: {
2046 Py_UCS2 *start = (Py_UCS2 *)data + index;
2047 Py_UCS2 *ucs2 = start;
2048 assert(index <= PyUnicode_GET_LENGTH(unicode));
2049
Victor Stinner184252a2012-06-16 02:57:41 +02002050 for (; str < end; ++ucs2, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01002051 *ucs2 = (Py_UCS2)*str;
2052
2053 assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner184252a2012-06-16 02:57:41 +02002054 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01002055 }
2056 default: {
2057 Py_UCS4 *start = (Py_UCS4 *)data + index;
2058 Py_UCS4 *ucs4 = start;
2059 assert(kind == PyUnicode_4BYTE_KIND);
2060 assert(index <= PyUnicode_GET_LENGTH(unicode));
2061
Victor Stinner184252a2012-06-16 02:57:41 +02002062 for (; str < end; ++ucs4, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01002063 *ucs4 = (Py_UCS4)*str;
2064
2065 assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinnerc5166102012-02-22 13:55:02 +01002066 }
2067 }
2068}
2069
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002070static PyObject*
2071get_latin1_char(unsigned char ch)
2072{
Victor Stinnera464fc12011-10-02 20:39:30 +02002073 PyObject *unicode = unicode_latin1[ch];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002074 if (!unicode) {
Victor Stinnera464fc12011-10-02 20:39:30 +02002075 unicode = PyUnicode_New(1, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002076 if (!unicode)
2077 return NULL;
2078 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002079 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002080 unicode_latin1[ch] = unicode;
2081 }
2082 Py_INCREF(unicode);
Victor Stinnera464fc12011-10-02 20:39:30 +02002083 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002084}
2085
Victor Stinner985a82a2014-01-03 12:53:47 +01002086static PyObject*
2087unicode_char(Py_UCS4 ch)
2088{
2089 PyObject *unicode;
2090
2091 assert(ch <= MAX_UNICODE);
2092
Victor Stinnerf3b46b42014-01-03 13:16:00 +01002093 if (ch < 256)
2094 return get_latin1_char(ch);
2095
Victor Stinner985a82a2014-01-03 12:53:47 +01002096 unicode = PyUnicode_New(1, ch);
2097 if (unicode == NULL)
2098 return NULL;
Serhiy Storchaka2e58f1a2016-10-09 23:44:48 +03002099
2100 assert(PyUnicode_KIND(unicode) != PyUnicode_1BYTE_KIND);
2101 if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
Victor Stinner985a82a2014-01-03 12:53:47 +01002102 PyUnicode_2BYTE_DATA(unicode)[0] = (Py_UCS2)ch;
Serhiy Storchaka2e58f1a2016-10-09 23:44:48 +03002103 } else {
Victor Stinner985a82a2014-01-03 12:53:47 +01002104 assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
2105 PyUnicode_4BYTE_DATA(unicode)[0] = ch;
2106 }
2107 assert(_PyUnicode_CheckConsistency(unicode, 1));
2108 return unicode;
2109}
2110
Alexander Belopolsky40018472011-02-26 01:02:56 +00002111PyObject *
2112PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002113{
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02002114 if (u == NULL)
2115 return (PyObject*)_PyUnicode_New(size);
2116
2117 if (size < 0) {
2118 PyErr_BadInternalCall();
2119 return NULL;
2120 }
2121
2122 return PyUnicode_FromWideChar(u, size);
2123}
2124
2125PyObject *
2126PyUnicode_FromWideChar(const wchar_t *u, Py_ssize_t size)
2127{
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002128 PyObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002129 Py_UCS4 maxchar = 0;
2130 Py_ssize_t num_surrogates;
2131
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02002132 if (u == NULL && size != 0) {
2133 PyErr_BadInternalCall();
2134 return NULL;
2135 }
2136
2137 if (size == -1) {
2138 size = wcslen(u);
2139 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002140
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002141 /* If the Unicode data is known at construction time, we can apply
2142 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002143
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002144 /* Optimization for empty strings */
Serhiy Storchaka678db842013-01-26 12:16:36 +02002145 if (size == 0)
2146 _Py_RETURN_UNICODE_EMPTY();
Tim Petersced69f82003-09-16 20:30:58 +00002147
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002148 /* Single character Unicode objects in the Latin-1 range are
2149 shared when using this constructor */
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002150 if (size == 1 && (Py_UCS4)*u < 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002151 return get_latin1_char((unsigned char)*u);
2152
2153 /* If not empty and not single character, copy the Unicode data
2154 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02002155 if (find_maxchar_surrogates(u, u + size,
2156 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002157 return NULL;
2158
Victor Stinner8faf8212011-12-08 22:14:11 +01002159 unicode = PyUnicode_New(size - num_surrogates, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002160 if (!unicode)
2161 return NULL;
2162
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002163 switch (PyUnicode_KIND(unicode)) {
2164 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02002165 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002166 u, u + size, PyUnicode_1BYTE_DATA(unicode));
2167 break;
2168 case PyUnicode_2BYTE_KIND:
2169#if Py_UNICODE_SIZE == 2
Christian Heimesf051e432016-09-13 20:22:02 +02002170 memcpy(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002171#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02002172 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002173 u, u + size, PyUnicode_2BYTE_DATA(unicode));
2174#endif
2175 break;
2176 case PyUnicode_4BYTE_KIND:
2177#if SIZEOF_WCHAR_T == 2
2178 /* This is the only case which has to process surrogates, thus
2179 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02002180 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002181#else
2182 assert(num_surrogates == 0);
Christian Heimesf051e432016-09-13 20:22:02 +02002183 memcpy(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002184#endif
2185 break;
2186 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07002187 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002188 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002189
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002190 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002191}
2192
Alexander Belopolsky40018472011-02-26 01:02:56 +00002193PyObject *
2194PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00002195{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002196 if (size < 0) {
2197 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00002198 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00002199 return NULL;
2200 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002201 if (u != NULL)
2202 return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
2203 else
2204 return (PyObject *)_PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00002205}
2206
Alexander Belopolsky40018472011-02-26 01:02:56 +00002207PyObject *
2208PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00002209{
2210 size_t size = strlen(u);
2211 if (size > PY_SSIZE_T_MAX) {
2212 PyErr_SetString(PyExc_OverflowError, "input too long");
2213 return NULL;
2214 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002215 return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002216}
2217
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002218PyObject *
2219_PyUnicode_FromId(_Py_Identifier *id)
2220{
2221 if (!id->object) {
Victor Stinnerd1cd99b2012-02-07 23:05:55 +01002222 id->object = PyUnicode_DecodeUTF8Stateful(id->string,
2223 strlen(id->string),
2224 NULL, NULL);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002225 if (!id->object)
2226 return NULL;
2227 PyUnicode_InternInPlace(&id->object);
2228 assert(!id->next);
2229 id->next = static_strings;
2230 static_strings = id;
2231 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002232 return id->object;
2233}
2234
2235void
2236_PyUnicode_ClearStaticStrings()
2237{
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002238 _Py_Identifier *tmp, *s = static_strings;
2239 while (s) {
Serhiy Storchaka505ff752014-02-09 13:33:53 +02002240 Py_CLEAR(s->object);
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002241 tmp = s->next;
2242 s->next = NULL;
2243 s = tmp;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002244 }
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002245 static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002246}
2247
Benjamin Peterson0df54292012-03-26 14:50:32 -04002248/* Internal function, doesn't check maximum character */
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002249
Victor Stinnerd3f08822012-05-29 12:57:52 +02002250PyObject*
2251_PyUnicode_FromASCII(const char *buffer, Py_ssize_t size)
Victor Stinner702c7342011-10-05 13:50:52 +02002252{
Victor Stinnerd3f08822012-05-29 12:57:52 +02002253 const unsigned char *s = (const unsigned char *)buffer;
Victor Stinner785938e2011-12-11 20:09:03 +01002254 PyObject *unicode;
Victor Stinnere6b2d442011-12-11 21:54:30 +01002255 if (size == 1) {
Victor Stinner0617b6e2011-10-05 23:26:01 +02002256#ifdef Py_DEBUG
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002257 assert((unsigned char)s[0] < 128);
Victor Stinner0617b6e2011-10-05 23:26:01 +02002258#endif
Antoine Pitrou7c46da72011-10-06 22:07:51 +02002259 return get_latin1_char(s[0]);
Victor Stinnere6b2d442011-12-11 21:54:30 +01002260 }
Victor Stinner785938e2011-12-11 20:09:03 +01002261 unicode = PyUnicode_New(size, 127);
2262 if (!unicode)
Victor Stinner702c7342011-10-05 13:50:52 +02002263 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01002264 memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
2265 assert(_PyUnicode_CheckConsistency(unicode, 1));
2266 return unicode;
Victor Stinner702c7342011-10-05 13:50:52 +02002267}
2268
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002269static Py_UCS4
2270kind_maxchar_limit(unsigned int kind)
2271{
Benjamin Petersonead6b532011-12-20 17:23:42 -06002272 switch (kind) {
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002273 case PyUnicode_1BYTE_KIND:
2274 return 0x80;
2275 case PyUnicode_2BYTE_KIND:
2276 return 0x100;
2277 case PyUnicode_4BYTE_KIND:
2278 return 0x10000;
2279 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07002280 Py_UNREACHABLE();
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002281 }
2282}
2283
Victor Stinner702c7342011-10-05 13:50:52 +02002284static PyObject*
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002285_PyUnicode_FromUCS1(const Py_UCS1* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00002286{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002287 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002288 unsigned char max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002289
Serhiy Storchaka678db842013-01-26 12:16:36 +02002290 if (size == 0)
2291 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002292 assert(size > 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02002293 if (size == 1)
2294 return get_latin1_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002295
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002296 max_char = ucs1lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002297 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002298 if (!res)
2299 return NULL;
2300 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002301 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002302 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00002303}
2304
Victor Stinnere57b1c02011-09-28 22:20:48 +02002305static PyObject*
2306_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002307{
2308 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002309 Py_UCS2 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002310
Serhiy Storchaka678db842013-01-26 12:16:36 +02002311 if (size == 0)
2312 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002313 assert(size > 0);
Victor Stinner985a82a2014-01-03 12:53:47 +01002314 if (size == 1)
2315 return unicode_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002316
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002317 max_char = ucs2lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002318 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002319 if (!res)
2320 return NULL;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002321 if (max_char >= 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002322 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002323 else {
2324 _PyUnicode_CONVERT_BYTES(
2325 Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
2326 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002327 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002328 return res;
2329}
2330
Victor Stinnere57b1c02011-09-28 22:20:48 +02002331static PyObject*
2332_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002333{
2334 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002335 Py_UCS4 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002336
Serhiy Storchaka678db842013-01-26 12:16:36 +02002337 if (size == 0)
2338 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002339 assert(size > 0);
Victor Stinner985a82a2014-01-03 12:53:47 +01002340 if (size == 1)
2341 return unicode_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002342
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002343 max_char = ucs4lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002344 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002345 if (!res)
2346 return NULL;
Antoine Pitrou950468e2011-10-11 22:45:48 +02002347 if (max_char < 256)
2348 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
2349 PyUnicode_1BYTE_DATA(res));
2350 else if (max_char < 0x10000)
2351 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
2352 PyUnicode_2BYTE_DATA(res));
2353 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002354 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002355 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002356 return res;
2357}
2358
2359PyObject*
2360PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
2361{
Victor Stinnercfed46e2011-11-22 01:29:14 +01002362 if (size < 0) {
2363 PyErr_SetString(PyExc_ValueError, "size must be positive");
2364 return NULL;
2365 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002366 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002367 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002368 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002369 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002370 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002371 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002372 return _PyUnicode_FromUCS4(buffer, size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002373 default:
Victor Stinnerb9275c12011-10-05 14:01:42 +02002374 PyErr_SetString(PyExc_SystemError, "invalid kind");
2375 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002376 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002377}
2378
Victor Stinnerece58de2012-04-23 23:36:38 +02002379Py_UCS4
2380_PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end)
2381{
2382 enum PyUnicode_Kind kind;
2383 void *startptr, *endptr;
2384
2385 assert(PyUnicode_IS_READY(unicode));
2386 assert(0 <= start);
2387 assert(end <= PyUnicode_GET_LENGTH(unicode));
2388 assert(start <= end);
2389
2390 if (start == 0 && end == PyUnicode_GET_LENGTH(unicode))
2391 return PyUnicode_MAX_CHAR_VALUE(unicode);
2392
2393 if (start == end)
2394 return 127;
2395
Victor Stinner94d558b2012-04-27 22:26:58 +02002396 if (PyUnicode_IS_ASCII(unicode))
2397 return 127;
2398
Victor Stinnerece58de2012-04-23 23:36:38 +02002399 kind = PyUnicode_KIND(unicode);
Benjamin Petersonf3b7d862012-04-23 18:07:01 -04002400 startptr = PyUnicode_DATA(unicode);
Benjamin Petersonb9f4c9d2012-04-23 21:45:40 -04002401 endptr = (char *)startptr + end * kind;
2402 startptr = (char *)startptr + start * kind;
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002403 switch(kind) {
2404 case PyUnicode_1BYTE_KIND:
2405 return ucs1lib_find_max_char(startptr, endptr);
2406 case PyUnicode_2BYTE_KIND:
2407 return ucs2lib_find_max_char(startptr, endptr);
2408 case PyUnicode_4BYTE_KIND:
2409 return ucs4lib_find_max_char(startptr, endptr);
Victor Stinnerece58de2012-04-23 23:36:38 +02002410 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07002411 Py_UNREACHABLE();
Victor Stinnerece58de2012-04-23 23:36:38 +02002412 }
2413}
2414
Victor Stinner25a4b292011-10-06 12:31:55 +02002415/* Ensure that a string uses the most efficient storage, if it is not the
2416 case: create a new string with of the right kind. Write NULL into *p_unicode
2417 on error. */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02002418static void
Victor Stinner25a4b292011-10-06 12:31:55 +02002419unicode_adjust_maxchar(PyObject **p_unicode)
2420{
2421 PyObject *unicode, *copy;
2422 Py_UCS4 max_char;
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002423 Py_ssize_t len;
Victor Stinner25a4b292011-10-06 12:31:55 +02002424 unsigned int kind;
2425
2426 assert(p_unicode != NULL);
2427 unicode = *p_unicode;
2428 assert(PyUnicode_IS_READY(unicode));
2429 if (PyUnicode_IS_ASCII(unicode))
2430 return;
2431
2432 len = PyUnicode_GET_LENGTH(unicode);
2433 kind = PyUnicode_KIND(unicode);
2434 if (kind == PyUnicode_1BYTE_KIND) {
2435 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002436 max_char = ucs1lib_find_max_char(u, u + len);
2437 if (max_char >= 128)
2438 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002439 }
2440 else if (kind == PyUnicode_2BYTE_KIND) {
2441 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002442 max_char = ucs2lib_find_max_char(u, u + len);
2443 if (max_char >= 256)
2444 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002445 }
2446 else {
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002447 const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
Victor Stinner25a4b292011-10-06 12:31:55 +02002448 assert(kind == PyUnicode_4BYTE_KIND);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002449 max_char = ucs4lib_find_max_char(u, u + len);
2450 if (max_char >= 0x10000)
2451 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002452 }
Victor Stinner25a4b292011-10-06 12:31:55 +02002453 copy = PyUnicode_New(len, max_char);
Victor Stinnerca439ee2012-06-16 03:17:34 +02002454 if (copy != NULL)
2455 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, len);
Victor Stinner25a4b292011-10-06 12:31:55 +02002456 Py_DECREF(unicode);
2457 *p_unicode = copy;
2458}
2459
Victor Stinner034f6cf2011-09-30 02:26:44 +02002460PyObject*
Victor Stinnerbf6e5602011-12-12 01:53:47 +01002461_PyUnicode_Copy(PyObject *unicode)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002462{
Victor Stinner87af4f22011-11-21 23:03:47 +01002463 Py_ssize_t length;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002464 PyObject *copy;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002465
Victor Stinner034f6cf2011-09-30 02:26:44 +02002466 if (!PyUnicode_Check(unicode)) {
2467 PyErr_BadInternalCall();
2468 return NULL;
2469 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05002470 if (PyUnicode_READY(unicode) == -1)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002471 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002472
Victor Stinner87af4f22011-11-21 23:03:47 +01002473 length = PyUnicode_GET_LENGTH(unicode);
2474 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002475 if (!copy)
2476 return NULL;
2477 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
2478
Christian Heimesf051e432016-09-13 20:22:02 +02002479 memcpy(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
Victor Stinner87af4f22011-11-21 23:03:47 +01002480 length * PyUnicode_KIND(unicode));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002481 assert(_PyUnicode_CheckConsistency(copy, 1));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002482 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02002483}
2484
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002485
Victor Stinnerbc603d12011-10-02 01:00:40 +02002486/* Widen Unicode objects to larger buffers. Don't write terminating null
2487 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002488
2489void*
2490_PyUnicode_AsKind(PyObject *s, unsigned int kind)
2491{
Victor Stinnerbc603d12011-10-02 01:00:40 +02002492 Py_ssize_t len;
2493 void *result;
2494 unsigned int skind;
2495
Benjamin Petersonbac79492012-01-14 13:34:47 -05002496 if (PyUnicode_READY(s) == -1)
Victor Stinnerbc603d12011-10-02 01:00:40 +02002497 return NULL;
2498
2499 len = PyUnicode_GET_LENGTH(s);
2500 skind = PyUnicode_KIND(s);
2501 if (skind >= kind) {
Victor Stinner01698042011-10-04 00:04:26 +02002502 PyErr_SetString(PyExc_SystemError, "invalid widening attempt");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002503 return NULL;
2504 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002505 switch (kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02002506 case PyUnicode_2BYTE_KIND:
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002507 result = PyMem_New(Py_UCS2, len);
Victor Stinnerbc603d12011-10-02 01:00:40 +02002508 if (!result)
2509 return PyErr_NoMemory();
2510 assert(skind == PyUnicode_1BYTE_KIND);
2511 _PyUnicode_CONVERT_BYTES(
2512 Py_UCS1, Py_UCS2,
2513 PyUnicode_1BYTE_DATA(s),
2514 PyUnicode_1BYTE_DATA(s) + len,
2515 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002516 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002517 case PyUnicode_4BYTE_KIND:
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002518 result = PyMem_New(Py_UCS4, len);
Victor Stinnerbc603d12011-10-02 01:00:40 +02002519 if (!result)
2520 return PyErr_NoMemory();
2521 if (skind == PyUnicode_2BYTE_KIND) {
2522 _PyUnicode_CONVERT_BYTES(
2523 Py_UCS2, Py_UCS4,
2524 PyUnicode_2BYTE_DATA(s),
2525 PyUnicode_2BYTE_DATA(s) + len,
2526 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002527 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02002528 else {
2529 assert(skind == PyUnicode_1BYTE_KIND);
2530 _PyUnicode_CONVERT_BYTES(
2531 Py_UCS1, Py_UCS4,
2532 PyUnicode_1BYTE_DATA(s),
2533 PyUnicode_1BYTE_DATA(s) + len,
2534 result);
2535 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002536 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002537 default:
2538 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002539 }
Victor Stinner01698042011-10-04 00:04:26 +02002540 PyErr_SetString(PyExc_SystemError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002541 return NULL;
2542}
2543
2544static Py_UCS4*
2545as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2546 int copy_null)
2547{
2548 int kind;
2549 void *data;
2550 Py_ssize_t len, targetlen;
2551 if (PyUnicode_READY(string) == -1)
2552 return NULL;
2553 kind = PyUnicode_KIND(string);
2554 data = PyUnicode_DATA(string);
2555 len = PyUnicode_GET_LENGTH(string);
2556 targetlen = len;
2557 if (copy_null)
2558 targetlen++;
2559 if (!target) {
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002560 target = PyMem_New(Py_UCS4, targetlen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002561 if (!target) {
2562 PyErr_NoMemory();
2563 return NULL;
2564 }
2565 }
2566 else {
2567 if (targetsize < targetlen) {
2568 PyErr_Format(PyExc_SystemError,
2569 "string is longer than the buffer");
2570 if (copy_null && 0 < targetsize)
2571 target[0] = 0;
2572 return NULL;
2573 }
2574 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002575 if (kind == PyUnicode_1BYTE_KIND) {
2576 Py_UCS1 *start = (Py_UCS1 *) data;
2577 _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002578 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002579 else if (kind == PyUnicode_2BYTE_KIND) {
2580 Py_UCS2 *start = (Py_UCS2 *) data;
2581 _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2582 }
2583 else {
2584 assert(kind == PyUnicode_4BYTE_KIND);
Christian Heimesf051e432016-09-13 20:22:02 +02002585 memcpy(target, data, len * sizeof(Py_UCS4));
Antoine Pitrou950468e2011-10-11 22:45:48 +02002586 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002587 if (copy_null)
2588 target[len] = 0;
2589 return target;
2590}
2591
2592Py_UCS4*
2593PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2594 int copy_null)
2595{
Antoine Pitroude20b0b2011-11-10 21:47:38 +01002596 if (target == NULL || targetsize < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002597 PyErr_BadInternalCall();
2598 return NULL;
2599 }
2600 return as_ucs4(string, target, targetsize, copy_null);
2601}
2602
2603Py_UCS4*
2604PyUnicode_AsUCS4Copy(PyObject *string)
2605{
2606 return as_ucs4(string, NULL, 0, 1);
2607}
2608
/* Maximum number of characters required for the output of %lld or %p.

   A long long needs at most ceil(log10(256) * SIZEOF_LONG_LONG) decimal
   digits, plus 1 character for the sign.  53/22 is used as an upper bound
   for log10(256); the ceiling of SIZEOF_LONG_LONG*53/22 is computed in
   integer arithmetic as 1 + (SIZEOF_LONG_LONG*53 - 1) / 22, and the sign
   accounts for the other 1. */
#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG * 53 - 1) / 22)
Victor Stinner96865452011-03-01 23:44:09 +00002613
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002614static int
2615unicode_fromformat_write_str(_PyUnicodeWriter *writer, PyObject *str,
2616 Py_ssize_t width, Py_ssize_t precision)
2617{
2618 Py_ssize_t length, fill, arglen;
2619 Py_UCS4 maxchar;
2620
2621 if (PyUnicode_READY(str) == -1)
2622 return -1;
2623
2624 length = PyUnicode_GET_LENGTH(str);
2625 if ((precision == -1 || precision >= length)
2626 && width <= length)
2627 return _PyUnicodeWriter_WriteStr(writer, str);
2628
2629 if (precision != -1)
2630 length = Py_MIN(precision, length);
2631
2632 arglen = Py_MAX(length, width);
2633 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
2634 maxchar = _PyUnicode_FindMaxChar(str, 0, length);
2635 else
2636 maxchar = writer->maxchar;
2637
2638 if (_PyUnicodeWriter_Prepare(writer, arglen, maxchar) == -1)
2639 return -1;
2640
2641 if (width > length) {
2642 fill = width - length;
2643 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, ' ') == -1)
2644 return -1;
2645 writer->pos += fill;
2646 }
2647
2648 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
2649 str, 0, length);
2650 writer->pos += length;
2651 return 0;
2652}
2653
2654static int
Victor Stinner998b8062018-09-12 00:23:25 +02002655unicode_fromformat_write_cstr(_PyUnicodeWriter *writer, const char *str,
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002656 Py_ssize_t width, Py_ssize_t precision)
2657{
2658 /* UTF-8 */
2659 Py_ssize_t length;
2660 PyObject *unicode;
2661 int res;
2662
Serhiy Storchakad586ccb2019-01-12 10:30:35 +02002663 if (precision == -1) {
2664 length = strlen(str);
2665 }
2666 else {
2667 length = 0;
2668 while (length < precision && str[length]) {
2669 length++;
2670 }
2671 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002672 unicode = PyUnicode_DecodeUTF8Stateful(str, length, "replace", NULL);
2673 if (unicode == NULL)
2674 return -1;
2675
2676 res = unicode_fromformat_write_str(writer, unicode, width, -1);
2677 Py_DECREF(unicode);
2678 return res;
2679}
2680
Victor Stinner96865452011-03-01 23:44:09 +00002681static const char*
Victor Stinnere215d962012-10-06 23:03:36 +02002682unicode_fromformat_arg(_PyUnicodeWriter *writer,
2683 const char *f, va_list *vargs)
Victor Stinner96865452011-03-01 23:44:09 +00002684{
Victor Stinnere215d962012-10-06 23:03:36 +02002685 const char *p;
2686 Py_ssize_t len;
2687 int zeropad;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002688 Py_ssize_t width;
2689 Py_ssize_t precision;
Victor Stinnere215d962012-10-06 23:03:36 +02002690 int longflag;
2691 int longlongflag;
2692 int size_tflag;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002693 Py_ssize_t fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002694
2695 p = f;
2696 f++;
Victor Stinner4c63a972012-10-06 23:55:33 +02002697 zeropad = 0;
2698 if (*f == '0') {
2699 zeropad = 1;
2700 f++;
2701 }
Victor Stinner96865452011-03-01 23:44:09 +00002702
2703 /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002704 width = -1;
2705 if (Py_ISDIGIT((unsigned)*f)) {
2706 width = *f - '0';
Victor Stinner96865452011-03-01 23:44:09 +00002707 f++;
Victor Stinnere215d962012-10-06 23:03:36 +02002708 while (Py_ISDIGIT((unsigned)*f)) {
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002709 if (width > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
Victor Stinner3921e902012-10-06 23:05:00 +02002710 PyErr_SetString(PyExc_ValueError,
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002711 "width too big");
Victor Stinner3921e902012-10-06 23:05:00 +02002712 return NULL;
2713 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002714 width = (width * 10) + (*f - '0');
Victor Stinnere215d962012-10-06 23:03:36 +02002715 f++;
2716 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002717 }
2718 precision = -1;
2719 if (*f == '.') {
2720 f++;
2721 if (Py_ISDIGIT((unsigned)*f)) {
2722 precision = (*f - '0');
2723 f++;
2724 while (Py_ISDIGIT((unsigned)*f)) {
2725 if (precision > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
2726 PyErr_SetString(PyExc_ValueError,
2727 "precision too big");
2728 return NULL;
2729 }
2730 precision = (precision * 10) + (*f - '0');
2731 f++;
2732 }
2733 }
Victor Stinner96865452011-03-01 23:44:09 +00002734 if (*f == '%') {
2735 /* "%.3%s" => f points to "3" */
2736 f--;
2737 }
2738 }
2739 if (*f == '\0') {
Victor Stinnere215d962012-10-06 23:03:36 +02002740 /* bogus format "%.123" => go backward, f points to "3" */
Victor Stinner96865452011-03-01 23:44:09 +00002741 f--;
2742 }
Victor Stinner96865452011-03-01 23:44:09 +00002743
2744 /* Handle %ld, %lu, %lld and %llu. */
2745 longflag = 0;
2746 longlongflag = 0;
Victor Stinnere7faec12011-03-02 00:01:53 +00002747 size_tflag = 0;
Victor Stinner96865452011-03-01 23:44:09 +00002748 if (*f == 'l') {
Victor Stinner6d970f42011-03-02 00:04:25 +00002749 if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
Victor Stinner96865452011-03-01 23:44:09 +00002750 longflag = 1;
2751 ++f;
2752 }
Victor Stinner96865452011-03-01 23:44:09 +00002753 else if (f[1] == 'l' &&
Victor Stinner6d970f42011-03-02 00:04:25 +00002754 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002755 longlongflag = 1;
2756 f += 2;
2757 }
Victor Stinner96865452011-03-01 23:44:09 +00002758 }
2759 /* handle the size_t flag. */
Victor Stinner6d970f42011-03-02 00:04:25 +00002760 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002761 size_tflag = 1;
2762 ++f;
2763 }
Victor Stinnere215d962012-10-06 23:03:36 +02002764
2765 if (f[1] == '\0')
2766 writer->overallocate = 0;
2767
2768 switch (*f) {
2769 case 'c':
2770 {
2771 int ordinal = va_arg(*vargs, int);
Victor Stinnerff5a8482012-10-06 23:05:45 +02002772 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Serhiy Storchakac89533f2013-06-23 20:21:16 +03002773 PyErr_SetString(PyExc_OverflowError,
Victor Stinnerff5a8482012-10-06 23:05:45 +02002774 "character argument not in range(0x110000)");
2775 return NULL;
2776 }
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02002777 if (_PyUnicodeWriter_WriteCharInline(writer, ordinal) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002778 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002779 break;
2780 }
2781
2782 case 'i':
2783 case 'd':
2784 case 'u':
2785 case 'x':
2786 {
2787 /* used by sprintf */
Victor Stinner15a11362012-10-06 23:48:20 +02002788 char buffer[MAX_LONG_LONG_CHARS];
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002789 Py_ssize_t arglen;
Victor Stinnere215d962012-10-06 23:03:36 +02002790
2791 if (*f == 'u') {
Victor Stinnere215d962012-10-06 23:03:36 +02002792 if (longflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002793 len = sprintf(buffer, "%lu",
Victor Stinnere215d962012-10-06 23:03:36 +02002794 va_arg(*vargs, unsigned long));
Victor Stinnere215d962012-10-06 23:03:36 +02002795 else if (longlongflag)
Benjamin Peterson47ff0732016-09-08 09:15:54 -07002796 len = sprintf(buffer, "%llu",
Benjamin Petersonaf580df2016-09-06 10:46:49 -07002797 va_arg(*vargs, unsigned long long));
Victor Stinnere215d962012-10-06 23:03:36 +02002798 else if (size_tflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002799 len = sprintf(buffer, "%" PY_FORMAT_SIZE_T "u",
Victor Stinnere215d962012-10-06 23:03:36 +02002800 va_arg(*vargs, size_t));
2801 else
Victor Stinner3aa979e2014-11-18 21:40:51 +01002802 len = sprintf(buffer, "%u",
Victor Stinnere215d962012-10-06 23:03:36 +02002803 va_arg(*vargs, unsigned int));
2804 }
2805 else if (*f == 'x') {
Victor Stinner3aa979e2014-11-18 21:40:51 +01002806 len = sprintf(buffer, "%x", va_arg(*vargs, int));
Victor Stinnere215d962012-10-06 23:03:36 +02002807 }
2808 else {
Victor Stinnere215d962012-10-06 23:03:36 +02002809 if (longflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002810 len = sprintf(buffer, "%li",
Victor Stinnere215d962012-10-06 23:03:36 +02002811 va_arg(*vargs, long));
Victor Stinnere215d962012-10-06 23:03:36 +02002812 else if (longlongflag)
Benjamin Peterson47ff0732016-09-08 09:15:54 -07002813 len = sprintf(buffer, "%lli",
Benjamin Petersonaf580df2016-09-06 10:46:49 -07002814 va_arg(*vargs, long long));
Victor Stinnere215d962012-10-06 23:03:36 +02002815 else if (size_tflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002816 len = sprintf(buffer, "%" PY_FORMAT_SIZE_T "i",
Victor Stinnere215d962012-10-06 23:03:36 +02002817 va_arg(*vargs, Py_ssize_t));
2818 else
Victor Stinner3aa979e2014-11-18 21:40:51 +01002819 len = sprintf(buffer, "%i",
Victor Stinnere215d962012-10-06 23:03:36 +02002820 va_arg(*vargs, int));
2821 }
2822 assert(len >= 0);
2823
Victor Stinnere215d962012-10-06 23:03:36 +02002824 if (precision < len)
2825 precision = len;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002826
2827 arglen = Py_MAX(precision, width);
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002828 if (_PyUnicodeWriter_Prepare(writer, arglen, 127) == -1)
2829 return NULL;
2830
Victor Stinnere215d962012-10-06 23:03:36 +02002831 if (width > precision) {
2832 Py_UCS4 fillchar;
2833 fill = width - precision;
2834 fillchar = zeropad?'0':' ';
Victor Stinner15a11362012-10-06 23:48:20 +02002835 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, fillchar) == -1)
2836 return NULL;
2837 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002838 }
Victor Stinner15a11362012-10-06 23:48:20 +02002839 if (precision > len) {
Victor Stinnere215d962012-10-06 23:03:36 +02002840 fill = precision - len;
Victor Stinner15a11362012-10-06 23:48:20 +02002841 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, '0') == -1)
2842 return NULL;
2843 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002844 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002845
Victor Stinner4a587072013-11-19 12:54:53 +01002846 if (_PyUnicodeWriter_WriteASCIIString(writer, buffer, len) < 0)
2847 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002848 break;
2849 }
2850
2851 case 'p':
2852 {
2853 char number[MAX_LONG_LONG_CHARS];
2854
2855 len = sprintf(number, "%p", va_arg(*vargs, void*));
2856 assert(len >= 0);
2857
2858 /* %p is ill-defined: ensure leading 0x. */
2859 if (number[1] == 'X')
2860 number[1] = 'x';
2861 else if (number[1] != 'x') {
2862 memmove(number + 2, number,
2863 strlen(number) + 1);
2864 number[0] = '0';
2865 number[1] = 'x';
2866 len += 2;
2867 }
2868
Victor Stinner4a587072013-11-19 12:54:53 +01002869 if (_PyUnicodeWriter_WriteASCIIString(writer, number, len) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002870 return NULL;
2871 break;
2872 }
2873
2874 case 's':
2875 {
2876 /* UTF-8 */
2877 const char *s = va_arg(*vargs, const char*);
Victor Stinner998b8062018-09-12 00:23:25 +02002878 if (unicode_fromformat_write_cstr(writer, s, width, precision) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002879 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002880 break;
2881 }
2882
2883 case 'U':
2884 {
2885 PyObject *obj = va_arg(*vargs, PyObject *);
2886 assert(obj && _PyUnicode_CHECK(obj));
2887
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002888 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002889 return NULL;
2890 break;
2891 }
2892
2893 case 'V':
2894 {
2895 PyObject *obj = va_arg(*vargs, PyObject *);
2896 const char *str = va_arg(*vargs, const char *);
Victor Stinnere215d962012-10-06 23:03:36 +02002897 if (obj) {
2898 assert(_PyUnicode_CHECK(obj));
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002899 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002900 return NULL;
2901 }
2902 else {
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002903 assert(str != NULL);
Victor Stinner998b8062018-09-12 00:23:25 +02002904 if (unicode_fromformat_write_cstr(writer, str, width, precision) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002905 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002906 }
2907 break;
2908 }
2909
2910 case 'S':
2911 {
2912 PyObject *obj = va_arg(*vargs, PyObject *);
2913 PyObject *str;
2914 assert(obj);
2915 str = PyObject_Str(obj);
2916 if (!str)
2917 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002918 if (unicode_fromformat_write_str(writer, str, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002919 Py_DECREF(str);
2920 return NULL;
2921 }
2922 Py_DECREF(str);
2923 break;
2924 }
2925
2926 case 'R':
2927 {
2928 PyObject *obj = va_arg(*vargs, PyObject *);
2929 PyObject *repr;
2930 assert(obj);
2931 repr = PyObject_Repr(obj);
2932 if (!repr)
2933 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002934 if (unicode_fromformat_write_str(writer, repr, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002935 Py_DECREF(repr);
2936 return NULL;
2937 }
2938 Py_DECREF(repr);
2939 break;
2940 }
2941
2942 case 'A':
2943 {
2944 PyObject *obj = va_arg(*vargs, PyObject *);
2945 PyObject *ascii;
2946 assert(obj);
2947 ascii = PyObject_ASCII(obj);
2948 if (!ascii)
2949 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002950 if (unicode_fromformat_write_str(writer, ascii, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002951 Py_DECREF(ascii);
2952 return NULL;
2953 }
2954 Py_DECREF(ascii);
2955 break;
2956 }
2957
2958 case '%':
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02002959 if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002960 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002961 break;
2962
2963 default:
2964 /* if we stumble upon an unknown formatting code, copy the rest
2965 of the format string to the output string. (we cannot just
2966 skip the code, since there's no way to know what's in the
2967 argument list) */
2968 len = strlen(p);
Victor Stinner4a587072013-11-19 12:54:53 +01002969 if (_PyUnicodeWriter_WriteLatin1String(writer, p, len) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002970 return NULL;
2971 f = p+len;
2972 return f;
2973 }
2974
2975 f++;
Victor Stinner96865452011-03-01 23:44:09 +00002976 return f;
2977}
2978
Walter Dörwaldd2034312007-05-18 16:29:38 +00002979PyObject *
2980PyUnicode_FromFormatV(const char *format, va_list vargs)
2981{
Victor Stinnere215d962012-10-06 23:03:36 +02002982 va_list vargs2;
2983 const char *f;
2984 _PyUnicodeWriter writer;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002985
Victor Stinner8f674cc2013-04-17 23:02:17 +02002986 _PyUnicodeWriter_Init(&writer);
2987 writer.min_length = strlen(format) + 100;
2988 writer.overallocate = 1;
Victor Stinnere215d962012-10-06 23:03:36 +02002989
Benjamin Peterson0c212142016-09-20 20:39:33 -07002990 // Copy varags to be able to pass a reference to a subfunction.
2991 va_copy(vargs2, vargs);
Victor Stinnere215d962012-10-06 23:03:36 +02002992
2993 for (f = format; *f; ) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00002994 if (*f == '%') {
Victor Stinnere215d962012-10-06 23:03:36 +02002995 f = unicode_fromformat_arg(&writer, f, &vargs2);
2996 if (f == NULL)
2997 goto fail;
Victor Stinner1205f272010-09-11 00:54:47 +00002998 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002999 else {
Victor Stinnere215d962012-10-06 23:03:36 +02003000 const char *p;
3001 Py_ssize_t len;
Walter Dörwaldd2034312007-05-18 16:29:38 +00003002
Victor Stinnere215d962012-10-06 23:03:36 +02003003 p = f;
3004 do
3005 {
3006 if ((unsigned char)*p > 127) {
3007 PyErr_Format(PyExc_ValueError,
3008 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
3009 "string, got a non-ASCII byte: 0x%02x",
3010 (unsigned char)*p);
Victor Stinner1ddf53d2016-09-21 14:13:14 +02003011 goto fail;
Victor Stinnere215d962012-10-06 23:03:36 +02003012 }
3013 p++;
3014 }
3015 while (*p != '\0' && *p != '%');
3016 len = p - f;
3017
3018 if (*p == '\0')
3019 writer.overallocate = 0;
Victor Stinner4a587072013-11-19 12:54:53 +01003020
3021 if (_PyUnicodeWriter_WriteASCIIString(&writer, f, len) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02003022 goto fail;
Victor Stinnere215d962012-10-06 23:03:36 +02003023
3024 f = p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00003025 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00003026 }
Christian Heimes2f2fee12016-09-21 11:37:27 +02003027 va_end(vargs2);
Victor Stinnere215d962012-10-06 23:03:36 +02003028 return _PyUnicodeWriter_Finish(&writer);
3029
3030 fail:
Christian Heimes2f2fee12016-09-21 11:37:27 +02003031 va_end(vargs2);
Victor Stinnere215d962012-10-06 23:03:36 +02003032 _PyUnicodeWriter_Dealloc(&writer);
Benjamin Peterson14339b62009-01-31 16:36:08 +00003033 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00003034}
3035
Walter Dörwaldd2034312007-05-18 16:29:38 +00003036PyObject *
3037PyUnicode_FromFormat(const char *format, ...)
3038{
Benjamin Peterson14339b62009-01-31 16:36:08 +00003039 PyObject* ret;
3040 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00003041
3042#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00003043 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00003044#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00003045 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00003046#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00003047 ret = PyUnicode_FromFormatV(format, vargs);
3048 va_end(vargs);
3049 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00003050}
3051
Serhiy Storchakac46db922018-10-23 22:58:24 +03003052static Py_ssize_t
3053unicode_get_widechar_size(PyObject *unicode)
3054{
3055 Py_ssize_t res;
3056
3057 assert(unicode != NULL);
3058 assert(_PyUnicode_CHECK(unicode));
3059
3060 if (_PyUnicode_WSTR(unicode) != NULL) {
3061 return PyUnicode_WSTR_LENGTH(unicode);
3062 }
3063 assert(PyUnicode_IS_READY(unicode));
3064
3065 res = _PyUnicode_LENGTH(unicode);
3066#if SIZEOF_WCHAR_T == 2
3067 if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
3068 const Py_UCS4 *s = PyUnicode_4BYTE_DATA(unicode);
3069 const Py_UCS4 *end = s + res;
3070 for (; s < end; ++s) {
3071 if (*s > 0xFFFF) {
3072 ++res;
3073 }
3074 }
3075 }
3076#endif
3077 return res;
3078}
3079
3080static void
3081unicode_copy_as_widechar(PyObject *unicode, wchar_t *w, Py_ssize_t size)
3082{
3083 const wchar_t *wstr;
3084
3085 assert(unicode != NULL);
3086 assert(_PyUnicode_CHECK(unicode));
3087
3088 wstr = _PyUnicode_WSTR(unicode);
3089 if (wstr != NULL) {
3090 memcpy(w, wstr, size * sizeof(wchar_t));
3091 return;
3092 }
3093 assert(PyUnicode_IS_READY(unicode));
3094
3095 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
3096 const Py_UCS1 *s = PyUnicode_1BYTE_DATA(unicode);
3097 for (; size--; ++s, ++w) {
3098 *w = *s;
3099 }
3100 }
3101 else {
3102#if SIZEOF_WCHAR_T == 4
3103 assert(PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND);
3104 const Py_UCS2 *s = PyUnicode_2BYTE_DATA(unicode);
3105 for (; size--; ++s, ++w) {
3106 *w = *s;
3107 }
3108#else
3109 assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
3110 const Py_UCS4 *s = PyUnicode_4BYTE_DATA(unicode);
3111 for (; size--; ++s, ++w) {
3112 Py_UCS4 ch = *s;
3113 if (ch > 0xFFFF) {
3114 assert(ch <= MAX_UNICODE);
3115 /* encode surrogate pair in this case */
3116 *w++ = Py_UNICODE_HIGH_SURROGATE(ch);
3117 if (!size--)
3118 break;
3119 *w = Py_UNICODE_LOW_SURROGATE(ch);
3120 }
3121 else {
3122 *w = ch;
3123 }
3124 }
3125#endif
3126 }
3127}
3128
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003129#ifdef HAVE_WCHAR_H
3130
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003131/* Convert a Unicode object to a wide character string.
Victor Stinner5593d8a2010-10-02 11:11:27 +00003132
Victor Stinnerd88d9832011-09-06 02:00:05 +02003133 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00003134 character) required to convert the unicode object. Ignore size argument.
3135
Victor Stinnerd88d9832011-09-06 02:00:05 +02003136 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00003137 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02003138 the null character). */
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003139Py_ssize_t
3140PyUnicode_AsWideChar(PyObject *unicode,
3141 wchar_t *w,
3142 Py_ssize_t size)
Victor Stinner137c34c2010-09-29 10:25:54 +00003143{
Victor Stinner5593d8a2010-10-02 11:11:27 +00003144 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003145
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003146 if (unicode == NULL) {
3147 PyErr_BadInternalCall();
3148 return -1;
3149 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003150 if (!PyUnicode_Check(unicode)) {
3151 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003152 return -1;
Victor Stinner5593d8a2010-10-02 11:11:27 +00003153 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003154
3155 res = unicode_get_widechar_size(unicode);
3156 if (w == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003157 return res + 1;
Serhiy Storchakac46db922018-10-23 22:58:24 +03003158 }
3159
3160 if (size > res) {
3161 size = res + 1;
3162 }
3163 else {
3164 res = size;
3165 }
3166 unicode_copy_as_widechar(unicode, w, size);
3167 return res;
Victor Stinner137c34c2010-09-29 10:25:54 +00003168}
3169
Victor Stinner137c34c2010-09-29 10:25:54 +00003170wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00003171PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00003172 Py_ssize_t *size)
3173{
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003174 wchar_t *buffer;
Victor Stinner137c34c2010-09-29 10:25:54 +00003175 Py_ssize_t buflen;
3176
3177 if (unicode == NULL) {
3178 PyErr_BadInternalCall();
3179 return NULL;
3180 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003181 if (!PyUnicode_Check(unicode)) {
3182 PyErr_BadArgument();
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003183 return NULL;
3184 }
3185
Serhiy Storchakac46db922018-10-23 22:58:24 +03003186 buflen = unicode_get_widechar_size(unicode);
3187 buffer = (wchar_t *) PyMem_NEW(wchar_t, (buflen + 1));
Victor Stinner137c34c2010-09-29 10:25:54 +00003188 if (buffer == NULL) {
3189 PyErr_NoMemory();
3190 return NULL;
3191 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003192 unicode_copy_as_widechar(unicode, buffer, buflen + 1);
3193 if (size != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00003194 *size = buflen;
Serhiy Storchakac46db922018-10-23 22:58:24 +03003195 }
3196 else if (wcslen(buffer) != (size_t)buflen) {
3197 PyMem_FREE(buffer);
3198 PyErr_SetString(PyExc_ValueError,
3199 "embedded null character");
3200 return NULL;
3201 }
Victor Stinner137c34c2010-09-29 10:25:54 +00003202 return buffer;
3203}
3204
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003205#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003206
Alexander Belopolsky40018472011-02-26 01:02:56 +00003207PyObject *
3208PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003209{
Victor Stinner8faf8212011-12-08 22:14:11 +01003210 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003211 PyErr_SetString(PyExc_ValueError,
3212 "chr() arg not in range(0x110000)");
3213 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003214 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00003215
Victor Stinner985a82a2014-01-03 12:53:47 +01003216 return unicode_char((Py_UCS4)ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003217}
3218
Alexander Belopolsky40018472011-02-26 01:02:56 +00003219PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02003220PyUnicode_FromObject(PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003221{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003222 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00003223 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003224 if (PyUnicode_CheckExact(obj)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05003225 if (PyUnicode_READY(obj) == -1)
Victor Stinnerd3a83d52011-10-01 03:09:33 +02003226 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00003227 Py_INCREF(obj);
3228 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003229 }
3230 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003231 /* For a Unicode subtype that's not a Unicode object,
3232 return a true Unicode object with the same data. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +01003233 return _PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003234 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003235 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003236 "Can't convert '%.100s' object to str implicitly",
3237 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00003238 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003239}
3240
Alexander Belopolsky40018472011-02-26 01:02:56 +00003241PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02003242PyUnicode_FromEncodedObject(PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003243 const char *encoding,
3244 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003245{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003246 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003247 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00003248
Guido van Rossumd57fd912000-03-10 22:53:23 +00003249 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003250 PyErr_BadInternalCall();
3251 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003252 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003253
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003254 /* Decoding bytes objects is the most common case and should be fast */
3255 if (PyBytes_Check(obj)) {
Victor Stinner22eb6892019-06-26 00:51:05 +02003256 if (PyBytes_GET_SIZE(obj) == 0) {
3257 if (unicode_check_encoding_errors(encoding, errors) < 0) {
3258 return NULL;
3259 }
Serhiy Storchaka05997252013-01-26 12:14:02 +02003260 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner22eb6892019-06-26 00:51:05 +02003261 }
3262 return PyUnicode_Decode(
Serhiy Storchaka05997252013-01-26 12:14:02 +02003263 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
3264 encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003265 }
3266
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003267 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003268 PyErr_SetString(PyExc_TypeError,
3269 "decoding str is not supported");
3270 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00003271 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003272
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003273 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
3274 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
3275 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003276 "decoding to str: need a bytes-like object, %.80s found",
3277 Py_TYPE(obj)->tp_name);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003278 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00003279 }
Tim Petersced69f82003-09-16 20:30:58 +00003280
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003281 if (buffer.len == 0) {
Serhiy Storchaka05997252013-01-26 12:14:02 +02003282 PyBuffer_Release(&buffer);
Victor Stinner22eb6892019-06-26 00:51:05 +02003283 if (unicode_check_encoding_errors(encoding, errors) < 0) {
3284 return NULL;
3285 }
Serhiy Storchaka05997252013-01-26 12:14:02 +02003286 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +00003287 }
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00003288
Serhiy Storchaka05997252013-01-26 12:14:02 +02003289 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003290 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003291 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003292}
3293
/* Normalize an encoding name into the buffer `lower` of `lower_len` bytes:
   similar to encodings.normalize_encoding(), but also convert to lowercase.

   Alphanumeric characters and '.' are copied in lowercase.  Every run of
   other characters that lies between two copied characters becomes a single
   '_'; such runs at the start or end of the name are dropped.

   Return 1 on success, or 0 if the normalized name plus its terminating
   null byte does not fit in `lower`. */
int
_Py_normalize_encoding(const char *encoding,
                       char *lower,
                       size_t lower_len)
{
    const char *src;
    char *dst;
    char *dst_limit;
    int pending_sep;

    assert(encoding != NULL);

    dst = lower;
    /* Last writable slot: reserved for the terminating null byte. */
    dst_limit = &lower[lower_len - 1];
    pending_sep = 0;

    for (src = encoding; *src != 0; src++) {
        char c = *src;

        if (!Py_ISALNUM(c) && c != '.') {
            /* Remember the separator; it is only emitted if more name
               characters follow. */
            pending_sep = 1;
            continue;
        }

        if (pending_sep && dst != lower) {
            if (dst == dst_limit) {
                return 0;
            }
            *dst++ = '_';
        }
        pending_sep = 0;

        if (dst == dst_limit) {
            return 0;
        }
        *dst++ = Py_TOLOWER(c);
    }

    *dst = '\0';
    return 1;
}
3342
Alexander Belopolsky40018472011-02-26 01:02:56 +00003343PyObject *
3344PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003345 Py_ssize_t size,
3346 const char *encoding,
3347 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00003348{
3349 PyObject *buffer = NULL, *unicode;
3350 Py_buffer info;
Victor Stinner942889a2016-09-05 15:40:10 -07003351 char buflower[11]; /* strlen("iso-8859-1\0") == 11, longest shortcut */
3352
Victor Stinner22eb6892019-06-26 00:51:05 +02003353 if (unicode_check_encoding_errors(encoding, errors) < 0) {
3354 return NULL;
3355 }
3356
Victor Stinnered076ed2019-06-26 01:49:32 +02003357 if (size == 0) {
3358 _Py_RETURN_UNICODE_EMPTY();
3359 }
3360
Victor Stinner942889a2016-09-05 15:40:10 -07003361 if (encoding == NULL) {
3362 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3363 }
Victor Stinner600d3be2010-06-10 12:00:55 +00003364
Fred Drakee4315f52000-05-09 19:53:39 +00003365 /* Shortcuts for common default encodings */
Victor Stinner942889a2016-09-05 15:40:10 -07003366 if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3367 char *lower = buflower;
3368
3369 /* Fast paths */
3370 if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3371 lower += 3;
3372 if (*lower == '_') {
3373 /* Match "utf8" and "utf_8" */
3374 lower++;
3375 }
3376
3377 if (lower[0] == '8' && lower[1] == 0) {
3378 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3379 }
3380 else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3381 return PyUnicode_DecodeUTF16(s, size, errors, 0);
3382 }
3383 else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3384 return PyUnicode_DecodeUTF32(s, size, errors, 0);
3385 }
3386 }
3387 else {
3388 if (strcmp(lower, "ascii") == 0
3389 || strcmp(lower, "us_ascii") == 0) {
3390 return PyUnicode_DecodeASCII(s, size, errors);
3391 }
Steve Dowercc16be82016-09-08 10:35:16 -07003392 #ifdef MS_WINDOWS
Victor Stinner942889a2016-09-05 15:40:10 -07003393 else if (strcmp(lower, "mbcs") == 0) {
3394 return PyUnicode_DecodeMBCS(s, size, errors);
3395 }
3396 #endif
3397 else if (strcmp(lower, "latin1") == 0
3398 || strcmp(lower, "latin_1") == 0
3399 || strcmp(lower, "iso_8859_1") == 0
3400 || strcmp(lower, "iso8859_1") == 0) {
3401 return PyUnicode_DecodeLatin1(s, size, errors);
3402 }
3403 }
Victor Stinner37296e82010-06-10 13:36:23 +00003404 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003405
3406 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003407 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00003408 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003409 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00003410 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003411 if (buffer == NULL)
3412 goto onError;
Nick Coghlanc72e4e62013-11-22 22:39:36 +10003413 unicode = _PyCodec_DecodeText(buffer, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003414 if (unicode == NULL)
3415 goto onError;
3416 if (!PyUnicode_Check(unicode)) {
3417 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003418 "'%.400s' decoder returned '%.400s' instead of 'str'; "
Nick Coghlan8b097b42013-11-13 23:49:21 +10003419 "use codecs.decode() to decode to arbitrary types",
Victor Stinner998b8062018-09-12 00:23:25 +02003420 encoding,
3421 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003422 Py_DECREF(unicode);
3423 goto onError;
3424 }
3425 Py_DECREF(buffer);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003426 return unicode_result(unicode);
Tim Petersced69f82003-09-16 20:30:58 +00003427
Benjamin Peterson29060642009-01-31 22:14:21 +00003428 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003429 Py_XDECREF(buffer);
3430 return NULL;
3431}
3432
Alexander Belopolsky40018472011-02-26 01:02:56 +00003433PyObject *
3434PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003435 const char *encoding,
3436 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003437{
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003438 if (!PyUnicode_Check(unicode)) {
3439 PyErr_BadArgument();
Serhiy Storchaka77eede32016-10-25 10:07:51 +03003440 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003441 }
3442
Serhiy Storchaka00939072016-10-27 21:05:49 +03003443 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3444 "PyUnicode_AsDecodedObject() is deprecated; "
3445 "use PyCodec_Decode() to decode from str", 1) < 0)
3446 return NULL;
3447
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003448 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003449 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003450
3451 /* Decode via the codec registry */
Serhiy Storchaka77eede32016-10-25 10:07:51 +03003452 return PyCodec_Decode(unicode, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003453}
3454
Alexander Belopolsky40018472011-02-26 01:02:56 +00003455PyObject *
3456PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003457 const char *encoding,
3458 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003459{
3460 PyObject *v;
3461
3462 if (!PyUnicode_Check(unicode)) {
3463 PyErr_BadArgument();
3464 goto onError;
3465 }
3466
Serhiy Storchaka00939072016-10-27 21:05:49 +03003467 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3468 "PyUnicode_AsDecodedUnicode() is deprecated; "
3469 "use PyCodec_Decode() to decode from str to str", 1) < 0)
3470 return NULL;
3471
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003472 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003473 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003474
3475 /* Decode via the codec registry */
3476 v = PyCodec_Decode(unicode, encoding, errors);
3477 if (v == NULL)
3478 goto onError;
3479 if (!PyUnicode_Check(v)) {
3480 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003481 "'%.400s' decoder returned '%.400s' instead of 'str'; "
Nick Coghlan8b097b42013-11-13 23:49:21 +10003482 "use codecs.decode() to decode to arbitrary types",
Victor Stinner998b8062018-09-12 00:23:25 +02003483 encoding,
3484 Py_TYPE(unicode)->tp_name);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003485 Py_DECREF(v);
3486 goto onError;
3487 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003488 return unicode_result(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003489
Benjamin Peterson29060642009-01-31 22:14:21 +00003490 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003491 return NULL;
3492}
3493
Alexander Belopolsky40018472011-02-26 01:02:56 +00003494PyObject *
3495PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003496 Py_ssize_t size,
3497 const char *encoding,
3498 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003499{
3500 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00003501
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02003502 unicode = PyUnicode_FromWideChar(s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003503 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003504 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003505 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
3506 Py_DECREF(unicode);
3507 return v;
3508}
3509
Alexander Belopolsky40018472011-02-26 01:02:56 +00003510PyObject *
3511PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003512 const char *encoding,
3513 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003514{
3515 PyObject *v;
3516
3517 if (!PyUnicode_Check(unicode)) {
3518 PyErr_BadArgument();
3519 goto onError;
3520 }
3521
Serhiy Storchaka00939072016-10-27 21:05:49 +03003522 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3523 "PyUnicode_AsEncodedObject() is deprecated; "
3524 "use PyUnicode_AsEncodedString() to encode from str to bytes "
3525 "or PyCodec_Encode() for generic encoding", 1) < 0)
3526 return NULL;
3527
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003528 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003529 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003530
3531 /* Encode via the codec registry */
3532 v = PyCodec_Encode(unicode, encoding, errors);
3533 if (v == NULL)
3534 goto onError;
3535 return v;
3536
Benjamin Peterson29060642009-01-31 22:14:21 +00003537 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003538 return NULL;
3539}
3540
Victor Stinner1b579672011-12-17 05:47:23 +01003541
Victor Stinner2cba6b82018-01-10 22:46:15 +01003542static PyObject *
Victor Stinner709d23d2019-05-02 14:56:30 -04003543unicode_encode_locale(PyObject *unicode, _Py_error_handler error_handler,
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003544 int current_locale)
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003545{
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003546 Py_ssize_t wlen;
3547 wchar_t *wstr = PyUnicode_AsWideCharString(unicode, &wlen);
3548 if (wstr == NULL) {
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003549 return NULL;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003550 }
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003551
Victor Stinnerbde9d6b2018-11-28 10:26:20 +01003552 if ((size_t)wlen != wcslen(wstr)) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003553 PyErr_SetString(PyExc_ValueError, "embedded null character");
Victor Stinnerbde9d6b2018-11-28 10:26:20 +01003554 PyMem_Free(wstr);
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003555 return NULL;
3556 }
3557
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003558 char *str;
3559 size_t error_pos;
3560 const char *reason;
3561 int res = _Py_EncodeLocaleEx(wstr, &str, &error_pos, &reason,
Victor Stinner3d4226a2018-08-29 22:21:32 +02003562 current_locale, error_handler);
Victor Stinnerbde9d6b2018-11-28 10:26:20 +01003563 PyMem_Free(wstr);
3564
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003565 if (res != 0) {
3566 if (res == -2) {
3567 PyObject *exc;
3568 exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnns",
3569 "locale", unicode,
3570 (Py_ssize_t)error_pos,
3571 (Py_ssize_t)(error_pos+1),
3572 reason);
3573 if (exc != NULL) {
3574 PyCodec_StrictErrors(exc);
3575 Py_DECREF(exc);
3576 }
Victor Stinner2cba6b82018-01-10 22:46:15 +01003577 }
Victor Stinner3d4226a2018-08-29 22:21:32 +02003578 else if (res == -3) {
3579 PyErr_SetString(PyExc_ValueError, "unsupported error handler");
3580 }
Victor Stinner2cba6b82018-01-10 22:46:15 +01003581 else {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003582 PyErr_NoMemory();
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003583 }
Victor Stinnerbde9d6b2018-11-28 10:26:20 +01003584 return NULL;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003585 }
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003586
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003587 PyObject *bytes = PyBytes_FromString(str);
3588 PyMem_RawFree(str);
3589 return bytes;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003590}
3591
Victor Stinnerad158722010-10-27 00:25:46 +00003592PyObject *
Victor Stinner2cba6b82018-01-10 22:46:15 +01003593PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
3594{
Victor Stinner709d23d2019-05-02 14:56:30 -04003595 _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
3596 return unicode_encode_locale(unicode, error_handler, 1);
Victor Stinner2cba6b82018-01-10 22:46:15 +01003597}
3598
3599PyObject *
Victor Stinnerad158722010-10-27 00:25:46 +00003600PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00003601{
Victor Stinnercaba55b2018-08-03 15:33:52 +02003602 PyInterpreterState *interp = _PyInterpreterState_GET_UNSAFE();
Victor Stinnere2510952019-05-02 11:28:57 -04003603#ifdef _Py_FORCE_UTF8_FS_ENCODING
Victor Stinner709d23d2019-05-02 14:56:30 -04003604 if (interp->fs_codec.encoding) {
3605 return unicode_encode_utf8(unicode,
3606 interp->fs_codec.error_handler,
3607 interp->fs_codec.errors);
3608 }
3609 else {
Victor Stinner331a6a52019-05-27 16:39:22 +02003610 const wchar_t *filesystem_errors = interp->config.filesystem_errors;
Victor Stinner709d23d2019-05-02 14:56:30 -04003611 _Py_error_handler errors;
Victor Stinner331a6a52019-05-27 16:39:22 +02003612 errors = get_error_handler_wide(filesystem_errors);
Victor Stinner709d23d2019-05-02 14:56:30 -04003613 assert(errors != _Py_ERROR_UNKNOWN);
3614 return unicode_encode_utf8(unicode, errors, NULL);
3615 }
Victor Stinnerb2457ef2018-08-29 13:25:36 +02003616#else
Victor Stinner793b5312011-04-27 00:24:21 +02003617 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3618 cannot use it to encode and decode filenames before it is loaded. Load
3619 the Python codec requires to encode at least its own filename. Use the C
Victor Stinnerb2457ef2018-08-29 13:25:36 +02003620 implementation of the locale codec until the codec registry is
Victor Stinner22eb6892019-06-26 00:51:05 +02003621 initialized and the Python codec is loaded.
3622 See _PyUnicode_InitEncodings(). */
Victor Stinner709d23d2019-05-02 14:56:30 -04003623 if (interp->fs_codec.encoding) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00003624 return PyUnicode_AsEncodedString(unicode,
Victor Stinner709d23d2019-05-02 14:56:30 -04003625 interp->fs_codec.encoding,
3626 interp->fs_codec.errors);
Victor Stinnerc39211f2010-09-29 16:35:47 +00003627 }
3628 else {
Victor Stinner331a6a52019-05-27 16:39:22 +02003629 const wchar_t *filesystem_errors = interp->config.filesystem_errors;
Victor Stinner709d23d2019-05-02 14:56:30 -04003630 _Py_error_handler errors;
Victor Stinner331a6a52019-05-27 16:39:22 +02003631 errors = get_error_handler_wide(filesystem_errors);
Victor Stinner709d23d2019-05-02 14:56:30 -04003632 assert(errors != _Py_ERROR_UNKNOWN);
3633 return unicode_encode_locale(unicode, errors, 0);
Victor Stinnerc39211f2010-09-29 16:35:47 +00003634 }
Victor Stinnerad158722010-10-27 00:25:46 +00003635#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00003636}
3637
/* Encode a str object and return the result as a bytes object.

   unicode:  must be a str object, otherwise PyErr_BadArgument() is raised.
   encoding: codec name; NULL selects the UTF-8 encoder directly.
   errors:   error handler name, forwarded to whichever encoder runs.

   A handful of common encodings (UTF-8/16/32, ASCII, Latin-1 and, on
   Windows, "mbcs") are dispatched straight to their C encoders; everything
   else goes through _PyCodec_EncodeText().  A codec that returns a
   bytearray triggers a RuntimeWarning and the data is copied into a new
   bytes object; any other non-bytes result raises TypeError.

   Returns NULL with an exception set on failure. */
PyObject *
PyUnicode_AsEncodedString(PyObject *unicode,
                          const char *encoding,
                          const char *errors)
{
    PyObject *v;
    char buflower[11];   /* sizeof("iso_8859_1") == 11 (10 chars + NUL), longest shortcut */

    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }

    if (unicode_check_encoding_errors(encoding, errors) < 0) {
        return NULL;
    }

    if (encoding == NULL) {
        return _PyUnicode_AsUTF8String(unicode, errors);
    }

    /* Shortcuts for common default encodings */
    /* NOTE(review): a zero return from _Py_normalize_encoding() presumably
       means the name did not fit in buflower; in that case the fast paths
       are skipped and the codec registry handles the name. */
    if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
        char *lower = buflower;

        /* Fast paths */
        if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
            lower += 3;
            if (*lower == '_') {
                /* Match "utf8" and "utf_8" */
                lower++;
            }

            if (lower[0] == '8' && lower[1] == 0) {
                return _PyUnicode_AsUTF8String(unicode, errors);
            }
            else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
                return _PyUnicode_EncodeUTF16(unicode, errors, 0);
            }
            else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
                return _PyUnicode_EncodeUTF32(unicode, errors, 0);
            }
            /* Any other "utf..." name falls through to the registry. */
        }
        else {
            if (strcmp(lower, "ascii") == 0
                || strcmp(lower, "us_ascii") == 0) {
                return _PyUnicode_AsASCIIString(unicode, errors);
            }
#ifdef MS_WINDOWS
            else if (strcmp(lower, "mbcs") == 0) {
                return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
            }
#endif
            else if (strcmp(lower, "latin1") == 0 ||
                     strcmp(lower, "latin_1") == 0 ||
                     strcmp(lower, "iso_8859_1") == 0 ||
                     strcmp(lower, "iso8859_1") == 0) {
                return _PyUnicode_AsLatin1String(unicode, errors);
            }
        }
    }

    /* Encode via the codec registry */
    v = _PyCodec_EncodeText(unicode, encoding, errors);
    if (v == NULL)
        return NULL;

    /* The normal path */
    if (PyBytes_Check(v))
        return v;

    /* If the codec returns a buffer, raise a warning and convert to bytes */
    if (PyByteArray_Check(v)) {
        int error;
        PyObject *b;

        error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
                                 "encoder %s returned bytearray instead of bytes; "
                                 "use codecs.encode() to encode to arbitrary types",
                                 encoding);
        if (error) {
            /* The warning was turned into an exception. */
            Py_DECREF(v);
            return NULL;
        }

        b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v),
                                      PyByteArray_GET_SIZE(v));
        Py_DECREF(v);
        return b;
    }

    /* Neither bytes nor bytearray: report the type the codec returned. */
    PyErr_Format(PyExc_TypeError,
                 "'%.400s' encoder returned '%.400s' instead of 'bytes'; "
                 "use codecs.encode() to encode to arbitrary types",
                 encoding,
                 Py_TYPE(v)->tp_name);
    Py_DECREF(v);
    return NULL;
}
3737
Alexander Belopolsky40018472011-02-26 01:02:56 +00003738PyObject *
3739PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003740 const char *encoding,
3741 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003742{
3743 PyObject *v;
3744
3745 if (!PyUnicode_Check(unicode)) {
3746 PyErr_BadArgument();
3747 goto onError;
3748 }
3749
Serhiy Storchaka00939072016-10-27 21:05:49 +03003750 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3751 "PyUnicode_AsEncodedUnicode() is deprecated; "
3752 "use PyCodec_Encode() to encode from str to str", 1) < 0)
3753 return NULL;
3754
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003755 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003756 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003757
3758 /* Encode via the codec registry */
3759 v = PyCodec_Encode(unicode, encoding, errors);
3760 if (v == NULL)
3761 goto onError;
3762 if (!PyUnicode_Check(v)) {
3763 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003764 "'%.400s' encoder returned '%.400s' instead of 'str'; "
Nick Coghlan8b097b42013-11-13 23:49:21 +10003765 "use codecs.encode() to encode to arbitrary types",
Victor Stinner998b8062018-09-12 00:23:25 +02003766 encoding,
3767 Py_TYPE(v)->tp_name);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003768 Py_DECREF(v);
3769 goto onError;
3770 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003771 return v;
Tim Petersced69f82003-09-16 20:30:58 +00003772
Benjamin Peterson29060642009-01-31 22:14:21 +00003773 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003774 return NULL;
3775}
3776
Victor Stinner2cba6b82018-01-10 22:46:15 +01003777static PyObject*
Victor Stinner709d23d2019-05-02 14:56:30 -04003778unicode_decode_locale(const char *str, Py_ssize_t len,
3779 _Py_error_handler errors, int current_locale)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003780{
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003781 if (str[len] != '\0' || (size_t)len != strlen(str)) {
3782 PyErr_SetString(PyExc_ValueError, "embedded null byte");
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003783 return NULL;
3784 }
3785
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003786 wchar_t *wstr;
3787 size_t wlen;
3788 const char *reason;
3789 int res = _Py_DecodeLocaleEx(str, &wstr, &wlen, &reason,
Victor Stinner709d23d2019-05-02 14:56:30 -04003790 current_locale, errors);
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003791 if (res != 0) {
3792 if (res == -2) {
3793 PyObject *exc;
3794 exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nns",
3795 "locale", str, len,
3796 (Py_ssize_t)wlen,
3797 (Py_ssize_t)(wlen + 1),
3798 reason);
3799 if (exc != NULL) {
3800 PyCodec_StrictErrors(exc);
3801 Py_DECREF(exc);
3802 }
Victor Stinner2cba6b82018-01-10 22:46:15 +01003803 }
Victor Stinner3d4226a2018-08-29 22:21:32 +02003804 else if (res == -3) {
3805 PyErr_SetString(PyExc_ValueError, "unsupported error handler");
3806 }
Victor Stinner2cba6b82018-01-10 22:46:15 +01003807 else {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003808 PyErr_NoMemory();
Victor Stinner2cba6b82018-01-10 22:46:15 +01003809 }
Victor Stinner2f197072011-12-17 07:08:30 +01003810 return NULL;
Victor Stinner2f197072011-12-17 07:08:30 +01003811 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003812
3813 PyObject *unicode = PyUnicode_FromWideChar(wstr, wlen);
3814 PyMem_RawFree(wstr);
3815 return unicode;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003816}
3817
3818PyObject*
Victor Stinner2cba6b82018-01-10 22:46:15 +01003819PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
3820 const char *errors)
3821{
Victor Stinner709d23d2019-05-02 14:56:30 -04003822 _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
3823 return unicode_decode_locale(str, len, error_handler, 1);
Victor Stinner2cba6b82018-01-10 22:46:15 +01003824}
3825
3826PyObject*
Victor Stinner1b579672011-12-17 05:47:23 +01003827PyUnicode_DecodeLocale(const char *str, const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003828{
3829 Py_ssize_t size = (Py_ssize_t)strlen(str);
Victor Stinner709d23d2019-05-02 14:56:30 -04003830 _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
3831 return unicode_decode_locale(str, size, error_handler, 1);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003832}
3833
3834
3835PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00003836PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003837 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00003838 return PyUnicode_DecodeFSDefaultAndSize(s, size);
3839}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003840
Christian Heimes5894ba72007-11-04 11:43:14 +00003841PyObject*
3842PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3843{
Victor Stinnercaba55b2018-08-03 15:33:52 +02003844 PyInterpreterState *interp = _PyInterpreterState_GET_UNSAFE();
Victor Stinnere2510952019-05-02 11:28:57 -04003845#ifdef _Py_FORCE_UTF8_FS_ENCODING
Victor Stinner709d23d2019-05-02 14:56:30 -04003846 if (interp->fs_codec.encoding) {
3847 return unicode_decode_utf8(s, size,
3848 interp->fs_codec.error_handler,
3849 interp->fs_codec.errors,
3850 NULL);
3851 }
3852 else {
Victor Stinner331a6a52019-05-27 16:39:22 +02003853 const wchar_t *filesystem_errors = interp->config.filesystem_errors;
Victor Stinner709d23d2019-05-02 14:56:30 -04003854 _Py_error_handler errors;
Victor Stinner331a6a52019-05-27 16:39:22 +02003855 errors = get_error_handler_wide(filesystem_errors);
Victor Stinner709d23d2019-05-02 14:56:30 -04003856 assert(errors != _Py_ERROR_UNKNOWN);
3857 return unicode_decode_utf8(s, size, errors, NULL, NULL);
3858 }
Victor Stinnerb2457ef2018-08-29 13:25:36 +02003859#else
Victor Stinner793b5312011-04-27 00:24:21 +02003860 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3861 cannot use it to encode and decode filenames before it is loaded. Load
3862 the Python codec requires to encode at least its own filename. Use the C
Victor Stinnerb2457ef2018-08-29 13:25:36 +02003863 implementation of the locale codec until the codec registry is
Victor Stinner22eb6892019-06-26 00:51:05 +02003864 initialized and the Python codec is loaded.
3865 See _PyUnicode_InitEncodings(). */
Victor Stinner709d23d2019-05-02 14:56:30 -04003866 if (interp->fs_codec.encoding) {
Steve Dower78057b42016-11-06 19:35:08 -08003867 return PyUnicode_Decode(s, size,
Victor Stinner709d23d2019-05-02 14:56:30 -04003868 interp->fs_codec.encoding,
3869 interp->fs_codec.errors);
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003870 }
3871 else {
Victor Stinner331a6a52019-05-27 16:39:22 +02003872 const wchar_t *filesystem_errors = interp->config.filesystem_errors;
Victor Stinner709d23d2019-05-02 14:56:30 -04003873 _Py_error_handler errors;
Victor Stinner331a6a52019-05-27 16:39:22 +02003874 errors = get_error_handler_wide(filesystem_errors);
Victor Stinner709d23d2019-05-02 14:56:30 -04003875 return unicode_decode_locale(s, size, errors, 0);
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003876 }
Victor Stinnerad158722010-10-27 00:25:46 +00003877#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003878}
3879
Martin v. Löwis011e8422009-05-05 04:43:17 +00003880
3881int
3882PyUnicode_FSConverter(PyObject* arg, void* addr)
3883{
Brett Cannonec6ce872016-09-06 15:50:29 -07003884 PyObject *path = NULL;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003885 PyObject *output = NULL;
3886 Py_ssize_t size;
3887 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003888 if (arg == NULL) {
3889 Py_DECREF(*(PyObject**)addr);
Benjamin Petersona4d33b32015-11-15 21:57:39 -08003890 *(PyObject**)addr = NULL;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003891 return 1;
3892 }
Brett Cannonec6ce872016-09-06 15:50:29 -07003893 path = PyOS_FSPath(arg);
3894 if (path == NULL) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03003895 return 0;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003896 }
Brett Cannonec6ce872016-09-06 15:50:29 -07003897 if (PyBytes_Check(path)) {
3898 output = path;
3899 }
3900 else { // PyOS_FSPath() guarantees its returned value is bytes or str.
3901 output = PyUnicode_EncodeFSDefault(path);
3902 Py_DECREF(path);
3903 if (!output) {
3904 return 0;
3905 }
3906 assert(PyBytes_Check(output));
3907 }
3908
Victor Stinner0ea2a462010-04-30 00:22:08 +00003909 size = PyBytes_GET_SIZE(output);
3910 data = PyBytes_AS_STRING(output);
Victor Stinner12174a52014-08-15 23:17:38 +02003911 if ((size_t)size != strlen(data)) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003912 PyErr_SetString(PyExc_ValueError, "embedded null byte");
Martin v. Löwis011e8422009-05-05 04:43:17 +00003913 Py_DECREF(output);
3914 return 0;
3915 }
3916 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003917 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003918}
3919
3920
/* Argument converter producing a str path.

   arg == NULL is the cleanup call: the object stored at *addr is released
   and the slot is cleared; returns 1.

   Otherwise objects supporting the buffer protocol are used directly and
   everything else is passed through PyOS_FSPath().  A str is used as is;
   bytes (or a buffer object, after a DeprecationWarning when it is not
   bytes) is decoded with PyUnicode_DecodeFSDefaultAndSize().  A result
   containing an embedded null character raises ValueError.  On success the
   str object is stored at *addr and Py_CLEANUP_SUPPORTED is returned; on
   failure 0 is returned with an exception set. */
int
PyUnicode_FSDecoder(PyObject* arg, void* addr)
{
    int is_buffer = 0;
    PyObject *path = NULL;
    PyObject *output = NULL;
    if (arg == NULL) {
        Py_DECREF(*(PyObject**)addr);
        *(PyObject**)addr = NULL;
        return 1;
    }

    is_buffer = PyObject_CheckBuffer(arg);
    if (!is_buffer) {
        path = PyOS_FSPath(arg);
        if (path == NULL) {
            return 0;
        }
    }
    else {
        /* Take our own reference so that every branch below can release
           `path` uniformly. */
        path = arg;
        Py_INCREF(arg);
    }

    if (PyUnicode_Check(path)) {
        /* Already a str: the reference held in `path` becomes the result. */
        output = path;
    }
    else if (PyBytes_Check(path) || is_buffer) {
        PyObject *path_bytes = NULL;

        /* Non-bytes buffer objects are accepted, but only with a warning;
           a non-zero return means the warning was raised as an error. */
        if (!PyBytes_Check(path) &&
            PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
            "path should be string, bytes, or os.PathLike, not %.200s",
            Py_TYPE(arg)->tp_name)) {
            Py_DECREF(path);
            return 0;
        }
        path_bytes = PyBytes_FromObject(path);
        Py_DECREF(path);
        if (!path_bytes) {
            return 0;
        }
        output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(path_bytes),
                                                  PyBytes_GET_SIZE(path_bytes));
        Py_DECREF(path_bytes);
        if (!output) {
            return 0;
        }
    }
    else {
        PyErr_Format(PyExc_TypeError,
                     "path should be string, bytes, or os.PathLike, not %.200s",
                     Py_TYPE(arg)->tp_name);
        Py_DECREF(path);
        return 0;
    }
    if (PyUnicode_READY(output) == -1) {
        Py_DECREF(output);
        return 0;
    }
    /* Search the whole string for a U+0000 character. */
    if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
                 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
        PyErr_SetString(PyExc_ValueError, "embedded null character");
        Py_DECREF(output);
        return 0;
    }
    *(PyObject**)addr = output;
    return Py_CLEANUP_SUPPORTED;
}
3990
3991
/* Return a pointer to the UTF-8 representation of a str object.

   If the object has no UTF-8 buffer yet, the string is encoded once and
   the encoded data is copied into a buffer attached to the object, so the
   returned pointer refers to storage kept on `unicode` itself.

   unicode: must be a str object, otherwise PyErr_BadArgument() is raised.
   psize:   if not NULL, receives the UTF-8 length.

   Returns NULL with an exception set on failure. */
const char *
PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
{
    PyObject *bytes;

    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }
    if (PyUnicode_READY(unicode) == -1)
        return NULL;

    if (PyUnicode_UTF8(unicode) == NULL) {
        assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
        bytes = _PyUnicode_AsUTF8String(unicode, NULL);
        if (bytes == NULL)
            return NULL;
        /* One extra byte is allocated and copied beyond the encoded size;
           presumably the trailing NUL of the bytes object. */
        _PyUnicode_UTF8(unicode) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
        if (_PyUnicode_UTF8(unicode) == NULL) {
            PyErr_NoMemory();
            Py_DECREF(bytes);
            return NULL;
        }
        _PyUnicode_UTF8_LENGTH(unicode) = PyBytes_GET_SIZE(bytes);
        memcpy(_PyUnicode_UTF8(unicode),
                  PyBytes_AS_STRING(bytes),
                  _PyUnicode_UTF8_LENGTH(unicode) + 1);
        /* The temporary bytes object is no longer needed once copied. */
        Py_DECREF(bytes);
    }

    if (psize)
        *psize = PyUnicode_UTF8_LENGTH(unicode);
    return PyUnicode_UTF8(unicode);
}
4026
Serhiy Storchaka2a404b62017-01-22 23:07:07 +02004027const char *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004028PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00004029{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004030 return PyUnicode_AsUTF8AndSize(unicode, NULL);
4031}
4032
/* Return the wchar_t (Py_UNICODE) representation of a str object.

   If the object has no wide-character buffer yet, one is allocated, filled
   from the string's contents and attached to the object.

   unicode: must be a str object, otherwise PyErr_BadArgument() is raised.
   size:    if not NULL, receives PyUnicode_WSTR_LENGTH(unicode).

   Returns NULL with an exception set on failure. */
Py_UNICODE *
PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
{
    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }
    Py_UNICODE *w = _PyUnicode_WSTR(unicode);
    if (w == NULL) {
        /* Non-ASCII compact unicode object */
        assert(_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND);
        assert(PyUnicode_IS_READY(unicode));

        Py_ssize_t wlen = unicode_get_widechar_size(unicode);
        /* Guard the byte-size computation below (wlen + 1 elements)
           against exceeding PY_SSIZE_T_MAX. */
        if ((size_t)wlen > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
            PyErr_NoMemory();
            return NULL;
        }
        w = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) * (wlen + 1));
        if (w == NULL) {
            PyErr_NoMemory();
            return NULL;
        }
        /* wlen + 1: room for one element beyond the characters,
           presumably the terminating null. */
        unicode_copy_as_widechar(unicode, w, wlen + 1);
        _PyUnicode_WSTR(unicode) = w;
        /* NOTE(review): the length is not stored for compact ASCII
           objects; presumably they have no separate wstr length slot --
           confirm against the object layout. */
        if (!PyUnicode_IS_COMPACT_ASCII(unicode)) {
            _PyUnicode_WSTR_LENGTH(unicode) = wlen;
        }
    }
    if (size != NULL)
        *size = PyUnicode_WSTR_LENGTH(unicode);
    return w;
}
4066
Alexander Belopolsky40018472011-02-26 01:02:56 +00004067Py_UNICODE *
4068PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004069{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004070 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004071}
4072
Serhiy Storchakaf7eae0a2017-06-28 08:30:06 +03004073const Py_UNICODE *
4074_PyUnicode_AsUnicode(PyObject *unicode)
4075{
4076 Py_ssize_t size;
4077 const Py_UNICODE *wstr;
4078
4079 wstr = PyUnicode_AsUnicodeAndSize(unicode, &size);
4080 if (wstr && wcslen(wstr) != (size_t)size) {
4081 PyErr_SetString(PyExc_ValueError, "embedded null character");
4082 return NULL;
4083 }
4084 return wstr;
4085}
4086
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004087
Alexander Belopolsky40018472011-02-26 01:02:56 +00004088Py_ssize_t
4089PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004090{
4091 if (!PyUnicode_Check(unicode)) {
4092 PyErr_BadArgument();
4093 goto onError;
4094 }
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02004095 if (_PyUnicode_WSTR(unicode) == NULL) {
4096 if (PyUnicode_AsUnicode(unicode) == NULL)
4097 goto onError;
4098 }
4099 return PyUnicode_WSTR_LENGTH(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004100
Benjamin Peterson29060642009-01-31 22:14:21 +00004101 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004102 return -1;
4103}
4104
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004105Py_ssize_t
4106PyUnicode_GetLength(PyObject *unicode)
4107{
Victor Stinner07621332012-06-16 04:53:46 +02004108 if (!PyUnicode_Check(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004109 PyErr_BadArgument();
4110 return -1;
4111 }
Victor Stinner07621332012-06-16 04:53:46 +02004112 if (PyUnicode_READY(unicode) == -1)
4113 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004114 return PyUnicode_GET_LENGTH(unicode);
4115}
4116
4117Py_UCS4
4118PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
4119{
Victor Stinner69ed0f42013-04-09 21:48:24 +02004120 void *data;
4121 int kind;
4122
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +03004123 if (!PyUnicode_Check(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02004124 PyErr_BadArgument();
4125 return (Py_UCS4)-1;
4126 }
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +03004127 if (PyUnicode_READY(unicode) == -1) {
4128 return (Py_UCS4)-1;
4129 }
Victor Stinnerc4b49542011-12-11 22:44:26 +01004130 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02004131 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004132 return (Py_UCS4)-1;
4133 }
Victor Stinner69ed0f42013-04-09 21:48:24 +02004134 data = PyUnicode_DATA(unicode);
4135 kind = PyUnicode_KIND(unicode);
4136 return PyUnicode_READ(kind, data, index);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004137}
4138
4139int
4140PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
4141{
4142 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02004143 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004144 return -1;
4145 }
Victor Stinner488fa492011-12-12 00:01:39 +01004146 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerc4b49542011-12-11 22:44:26 +01004147 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02004148 PyErr_SetString(PyExc_IndexError, "string index out of range");
4149 return -1;
4150 }
Victor Stinner488fa492011-12-12 00:01:39 +01004151 if (unicode_check_modifiable(unicode))
Victor Stinnercd9950f2011-10-02 00:34:53 +02004152 return -1;
Victor Stinnerc9590ad2012-03-04 01:34:37 +01004153 if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
4154 PyErr_SetString(PyExc_ValueError, "character out of range");
4155 return -1;
4156 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004157 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
4158 index, ch);
4159 return 0;
4160}
4161
/* Return the name of the default encoding.  The value is the constant
   string "utf-8"; the caller must not modify or free it. */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    return "utf-8";
}
4167
Victor Stinner554f3f02010-06-16 23:33:54 +00004168/* create or adjust a UnicodeDecodeError */
4169static void
4170make_decode_exception(PyObject **exceptionObject,
4171 const char *encoding,
4172 const char *input, Py_ssize_t length,
4173 Py_ssize_t startpos, Py_ssize_t endpos,
4174 const char *reason)
4175{
4176 if (*exceptionObject == NULL) {
4177 *exceptionObject = PyUnicodeDecodeError_Create(
4178 encoding, input, length, startpos, endpos, reason);
4179 }
4180 else {
4181 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
4182 goto onError;
4183 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
4184 goto onError;
4185 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
4186 goto onError;
4187 }
4188 return;
4189
4190onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02004191 Py_CLEAR(*exceptionObject);
Victor Stinner554f3f02010-06-16 23:33:54 +00004192}
4193
Steve Dowercc16be82016-09-08 10:35:16 -07004194#ifdef MS_WINDOWS
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004195static int
4196widechar_resize(wchar_t **buf, Py_ssize_t *size, Py_ssize_t newsize)
4197{
4198 if (newsize > *size) {
4199 wchar_t *newbuf = *buf;
4200 if (PyMem_Resize(newbuf, wchar_t, newsize) == NULL) {
4201 PyErr_NoMemory();
4202 return -1;
4203 }
4204 *buf = newbuf;
4205 }
4206 *size = newsize;
4207 return 0;
4208}
4209
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004210/* error handling callback helper:
4211 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00004212 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004213 and adjust various state variables.
4214 return 0 on success, -1 on error
4215*/
4216
Alexander Belopolsky40018472011-02-26 01:02:56 +00004217static int
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004218unicode_decode_call_errorhandler_wchar(
4219 const char *errors, PyObject **errorHandler,
4220 const char *encoding, const char *reason,
4221 const char **input, const char **inend, Py_ssize_t *startinpos,
4222 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004223 wchar_t **buf, Py_ssize_t *bufsize, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004224{
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004225 static const char *argparse = "Un;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004226
4227 PyObject *restuple = NULL;
4228 PyObject *repunicode = NULL;
Victor Stinner596a6c42011-11-09 00:02:18 +01004229 Py_ssize_t outsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004230 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004231 Py_ssize_t requiredsize;
4232 Py_ssize_t newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004233 PyObject *inputobj = NULL;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004234 wchar_t *repwstr;
4235 Py_ssize_t repwlen;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004236
4237 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004238 *errorHandler = PyCodec_LookupError(errors);
4239 if (*errorHandler == NULL)
4240 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004241 }
4242
Victor Stinner554f3f02010-06-16 23:33:54 +00004243 make_decode_exception(exceptionObject,
4244 encoding,
4245 *input, *inend - *input,
4246 *startinpos, *endinpos,
4247 reason);
4248 if (*exceptionObject == NULL)
4249 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004250
Jeroen Demeyer196a5302019-07-04 12:31:34 +02004251 restuple = _PyObject_CallOneArg(*errorHandler, *exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004252 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004253 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004254 if (!PyTuple_Check(restuple)) {
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004255 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00004256 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004257 }
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004258 if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00004259 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004260
4261 /* Copy back the bytes variables, which might have been modified by the
4262 callback */
4263 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4264 if (!inputobj)
4265 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004266 *input = PyBytes_AS_STRING(inputobj);
4267 insize = PyBytes_GET_SIZE(inputobj);
4268 *inend = *input + insize;
4269 /* we can DECREF safely, as the exception has another reference,
4270 so the object won't go away. */
4271 Py_DECREF(inputobj);
4272
4273 if (newpos<0)
4274 newpos = insize+newpos;
4275 if (newpos<0 || newpos>insize) {
Victor Stinnera33bce02014-07-04 22:47:46 +02004276 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004277 goto onError;
4278 }
4279
4280 repwstr = PyUnicode_AsUnicodeAndSize(repunicode, &repwlen);
4281 if (repwstr == NULL)
4282 goto onError;
4283 /* need more space? (at least enough for what we
4284 have+the replacement+the rest of the string (starting
4285 at the new input position), so we won't have to check space
4286 when there are no errors in the rest of the string) */
Benjamin Peterson2b76ce62014-09-29 18:50:06 -04004287 requiredsize = *outpos;
4288 if (requiredsize > PY_SSIZE_T_MAX - repwlen)
4289 goto overflow;
4290 requiredsize += repwlen;
4291 if (requiredsize > PY_SSIZE_T_MAX - (insize - newpos))
4292 goto overflow;
4293 requiredsize += insize - newpos;
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004294 outsize = *bufsize;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004295 if (requiredsize > outsize) {
Benjamin Peterson2b76ce62014-09-29 18:50:06 -04004296 if (outsize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*outsize)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004297 requiredsize = 2*outsize;
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004298 if (widechar_resize(buf, bufsize, requiredsize) < 0) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004299 goto onError;
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004300 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004301 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004302 wcsncpy(*buf + *outpos, repwstr, repwlen);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004303 *outpos += repwlen;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004304 *endinpos = newpos;
4305 *inptr = *input + newpos;
4306
4307 /* we made it! */
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004308 Py_DECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004309 return 0;
4310
Benjamin Peterson2b76ce62014-09-29 18:50:06 -04004311 overflow:
4312 PyErr_SetString(PyExc_OverflowError,
4313 "decoded result is too long for a Python string");
4314
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004315 onError:
4316 Py_XDECREF(restuple);
4317 return -1;
4318}
Steve Dowercc16be82016-09-08 10:35:16 -07004319#endif /* MS_WINDOWS */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004320
4321static int
4322unicode_decode_call_errorhandler_writer(
4323 const char *errors, PyObject **errorHandler,
4324 const char *encoding, const char *reason,
4325 const char **input, const char **inend, Py_ssize_t *startinpos,
4326 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
4327 _PyUnicodeWriter *writer /* PyObject **output, Py_ssize_t *outpos */)
4328{
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004329 static const char *argparse = "Un;decoding error handler must return (str, int) tuple";
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004330
4331 PyObject *restuple = NULL;
4332 PyObject *repunicode = NULL;
4333 Py_ssize_t insize;
4334 Py_ssize_t newpos;
Victor Stinner170ca6f2013-04-18 00:25:28 +02004335 Py_ssize_t replen;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004336 Py_ssize_t remain;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004337 PyObject *inputobj = NULL;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004338 int need_to_grow = 0;
4339 const char *new_inptr;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004340
4341 if (*errorHandler == NULL) {
4342 *errorHandler = PyCodec_LookupError(errors);
4343 if (*errorHandler == NULL)
4344 goto onError;
4345 }
4346
4347 make_decode_exception(exceptionObject,
4348 encoding,
4349 *input, *inend - *input,
4350 *startinpos, *endinpos,
4351 reason);
4352 if (*exceptionObject == NULL)
4353 goto onError;
4354
Jeroen Demeyer196a5302019-07-04 12:31:34 +02004355 restuple = _PyObject_CallOneArg(*errorHandler, *exceptionObject);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004356 if (restuple == NULL)
4357 goto onError;
4358 if (!PyTuple_Check(restuple)) {
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004359 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004360 goto onError;
4361 }
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004362 if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004363 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004364
4365 /* Copy back the bytes variables, which might have been modified by the
4366 callback */
4367 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4368 if (!inputobj)
4369 goto onError;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004370 remain = *inend - *input - *endinpos;
Christian Heimes72b710a2008-05-26 13:28:38 +00004371 *input = PyBytes_AS_STRING(inputobj);
4372 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004373 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00004374 /* we can DECREF safely, as the exception has another reference,
4375 so the object won't go away. */
4376 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004377
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004378 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004379 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004380 if (newpos<0 || newpos>insize) {
Victor Stinnera33bce02014-07-04 22:47:46 +02004381 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00004382 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004383 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004384
Victor Stinner170ca6f2013-04-18 00:25:28 +02004385 replen = PyUnicode_GET_LENGTH(repunicode);
Serhiy Storchaka7e4b9052015-01-26 01:22:54 +02004386 if (replen > 1) {
4387 writer->min_length += replen - 1;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004388 need_to_grow = 1;
4389 }
4390 new_inptr = *input + newpos;
4391 if (*inend - new_inptr > remain) {
4392 /* We don't know the decoding algorithm here so we make the worst
4393 assumption that one byte decodes to one unicode character.
4394 If unfortunately one byte could decode to more unicode characters,
4395 the decoder may write out-of-bound then. Is it possible for the
4396 algorithms using this function? */
4397 writer->min_length += *inend - new_inptr - remain;
4398 need_to_grow = 1;
4399 }
4400 if (need_to_grow) {
Victor Stinner8f674cc2013-04-17 23:02:17 +02004401 writer->overallocate = 1;
Serhiy Storchakab7e2d672018-02-13 08:27:33 +02004402 if (_PyUnicodeWriter_Prepare(writer, writer->min_length - writer->pos,
Serhiy Storchaka7e4b9052015-01-26 01:22:54 +02004403 PyUnicode_MAX_CHAR_VALUE(repunicode)) == -1)
4404 goto onError;
4405 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004406 if (_PyUnicodeWriter_WriteStr(writer, repunicode) == -1)
Victor Stinner376cfa12013-04-17 23:58:16 +02004407 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004408
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004409 *endinpos = newpos;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004410 *inptr = new_inptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004411
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004412 /* we made it! */
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004413 Py_DECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004414 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004415
Benjamin Peterson29060642009-01-31 22:14:21 +00004416 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004417 Py_XDECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004418 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004419}
4420
/* --- UTF-7 Codec -------------------------------------------------------- */

/* See RFC2152 for details.  We encode conservatively and decode liberally. */

/* Three simple macros defining base-64.  Each of them evaluates its
   argument more than once, so arguments must be free of side effects. */

/* Is c a base-64 character? */

#define IS_BASE64(c) \
    (((c) >= 'A' && (c) <= 'Z') ||     \
     ((c) >= 'a' && (c) <= 'z') ||     \
     ((c) >= '0' && (c) <= '9') ||     \
     (c) == '+' || (c) == '/')

/* given that c is a base-64 character, what is its base-64 value? */

#define FROM_BASE64(c)                                                  \
    (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :                           \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :                      \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :                      \
     (c) == '+' ? 62 : 63)

/* What is the base-64 character of the bottom 6 bits of n? */

#define TO_BASE64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
 * decoded as itself.  We are permissive on decoding; the only ASCII
 * byte not decoding to itself is the + which begins a base64
 * string. */

#define DECODE_DIRECT(c)                                \
    ((c) <= 127 && (c) != '+')
4455
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is indexed by the ASCII code and is only ever read, hence
 * const.
 */

static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * c is range-checked before it is used as a table index; NUL and
 * everything >= 128 are never direct.  The flag arguments are
 * parenthesized so that arbitrary expressions can be passed. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004501
Alexander Belopolsky40018472011-02-26 01:02:56 +00004502PyObject *
4503PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004504 Py_ssize_t size,
4505 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004506{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004507 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
4508}
4509
Antoine Pitrou244651a2009-05-04 18:56:13 +00004510/* The decoder. The only state we preserve is our read position,
4511 * i.e. how many characters we have consumed. So if we end in the
4512 * middle of a shift sequence we have to back off the read position
4513 * and the output to the beginning of the sequence, otherwise we lose
4514 * all the shift state (seen bits, number of bits seen, high
4515 * surrogate). */
4516
Alexander Belopolsky40018472011-02-26 01:02:56 +00004517PyObject *
4518PyUnicode_DecodeUTF7Stateful(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004519 Py_ssize_t size,
4520 const char *errors,
4521 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004522{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004523 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004524 Py_ssize_t startinpos;
4525 Py_ssize_t endinpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004526 const char *e;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004527 _PyUnicodeWriter writer;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004528 const char *errmsg = "";
4529 int inShift = 0;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004530 Py_ssize_t shiftOutStart;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004531 unsigned int base64bits = 0;
4532 unsigned long base64buffer = 0;
Victor Stinner24729f32011-11-10 20:31:37 +01004533 Py_UCS4 surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004534 PyObject *errorHandler = NULL;
4535 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004536
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004537 if (size == 0) {
4538 if (consumed)
4539 *consumed = 0;
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02004540 _Py_RETURN_UNICODE_EMPTY();
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004541 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004542
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004543 /* Start off assuming it's all ASCII. Widen later as necessary. */
Victor Stinner8f674cc2013-04-17 23:02:17 +02004544 _PyUnicodeWriter_Init(&writer);
4545 writer.min_length = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004546
4547 shiftOutStart = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004548 e = s + size;
4549
4550 while (s < e) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004551 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00004552 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00004553 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004554
Antoine Pitrou244651a2009-05-04 18:56:13 +00004555 if (inShift) { /* in a base-64 section */
4556 if (IS_BASE64(ch)) { /* consume a base-64 character */
4557 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
4558 base64bits += 6;
4559 s++;
4560 if (base64bits >= 16) {
4561 /* we have enough bits for a UTF-16 value */
Victor Stinner24729f32011-11-10 20:31:37 +01004562 Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
Antoine Pitrou244651a2009-05-04 18:56:13 +00004563 base64bits -= 16;
4564 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
Serhiy Storchaka35804e42013-10-19 20:38:19 +03004565 assert(outCh <= 0xffff);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004566 if (surrogate) {
4567 /* expecting a second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004568 if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
4569 Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004570 if (_PyUnicodeWriter_WriteCharInline(&writer, ch2) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004571 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004572 surrogate = 0;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01004573 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004574 }
4575 else {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004576 if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
Antoine Pitrou78edf752011-11-15 01:44:16 +01004577 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004578 surrogate = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004579 }
4580 }
Victor Stinner551ac952011-11-29 22:58:13 +01004581 if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004582 /* first surrogate */
4583 surrogate = outCh;
4584 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004585 else {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004586 if (_PyUnicodeWriter_WriteCharInline(&writer, outCh) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004587 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004588 }
4589 }
4590 }
4591 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004592 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004593 if (base64bits > 0) { /* left-over bits */
4594 if (base64bits >= 6) {
4595 /* We've seen at least one base-64 character */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004596 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004597 errmsg = "partial character in shift sequence";
4598 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004599 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004600 else {
4601 /* Some bits remain; they should be zero */
4602 if (base64buffer != 0) {
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004603 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004604 errmsg = "non-zero padding bits in shift sequence";
4605 goto utf7Error;
4606 }
4607 }
4608 }
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004609 if (surrogate && DECODE_DIRECT(ch)) {
4610 if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
4611 goto onError;
4612 }
4613 surrogate = 0;
4614 if (ch == '-') {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004615 /* '-' is absorbed; other terminating
4616 characters are preserved */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004617 s++;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004618 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004619 }
4620 }
4621 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004622 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004623 s++; /* consume '+' */
4624 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004625 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004626 if (_PyUnicodeWriter_WriteCharInline(&writer, '+') < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004627 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004628 }
Zackery Spytze349bf22018-08-18 22:43:38 -06004629 else if (s < e && !IS_BASE64(*s)) {
4630 s++;
4631 errmsg = "ill-formed sequence";
4632 goto utf7Error;
4633 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004634 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004635 inShift = 1;
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004636 surrogate = 0;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004637 shiftOutStart = writer.pos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004638 base64bits = 0;
Serhiy Storchaka35804e42013-10-19 20:38:19 +03004639 base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004640 }
4641 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004642 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004643 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004644 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004645 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004646 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004647 else {
4648 startinpos = s-starts;
4649 s++;
4650 errmsg = "unexpected special character";
4651 goto utf7Error;
4652 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004653 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004654utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004655 endinpos = s-starts;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004656 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00004657 errors, &errorHandler,
4658 "utf7", errmsg,
4659 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004660 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00004661 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004662 }
4663
Antoine Pitrou244651a2009-05-04 18:56:13 +00004664 /* end of string */
4665
4666 if (inShift && !consumed) { /* in shift sequence, no more to follow */
4667 /* if we're in an inconsistent state, that's an error */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004668 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004669 if (surrogate ||
4670 (base64bits >= 6) ||
4671 (base64bits > 0 && base64buffer != 0)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004672 endinpos = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004673 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrou244651a2009-05-04 18:56:13 +00004674 errors, &errorHandler,
4675 "utf7", "unterminated shift sequence",
4676 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004677 &writer))
Antoine Pitrou244651a2009-05-04 18:56:13 +00004678 goto onError;
4679 if (s < e)
4680 goto restart;
4681 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004682 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004683
4684 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004685 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004686 if (inShift) {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004687 *consumed = startinpos;
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004688 if (writer.pos != shiftOutStart && writer.maxchar > 127) {
Serhiy Storchaka016a3f32014-02-08 14:01:29 +02004689 PyObject *result = PyUnicode_FromKindAndData(
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004690 writer.kind, writer.data, shiftOutStart);
4691 Py_XDECREF(errorHandler);
4692 Py_XDECREF(exc);
4693 _PyUnicodeWriter_Dealloc(&writer);
4694 return result;
Serhiy Storchaka016a3f32014-02-08 14:01:29 +02004695 }
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004696 writer.pos = shiftOutStart; /* back off output */
Antoine Pitrou244651a2009-05-04 18:56:13 +00004697 }
4698 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004699 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004700 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004701 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004702
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004703 Py_XDECREF(errorHandler);
4704 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004705 return _PyUnicodeWriter_Finish(&writer);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004706
Benjamin Peterson29060642009-01-31 22:14:21 +00004707 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004708 Py_XDECREF(errorHandler);
4709 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004710 _PyUnicodeWriter_Dealloc(&writer);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004711 return NULL;
4712}
4713
4714
Alexander Belopolsky40018472011-02-26 01:02:56 +00004715PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004716_PyUnicode_EncodeUTF7(PyObject *str,
4717 int base64SetO,
4718 int base64WhiteSpace,
4719 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004720{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004721 int kind;
4722 void *data;
4723 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004724 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004725 int inShift = 0;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004726 Py_ssize_t i;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004727 unsigned int base64bits = 0;
4728 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004729 char * out;
4730 char * start;
4731
Benjamin Petersonbac79492012-01-14 13:34:47 -05004732 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004733 return NULL;
4734 kind = PyUnicode_KIND(str);
4735 data = PyUnicode_DATA(str);
4736 len = PyUnicode_GET_LENGTH(str);
4737
4738 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004739 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004740
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004741 /* It might be possible to tighten this worst case */
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004742 if (len > PY_SSIZE_T_MAX / 8)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004743 return PyErr_NoMemory();
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004744 v = PyBytes_FromStringAndSize(NULL, len * 8);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004745 if (v == NULL)
4746 return NULL;
4747
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004748 start = out = PyBytes_AS_STRING(v);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004749 for (i = 0; i < len; ++i) {
Victor Stinner0e368262011-11-10 20:12:49 +01004750 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004751
Antoine Pitrou244651a2009-05-04 18:56:13 +00004752 if (inShift) {
4753 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4754 /* shifting out */
4755 if (base64bits) { /* output remaining bits */
4756 *out++ = TO_BASE64(base64buffer << (6-base64bits));
4757 base64buffer = 0;
4758 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004759 }
4760 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004761 /* Characters not in the BASE64 set implicitly unshift the sequence
4762 so no '-' is required, except if the character is itself a '-' */
4763 if (IS_BASE64(ch) || ch == '-') {
4764 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004765 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004766 *out++ = (char) ch;
4767 }
4768 else {
4769 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00004770 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004771 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004772 else { /* not in a shift sequence */
4773 if (ch == '+') {
4774 *out++ = '+';
4775 *out++ = '-';
4776 }
4777 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4778 *out++ = (char) ch;
4779 }
4780 else {
4781 *out++ = '+';
4782 inShift = 1;
4783 goto encode_char;
4784 }
4785 }
4786 continue;
4787encode_char:
Antoine Pitrou244651a2009-05-04 18:56:13 +00004788 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01004789 assert(ch <= MAX_UNICODE);
Victor Stinner0d3721d2011-11-22 03:27:53 +01004790
Antoine Pitrou244651a2009-05-04 18:56:13 +00004791 /* code first surrogate */
4792 base64bits += 16;
Victor Stinner76df43d2012-10-30 01:42:39 +01004793 base64buffer = (base64buffer << 16) | Py_UNICODE_HIGH_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004794 while (base64bits >= 6) {
4795 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4796 base64bits -= 6;
4797 }
4798 /* prepare second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004799 ch = Py_UNICODE_LOW_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004800 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004801 base64bits += 16;
4802 base64buffer = (base64buffer << 16) | ch;
4803 while (base64bits >= 6) {
4804 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4805 base64bits -= 6;
4806 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00004807 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004808 if (base64bits)
4809 *out++= TO_BASE64(base64buffer << (6-base64bits) );
4810 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004811 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004812 if (_PyBytes_Resize(&v, out - start) < 0)
4813 return NULL;
4814 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004815}
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004816PyObject *
4817PyUnicode_EncodeUTF7(const Py_UNICODE *s,
4818 Py_ssize_t size,
4819 int base64SetO,
4820 int base64WhiteSpace,
4821 const char *errors)
4822{
4823 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02004824 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004825 if (tmp == NULL)
4826 return NULL;
Victor Stinner0e368262011-11-10 20:12:49 +01004827 result = _PyUnicode_EncodeUTF7(tmp, base64SetO,
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004828 base64WhiteSpace, errors);
4829 Py_DECREF(tmp);
4830 return result;
4831}
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004832
Antoine Pitrou244651a2009-05-04 18:56:13 +00004833#undef IS_BASE64
4834#undef FROM_BASE64
4835#undef TO_BASE64
4836#undef DECODE_DIRECT
4837#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004838
Guido van Rossumd57fd912000-03-10 22:53:23 +00004839/* --- UTF-8 Codec -------------------------------------------------------- */
4840
Alexander Belopolsky40018472011-02-26 01:02:56 +00004841PyObject *
4842PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004843 Py_ssize_t size,
4844 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004845{
Walter Dörwald69652032004-09-07 20:24:22 +00004846 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4847}
4848
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004849#include "stringlib/asciilib.h"
4850#include "stringlib/codecs.h"
4851#include "stringlib/undef.h"
4852
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004853#include "stringlib/ucs1lib.h"
4854#include "stringlib/codecs.h"
4855#include "stringlib/undef.h"
4856
4857#include "stringlib/ucs2lib.h"
4858#include "stringlib/codecs.h"
4859#include "stringlib/undef.h"
4860
4861#include "stringlib/ucs4lib.h"
4862#include "stringlib/codecs.h"
4863#include "stringlib/undef.h"
4864
Antoine Pitrouab868312009-01-10 15:40:25 +00004865/* Mask to quickly check whether a C 'long' contains a
4866 non-ASCII, UTF8-encoded char. */
4867#if (SIZEOF_LONG == 8)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004868# define ASCII_CHAR_MASK 0x8080808080808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004869#elif (SIZEOF_LONG == 4)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004870# define ASCII_CHAR_MASK 0x80808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004871#else
4872# error C 'long' size should be either 4 or 8!
4873#endif
4874
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004875static Py_ssize_t
4876ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004877{
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004878 const char *p = start;
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004879 const char *aligned_end = (const char *) _Py_ALIGN_DOWN(end, SIZEOF_LONG);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004880
Antoine Pitrou8b0e9842013-05-11 15:58:34 +02004881 /*
4882 * Issue #17237: m68k is a bit different from most architectures in
4883 * that objects do not use "natural alignment" - for example, int and
4884 * long are only aligned at 2-byte boundaries. Therefore the assert()
4885 * won't work; also, tests have shown that skipping the "optimised
4886 * version" will even speed up m68k.
4887 */
4888#if !defined(__m68k__)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004889#if SIZEOF_LONG <= SIZEOF_VOID_P
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004890 assert(_Py_IS_ALIGNED(dest, SIZEOF_LONG));
4891 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004892 /* Fast path, see in STRINGLIB(utf8_decode) for
4893 an explanation. */
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02004894 /* Help allocation */
4895 const char *_p = p;
4896 Py_UCS1 * q = dest;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004897 while (_p < aligned_end) {
4898 unsigned long value = *(const unsigned long *) _p;
4899 if (value & ASCII_CHAR_MASK)
Benjamin Peterson29060642009-01-31 22:14:21 +00004900 break;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004901 *((unsigned long *)q) = value;
4902 _p += SIZEOF_LONG;
4903 q += SIZEOF_LONG;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004904 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004905 p = _p;
4906 while (p < end) {
4907 if ((unsigned char)*p & 0x80)
4908 break;
4909 *q++ = *p++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004910 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004911 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004912 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004913#endif
Antoine Pitrou8b0e9842013-05-11 15:58:34 +02004914#endif
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004915 while (p < end) {
4916 /* Fast path, see in STRINGLIB(utf8_decode) in stringlib/codecs.h
4917 for an explanation. */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004918 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02004919 /* Help allocation */
4920 const char *_p = p;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004921 while (_p < aligned_end) {
4922 unsigned long value = *(unsigned long *) _p;
4923 if (value & ASCII_CHAR_MASK)
4924 break;
4925 _p += SIZEOF_LONG;
4926 }
4927 p = _p;
4928 if (_p == end)
4929 break;
4930 }
4931 if ((unsigned char)*p & 0x80)
4932 break;
4933 ++p;
4934 }
4935 memcpy(dest, start, p - start);
4936 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004937}
Antoine Pitrouab868312009-01-10 15:40:25 +00004938
Victor Stinner709d23d2019-05-02 14:56:30 -04004939static PyObject *
4940unicode_decode_utf8(const char *s, Py_ssize_t size,
4941 _Py_error_handler error_handler, const char *errors,
4942 Py_ssize_t *consumed)
Victor Stinner785938e2011-12-11 20:09:03 +01004943{
Victor Stinner785938e2011-12-11 20:09:03 +01004944 if (size == 0) {
4945 if (consumed)
4946 *consumed = 0;
Serhiy Storchaka678db842013-01-26 12:16:36 +02004947 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner785938e2011-12-11 20:09:03 +01004948 }
4949
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004950 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
4951 if (size == 1 && (unsigned char)s[0] < 128) {
Victor Stinner785938e2011-12-11 20:09:03 +01004952 if (consumed)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004953 *consumed = 1;
4954 return get_latin1_char((unsigned char)s[0]);
Victor Stinner785938e2011-12-11 20:09:03 +01004955 }
4956
Inada Naoki770847a2019-06-24 12:30:24 +09004957 const char *starts = s;
4958 const char *end = s + size;
Victor Stinner785938e2011-12-11 20:09:03 +01004959
Inada Naoki770847a2019-06-24 12:30:24 +09004960 // fast path: try ASCII string.
4961 PyObject *u = PyUnicode_New(size, 127);
4962 if (u == NULL) {
4963 return NULL;
4964 }
4965 s += ascii_decode(s, end, PyUnicode_DATA(u));
4966 if (s == end) {
4967 return u;
4968 }
4969
4970 // Use _PyUnicodeWriter after fast path is failed.
4971 _PyUnicodeWriter writer;
4972 _PyUnicodeWriter_InitWithBuffer(&writer, u);
4973 writer.pos = s - starts;
4974
4975 Py_ssize_t startinpos, endinpos;
4976 const char *errmsg = "";
4977 PyObject *error_handler_obj = NULL;
4978 PyObject *exc = NULL;
4979
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004980 while (s < end) {
4981 Py_UCS4 ch;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004982 int kind = writer.kind;
Victor Stinner1d65d912015-10-05 13:43:50 +02004983
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004984 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004985 if (PyUnicode_IS_ASCII(writer.buffer))
4986 ch = asciilib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004987 else
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004988 ch = ucs1lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004989 } else if (kind == PyUnicode_2BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004990 ch = ucs2lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004991 } else {
4992 assert(kind == PyUnicode_4BYTE_KIND);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004993 ch = ucs4lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004994 }
4995
4996 switch (ch) {
4997 case 0:
4998 if (s == end || consumed)
4999 goto End;
5000 errmsg = "unexpected end of data";
5001 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02005002 endinpos = end - starts;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005003 break;
5004 case 1:
5005 errmsg = "invalid start byte";
5006 startinpos = s - starts;
5007 endinpos = startinpos + 1;
5008 break;
5009 case 2:
Serhiy Storchaka894263b2019-06-25 11:54:18 +03005010 if (consumed && (unsigned char)s[0] == 0xED && end - s == 2
5011 && (unsigned char)s[1] >= 0xA0 && (unsigned char)s[1] <= 0xBF)
5012 {
5013 /* Truncated surrogate code in range D800-DFFF */
Serhiy Storchaka7a465cb2019-03-30 08:23:38 +02005014 goto End;
5015 }
Serhiy Storchaka894263b2019-06-25 11:54:18 +03005016 /* fall through */
5017 case 3:
5018 case 4:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005019 errmsg = "invalid continuation byte";
5020 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02005021 endinpos = startinpos + ch - 1;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005022 break;
5023 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005024 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005025 goto onError;
5026 continue;
5027 }
5028
Victor Stinner1d65d912015-10-05 13:43:50 +02005029 if (error_handler == _Py_ERROR_UNKNOWN)
Victor Stinner3d4226a2018-08-29 22:21:32 +02005030 error_handler = _Py_GetErrorHandler(errors);
Victor Stinner1d65d912015-10-05 13:43:50 +02005031
5032 switch (error_handler) {
5033 case _Py_ERROR_IGNORE:
5034 s += (endinpos - startinpos);
5035 break;
5036
5037 case _Py_ERROR_REPLACE:
5038 if (_PyUnicodeWriter_WriteCharInline(&writer, 0xfffd) < 0)
5039 goto onError;
5040 s += (endinpos - startinpos);
5041 break;
5042
5043 case _Py_ERROR_SURROGATEESCAPE:
Victor Stinner74e8fac2015-10-05 13:49:26 +02005044 {
5045 Py_ssize_t i;
5046
Victor Stinner1d65d912015-10-05 13:43:50 +02005047 if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
5048 goto onError;
Victor Stinner74e8fac2015-10-05 13:49:26 +02005049 for (i=startinpos; i<endinpos; i++) {
Victor Stinner1d65d912015-10-05 13:43:50 +02005050 ch = (Py_UCS4)(unsigned char)(starts[i]);
5051 PyUnicode_WRITE(writer.kind, writer.data, writer.pos,
5052 ch + 0xdc00);
5053 writer.pos++;
5054 }
5055 s += (endinpos - startinpos);
5056 break;
Victor Stinner74e8fac2015-10-05 13:49:26 +02005057 }
Victor Stinner1d65d912015-10-05 13:43:50 +02005058
5059 default:
5060 if (unicode_decode_call_errorhandler_writer(
5061 errors, &error_handler_obj,
5062 "utf-8", errmsg,
5063 &starts, &end, &startinpos, &endinpos, &exc, &s,
5064 &writer))
5065 goto onError;
5066 }
Victor Stinner785938e2011-12-11 20:09:03 +01005067 }
5068
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005069End:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005070 if (consumed)
5071 *consumed = s - starts;
5072
Victor Stinner1d65d912015-10-05 13:43:50 +02005073 Py_XDECREF(error_handler_obj);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005074 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005075 return _PyUnicodeWriter_Finish(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005076
5077onError:
Victor Stinner1d65d912015-10-05 13:43:50 +02005078 Py_XDECREF(error_handler_obj);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005079 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005080 _PyUnicodeWriter_Dealloc(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005081 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01005082}
5083
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005084
Victor Stinner709d23d2019-05-02 14:56:30 -04005085PyObject *
5086PyUnicode_DecodeUTF8Stateful(const char *s,
5087 Py_ssize_t size,
5088 const char *errors,
5089 Py_ssize_t *consumed)
5090{
5091 return unicode_decode_utf8(s, size, _Py_ERROR_UNKNOWN, errors, consumed);
5092}
5093
5094
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005095/* UTF-8 decoder: use surrogateescape error handler if 'surrogateescape' is
5096 non-zero, use strict error handler otherwise.
Victor Stinner0d92c4f2012-11-12 23:32:21 +01005097
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005098 On success, write a pointer to a newly allocated wide character string into
5099 *wstr (use PyMem_RawFree() to free the memory) and write the output length
5100 (in number of wchar_t units) into *wlen (if wlen is set).
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005101
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005102 On memory allocation failure, return -1.
5103
5104 On decoding error (if surrogateescape is zero), return -2. If wlen is
5105 non-NULL, write the start of the illegal byte sequence into *wlen. If reason
5106 is not NULL, write the decoding error message into *reason. */
5107int
5108_Py_DecodeUTF8Ex(const char *s, Py_ssize_t size, wchar_t **wstr, size_t *wlen,
Victor Stinner3d4226a2018-08-29 22:21:32 +02005109 const char **reason, _Py_error_handler errors)
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005110{
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005111 const char *orig_s = s;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005112 const char *e;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005113 wchar_t *unicode;
5114 Py_ssize_t outpos;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005115
Victor Stinner3d4226a2018-08-29 22:21:32 +02005116 int surrogateescape = 0;
5117 int surrogatepass = 0;
5118 switch (errors)
5119 {
5120 case _Py_ERROR_STRICT:
5121 break;
5122 case _Py_ERROR_SURROGATEESCAPE:
5123 surrogateescape = 1;
5124 break;
5125 case _Py_ERROR_SURROGATEPASS:
5126 surrogatepass = 1;
5127 break;
5128 default:
5129 return -3;
5130 }
5131
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005132 /* Note: size will always be longer than the resulting Unicode
5133 character count */
Victor Stinner91106cd2017-12-13 12:29:09 +01005134 if (PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) < (size + 1)) {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005135 return -1;
Victor Stinner91106cd2017-12-13 12:29:09 +01005136 }
5137
Victor Stinner6f8eeee2013-07-07 22:57:45 +02005138 unicode = PyMem_RawMalloc((size + 1) * sizeof(wchar_t));
Victor Stinner91106cd2017-12-13 12:29:09 +01005139 if (!unicode) {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005140 return -1;
Victor Stinner91106cd2017-12-13 12:29:09 +01005141 }
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005142
5143 /* Unpack UTF-8 encoded data */
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005144 e = s + size;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005145 outpos = 0;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005146 while (s < e) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005147 Py_UCS4 ch;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005148#if SIZEOF_WCHAR_T == 4
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005149 ch = ucs4lib_utf8_decode(&s, e, (Py_UCS4 *)unicode, &outpos);
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005150#else
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005151 ch = ucs2lib_utf8_decode(&s, e, (Py_UCS2 *)unicode, &outpos);
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005152#endif
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005153 if (ch > 0xFF) {
5154#if SIZEOF_WCHAR_T == 4
Barry Warsawb2e57942017-09-14 18:13:16 -07005155 Py_UNREACHABLE();
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005156#else
Serhiy Storchakab6266432016-11-12 14:28:06 +02005157 assert(ch > 0xFFFF && ch <= MAX_UNICODE);
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005158 /* write a surrogate pair */
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005159 unicode[outpos++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
5160 unicode[outpos++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
5161#endif
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005162 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005163 else {
Victor Stinner3d4226a2018-08-29 22:21:32 +02005164 if (!ch && s == e) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005165 break;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005166 }
Victor Stinner3d4226a2018-08-29 22:21:32 +02005167
5168 if (surrogateescape) {
5169 unicode[outpos++] = 0xDC00 + (unsigned char)*s++;
5170 }
5171 else {
5172 /* Is it a valid three-byte code? */
5173 if (surrogatepass
5174 && (e - s) >= 3
5175 && (s[0] & 0xf0) == 0xe0
5176 && (s[1] & 0xc0) == 0x80
5177 && (s[2] & 0xc0) == 0x80)
5178 {
5179 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
5180 s += 3;
5181 unicode[outpos++] = ch;
5182 }
5183 else {
5184 PyMem_RawFree(unicode );
5185 if (reason != NULL) {
5186 switch (ch) {
5187 case 0:
5188 *reason = "unexpected end of data";
5189 break;
5190 case 1:
5191 *reason = "invalid start byte";
5192 break;
5193 /* 2, 3, 4 */
5194 default:
5195 *reason = "invalid continuation byte";
5196 break;
5197 }
5198 }
5199 if (wlen != NULL) {
5200 *wlen = s - orig_s;
5201 }
5202 return -2;
5203 }
5204 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005205 }
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005206 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005207 unicode[outpos] = L'\0';
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005208 if (wlen) {
5209 *wlen = outpos;
Victor Stinner91106cd2017-12-13 12:29:09 +01005210 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005211 *wstr = unicode;
5212 return 0;
5213}
5214
Victor Stinner5f9cf232019-03-19 01:46:25 +01005215
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005216wchar_t*
Victor Stinner5f9cf232019-03-19 01:46:25 +01005217_Py_DecodeUTF8_surrogateescape(const char *arg, Py_ssize_t arglen,
5218 size_t *wlen)
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005219{
5220 wchar_t *wstr;
Victor Stinner5f9cf232019-03-19 01:46:25 +01005221 int res = _Py_DecodeUTF8Ex(arg, arglen,
5222 &wstr, wlen,
5223 NULL, _Py_ERROR_SURROGATEESCAPE);
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005224 if (res != 0) {
Victor Stinner5f9cf232019-03-19 01:46:25 +01005225 /* _Py_DecodeUTF8Ex() must support _Py_ERROR_SURROGATEESCAPE */
5226 assert(res != -3);
5227 if (wlen) {
5228 *wlen = (size_t)res;
5229 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005230 return NULL;
5231 }
5232 return wstr;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005233}
5234
Antoine Pitrouab868312009-01-10 15:40:25 +00005235
Victor Stinnere47e6982017-12-21 15:45:16 +01005236/* UTF-8 encoder using the surrogateescape error handler .
5237
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005238 On success, return 0 and write the newly allocated character string (use
5239 PyMem_Free() to free the memory) into *str.
Victor Stinnere47e6982017-12-21 15:45:16 +01005240
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005241 On encoding failure, return -2 and write the position of the invalid
5242 surrogate character into *error_pos (if error_pos is set) and the decoding
5243 error message into *reason (if reason is set).
Victor Stinnere47e6982017-12-21 15:45:16 +01005244
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005245 On memory allocation failure, return -1. */
5246int
5247_Py_EncodeUTF8Ex(const wchar_t *text, char **str, size_t *error_pos,
Victor Stinner3d4226a2018-08-29 22:21:32 +02005248 const char **reason, int raw_malloc, _Py_error_handler errors)
Victor Stinnere47e6982017-12-21 15:45:16 +01005249{
5250 const Py_ssize_t max_char_size = 4;
5251 Py_ssize_t len = wcslen(text);
5252
5253 assert(len >= 0);
5254
Victor Stinner3d4226a2018-08-29 22:21:32 +02005255 int surrogateescape = 0;
5256 int surrogatepass = 0;
5257 switch (errors)
5258 {
5259 case _Py_ERROR_STRICT:
5260 break;
5261 case _Py_ERROR_SURROGATEESCAPE:
5262 surrogateescape = 1;
5263 break;
5264 case _Py_ERROR_SURROGATEPASS:
5265 surrogatepass = 1;
5266 break;
5267 default:
5268 return -3;
5269 }
5270
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005271 if (len > PY_SSIZE_T_MAX / max_char_size - 1) {
5272 return -1;
5273 }
Victor Stinnere47e6982017-12-21 15:45:16 +01005274 char *bytes;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005275 if (raw_malloc) {
5276 bytes = PyMem_RawMalloc((len + 1) * max_char_size);
Victor Stinnere47e6982017-12-21 15:45:16 +01005277 }
5278 else {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005279 bytes = PyMem_Malloc((len + 1) * max_char_size);
Victor Stinnere47e6982017-12-21 15:45:16 +01005280 }
5281 if (bytes == NULL) {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005282 return -1;
Victor Stinnere47e6982017-12-21 15:45:16 +01005283 }
5284
5285 char *p = bytes;
5286 Py_ssize_t i;
Victor Stinner3d4226a2018-08-29 22:21:32 +02005287 for (i = 0; i < len; ) {
5288 Py_ssize_t ch_pos = i;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005289 Py_UCS4 ch = text[i];
Victor Stinner3d4226a2018-08-29 22:21:32 +02005290 i++;
5291#if Py_UNICODE_SIZE == 2
5292 if (Py_UNICODE_IS_HIGH_SURROGATE(ch)
5293 && i < len
5294 && Py_UNICODE_IS_LOW_SURROGATE(text[i]))
5295 {
5296 ch = Py_UNICODE_JOIN_SURROGATES(ch, text[i]);
5297 i++;
5298 }
5299#endif
Victor Stinnere47e6982017-12-21 15:45:16 +01005300
5301 if (ch < 0x80) {
5302 /* Encode ASCII */
5303 *p++ = (char) ch;
5304
5305 }
5306 else if (ch < 0x0800) {
5307 /* Encode Latin-1 */
5308 *p++ = (char)(0xc0 | (ch >> 6));
5309 *p++ = (char)(0x80 | (ch & 0x3f));
5310 }
Victor Stinner3d4226a2018-08-29 22:21:32 +02005311 else if (Py_UNICODE_IS_SURROGATE(ch) && !surrogatepass) {
Victor Stinnere47e6982017-12-21 15:45:16 +01005312 /* surrogateescape error handler */
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005313 if (!surrogateescape || !(0xDC80 <= ch && ch <= 0xDCFF)) {
Victor Stinnere47e6982017-12-21 15:45:16 +01005314 if (error_pos != NULL) {
Victor Stinner3d4226a2018-08-29 22:21:32 +02005315 *error_pos = (size_t)ch_pos;
Victor Stinnere47e6982017-12-21 15:45:16 +01005316 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005317 if (reason != NULL) {
5318 *reason = "encoding error";
5319 }
5320 if (raw_malloc) {
5321 PyMem_RawFree(bytes);
5322 }
5323 else {
5324 PyMem_Free(bytes);
5325 }
5326 return -2;
Victor Stinnere47e6982017-12-21 15:45:16 +01005327 }
5328 *p++ = (char)(ch & 0xff);
5329 }
5330 else if (ch < 0x10000) {
5331 *p++ = (char)(0xe0 | (ch >> 12));
5332 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
5333 *p++ = (char)(0x80 | (ch & 0x3f));
5334 }
5335 else { /* ch >= 0x10000 */
5336 assert(ch <= MAX_UNICODE);
5337 /* Encode UCS4 Unicode ordinals */
5338 *p++ = (char)(0xf0 | (ch >> 18));
5339 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
5340 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
5341 *p++ = (char)(0x80 | (ch & 0x3f));
5342 }
5343 }
5344 *p++ = '\0';
5345
5346 size_t final_size = (p - bytes);
Victor Stinner9dd76202017-12-21 16:20:32 +01005347 char *bytes2;
5348 if (raw_malloc) {
5349 bytes2 = PyMem_RawRealloc(bytes, final_size);
5350 }
5351 else {
5352 bytes2 = PyMem_Realloc(bytes, final_size);
5353 }
Victor Stinnere47e6982017-12-21 15:45:16 +01005354 if (bytes2 == NULL) {
5355 if (error_pos != NULL) {
5356 *error_pos = (size_t)-1;
5357 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005358 if (raw_malloc) {
5359 PyMem_RawFree(bytes);
5360 }
5361 else {
5362 PyMem_Free(bytes);
5363 }
5364 return -1;
Victor Stinnere47e6982017-12-21 15:45:16 +01005365 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005366 *str = bytes2;
5367 return 0;
Victor Stinnere47e6982017-12-21 15:45:16 +01005368}
5369
5370
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005371/* Primary internal function which creates utf8 encoded bytes objects.
5372
5373 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00005374 and allocate exactly as much space needed at the end. Else allocate the
5375 maximum possible needed (4 result bytes per Unicode character), and return
5376 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00005377*/
Victor Stinner709d23d2019-05-02 14:56:30 -04005378static PyObject *
5379unicode_encode_utf8(PyObject *unicode, _Py_error_handler error_handler,
5380 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005381{
Victor Stinner6099a032011-12-18 14:22:26 +01005382 enum PyUnicode_Kind kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005383 void *data;
5384 Py_ssize_t size;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00005385
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005386 if (!PyUnicode_Check(unicode)) {
5387 PyErr_BadArgument();
5388 return NULL;
5389 }
5390
5391 if (PyUnicode_READY(unicode) == -1)
5392 return NULL;
5393
Victor Stinnere90fe6a2011-10-01 16:48:13 +02005394 if (PyUnicode_UTF8(unicode))
5395 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
5396 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005397
5398 kind = PyUnicode_KIND(unicode);
5399 data = PyUnicode_DATA(unicode);
5400 size = PyUnicode_GET_LENGTH(unicode);
5401
Benjamin Petersonead6b532011-12-20 17:23:42 -06005402 switch (kind) {
Victor Stinner6099a032011-12-18 14:22:26 +01005403 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07005404 Py_UNREACHABLE();
Victor Stinner6099a032011-12-18 14:22:26 +01005405 case PyUnicode_1BYTE_KIND:
5406 /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
5407 assert(!PyUnicode_IS_ASCII(unicode));
Victor Stinner709d23d2019-05-02 14:56:30 -04005408 return ucs1lib_utf8_encoder(unicode, data, size, error_handler, errors);
Victor Stinner6099a032011-12-18 14:22:26 +01005409 case PyUnicode_2BYTE_KIND:
Victor Stinner709d23d2019-05-02 14:56:30 -04005410 return ucs2lib_utf8_encoder(unicode, data, size, error_handler, errors);
Victor Stinner6099a032011-12-18 14:22:26 +01005411 case PyUnicode_4BYTE_KIND:
Victor Stinner709d23d2019-05-02 14:56:30 -04005412 return ucs4lib_utf8_encoder(unicode, data, size, error_handler, errors);
Tim Peters602f7402002-04-27 18:03:26 +00005413 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005414}
5415
Alexander Belopolsky40018472011-02-26 01:02:56 +00005416PyObject *
Victor Stinner709d23d2019-05-02 14:56:30 -04005417_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
5418{
5419 return unicode_encode_utf8(unicode, _Py_ERROR_UNKNOWN, errors);
5420}
5421
5422
5423PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005424PyUnicode_EncodeUTF8(const Py_UNICODE *s,
5425 Py_ssize_t size,
5426 const char *errors)
5427{
5428 PyObject *v, *unicode;
5429
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02005430 unicode = PyUnicode_FromWideChar(s, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005431 if (unicode == NULL)
5432 return NULL;
5433 v = _PyUnicode_AsUTF8String(unicode, errors);
5434 Py_DECREF(unicode);
5435 return v;
5436}
5437
5438PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005439PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005440{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005441 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005442}
5443
Walter Dörwald41980ca2007-08-16 21:55:45 +00005444/* --- UTF-32 Codec ------------------------------------------------------- */
5445
5446PyObject *
5447PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005448 Py_ssize_t size,
5449 const char *errors,
5450 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005451{
5452 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
5453}
5454
5455PyObject *
5456PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005457 Py_ssize_t size,
5458 const char *errors,
5459 int *byteorder,
5460 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005461{
5462 const char *starts = s;
5463 Py_ssize_t startinpos;
5464 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005465 _PyUnicodeWriter writer;
Mark Dickinson7db923c2010-06-12 09:10:14 +00005466 const unsigned char *q, *e;
Victor Stinnere64322e2012-10-30 23:12:47 +01005467 int le, bo = 0; /* assume native ordering by default */
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005468 const char *encoding;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005469 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00005470 PyObject *errorHandler = NULL;
5471 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00005472
Walter Dörwald41980ca2007-08-16 21:55:45 +00005473 q = (unsigned char *)s;
5474 e = q + size;
5475
5476 if (byteorder)
5477 bo = *byteorder;
5478
5479 /* Check for BOM marks (U+FEFF) in the input and adjust current
5480 byte order setting accordingly. In native mode, the leading BOM
5481 mark is skipped, in all other modes, it is copied to the output
5482 stream as-is (giving a ZWNBSP character). */
Victor Stinnere64322e2012-10-30 23:12:47 +01005483 if (bo == 0 && size >= 4) {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005484 Py_UCS4 bom = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
Victor Stinnere64322e2012-10-30 23:12:47 +01005485 if (bom == 0x0000FEFF) {
5486 bo = -1;
5487 q += 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00005488 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005489 else if (bom == 0xFFFE0000) {
5490 bo = 1;
5491 q += 4;
5492 }
5493 if (byteorder)
5494 *byteorder = bo;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005495 }
5496
Victor Stinnere64322e2012-10-30 23:12:47 +01005497 if (q == e) {
5498 if (consumed)
5499 *consumed = size;
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02005500 _Py_RETURN_UNICODE_EMPTY();
Walter Dörwald41980ca2007-08-16 21:55:45 +00005501 }
5502
Victor Stinnere64322e2012-10-30 23:12:47 +01005503#ifdef WORDS_BIGENDIAN
5504 le = bo < 0;
5505#else
5506 le = bo <= 0;
5507#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005508 encoding = le ? "utf-32-le" : "utf-32-be";
Victor Stinnere64322e2012-10-30 23:12:47 +01005509
Victor Stinner8f674cc2013-04-17 23:02:17 +02005510 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02005511 writer.min_length = (e - q + 3) / 4;
5512 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005513 goto onError;
Victor Stinnere64322e2012-10-30 23:12:47 +01005514
Victor Stinnere64322e2012-10-30 23:12:47 +01005515 while (1) {
5516 Py_UCS4 ch = 0;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005517 Py_UCS4 maxch = PyUnicode_MAX_CHAR_VALUE(writer.buffer);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005518
Victor Stinnere64322e2012-10-30 23:12:47 +01005519 if (e - q >= 4) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005520 enum PyUnicode_Kind kind = writer.kind;
5521 void *data = writer.data;
Victor Stinnere64322e2012-10-30 23:12:47 +01005522 const unsigned char *last = e - 4;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005523 Py_ssize_t pos = writer.pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01005524 if (le) {
5525 do {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005526 ch = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
Victor Stinnere64322e2012-10-30 23:12:47 +01005527 if (ch > maxch)
5528 break;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005529 if (kind != PyUnicode_1BYTE_KIND &&
5530 Py_UNICODE_IS_SURROGATE(ch))
5531 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005532 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01005533 q += 4;
5534 } while (q <= last);
5535 }
5536 else {
5537 do {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005538 ch = ((unsigned int)q[0] << 24) | (q[1] << 16) | (q[2] << 8) | q[3];
Victor Stinnere64322e2012-10-30 23:12:47 +01005539 if (ch > maxch)
5540 break;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005541 if (kind != PyUnicode_1BYTE_KIND &&
5542 Py_UNICODE_IS_SURROGATE(ch))
5543 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005544 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01005545 q += 4;
5546 } while (q <= last);
5547 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005548 writer.pos = pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01005549 }
5550
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005551 if (Py_UNICODE_IS_SURROGATE(ch)) {
Serhiy Storchakad3faf432015-01-18 11:28:37 +02005552 errmsg = "code point in surrogate code point range(0xd800, 0xe000)";
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005553 startinpos = ((const char *)q) - starts;
5554 endinpos = startinpos + 4;
5555 }
5556 else if (ch <= maxch) {
Victor Stinnere64322e2012-10-30 23:12:47 +01005557 if (q == e || consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005558 break;
Victor Stinnere64322e2012-10-30 23:12:47 +01005559 /* remaining bytes at the end? (size should be divisible by 4) */
Benjamin Peterson29060642009-01-31 22:14:21 +00005560 errmsg = "truncated data";
Victor Stinnere64322e2012-10-30 23:12:47 +01005561 startinpos = ((const char *)q) - starts;
5562 endinpos = ((const char *)e) - starts;
Benjamin Peterson29060642009-01-31 22:14:21 +00005563 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005564 else {
5565 if (ch < 0x110000) {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005566 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinnere64322e2012-10-30 23:12:47 +01005567 goto onError;
5568 q += 4;
5569 continue;
5570 }
Serhiy Storchakad3faf432015-01-18 11:28:37 +02005571 errmsg = "code point not in range(0x110000)";
Victor Stinnere64322e2012-10-30 23:12:47 +01005572 startinpos = ((const char *)q) - starts;
5573 endinpos = startinpos + 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00005574 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005575
5576 /* The remaining input chars are ignored if the callback
5577 chooses to skip the input */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005578 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00005579 errors, &errorHandler,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005580 encoding, errmsg,
Benjamin Peterson29060642009-01-31 22:14:21 +00005581 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005582 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005583 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005584 }
5585
Walter Dörwald41980ca2007-08-16 21:55:45 +00005586 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005587 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005588
Walter Dörwald41980ca2007-08-16 21:55:45 +00005589 Py_XDECREF(errorHandler);
5590 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005591 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005592
Benjamin Peterson29060642009-01-31 22:14:21 +00005593 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005594 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005595 Py_XDECREF(errorHandler);
5596 Py_XDECREF(exc);
5597 return NULL;
5598}
5599
5600PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005601_PyUnicode_EncodeUTF32(PyObject *str,
5602 const char *errors,
5603 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005604{
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005605 enum PyUnicode_Kind kind;
5606 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005607 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005608 PyObject *v;
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005609 uint32_t *out;
Christian Heimes743e0cd2012-10-17 23:52:17 +02005610#if PY_LITTLE_ENDIAN
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005611 int native_ordering = byteorder <= 0;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005612#else
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005613 int native_ordering = byteorder >= 0;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005614#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005615 const char *encoding;
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005616 Py_ssize_t nsize, pos;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005617 PyObject *errorHandler = NULL;
5618 PyObject *exc = NULL;
5619 PyObject *rep = NULL;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005620
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005621 if (!PyUnicode_Check(str)) {
5622 PyErr_BadArgument();
5623 return NULL;
5624 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005625 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005626 return NULL;
5627 kind = PyUnicode_KIND(str);
5628 data = PyUnicode_DATA(str);
5629 len = PyUnicode_GET_LENGTH(str);
5630
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005631 if (len > PY_SSIZE_T_MAX / 4 - (byteorder == 0))
Serhiy Storchaka30793282014-01-04 22:44:01 +02005632 return PyErr_NoMemory();
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005633 nsize = len + (byteorder == 0);
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01005634 v = PyBytes_FromStringAndSize(NULL, nsize * 4);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005635 if (v == NULL)
5636 return NULL;
5637
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005638 /* output buffer is 4-bytes aligned */
5639 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 4));
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005640 out = (uint32_t *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005641 if (byteorder == 0)
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005642 *out++ = 0xFEFF;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005643 if (len == 0)
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005644 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005645
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005646 if (byteorder == -1)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005647 encoding = "utf-32-le";
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005648 else if (byteorder == 1)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005649 encoding = "utf-32-be";
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005650 else
5651 encoding = "utf-32";
5652
5653 if (kind == PyUnicode_1BYTE_KIND) {
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005654 ucs1lib_utf32_encode((const Py_UCS1 *)data, len, &out, native_ordering);
5655 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005656 }
5657
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005658 pos = 0;
5659 while (pos < len) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005660 Py_ssize_t repsize, moreunits;
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005661
5662 if (kind == PyUnicode_2BYTE_KIND) {
5663 pos += ucs2lib_utf32_encode((const Py_UCS2 *)data + pos, len - pos,
5664 &out, native_ordering);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005665 }
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005666 else {
5667 assert(kind == PyUnicode_4BYTE_KIND);
5668 pos += ucs4lib_utf32_encode((const Py_UCS4 *)data + pos, len - pos,
5669 &out, native_ordering);
5670 }
5671 if (pos == len)
5672 break;
Guido van Rossum98297ee2007-11-06 21:34:58 +00005673
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005674 rep = unicode_encode_call_errorhandler(
5675 errors, &errorHandler,
5676 encoding, "surrogates not allowed",
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005677 str, &exc, pos, pos + 1, &pos);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005678 if (!rep)
5679 goto error;
5680
5681 if (PyBytes_Check(rep)) {
5682 repsize = PyBytes_GET_SIZE(rep);
5683 if (repsize & 3) {
5684 raise_encode_exception(&exc, encoding,
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005685 str, pos - 1, pos,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005686 "surrogates not allowed");
5687 goto error;
5688 }
5689 moreunits = repsize / 4;
5690 }
5691 else {
5692 assert(PyUnicode_Check(rep));
5693 if (PyUnicode_READY(rep) < 0)
5694 goto error;
5695 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5696 if (!PyUnicode_IS_ASCII(rep)) {
5697 raise_encode_exception(&exc, encoding,
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005698 str, pos - 1, pos,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005699 "surrogates not allowed");
5700 goto error;
5701 }
5702 }
5703
5704 /* four bytes are reserved for each surrogate */
5705 if (moreunits > 1) {
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005706 Py_ssize_t outpos = out - (uint32_t*) PyBytes_AS_STRING(v);
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03005707 if (moreunits >= (PY_SSIZE_T_MAX - PyBytes_GET_SIZE(v)) / 4) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005708 /* integer overflow */
5709 PyErr_NoMemory();
5710 goto error;
5711 }
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03005712 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + 4 * (moreunits - 1)) < 0)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005713 goto error;
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005714 out = (uint32_t*) PyBytes_AS_STRING(v) + outpos;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005715 }
5716
5717 if (PyBytes_Check(rep)) {
Christian Heimesf051e432016-09-13 20:22:02 +02005718 memcpy(out, PyBytes_AS_STRING(rep), repsize);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005719 out += moreunits;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005720 } else /* rep is unicode */ {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005721 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005722 ucs1lib_utf32_encode(PyUnicode_1BYTE_DATA(rep), repsize,
5723 &out, native_ordering);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005724 }
5725
5726 Py_CLEAR(rep);
5727 }
5728
5729 /* Cut back to size actually needed. This is necessary for, for example,
5730 encoding of a string containing isolated surrogates and the 'ignore'
5731 handler is used. */
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005732 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005733 if (nsize != PyBytes_GET_SIZE(v))
5734 _PyBytes_Resize(&v, nsize);
5735 Py_XDECREF(errorHandler);
5736 Py_XDECREF(exc);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005737 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005738 return v;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005739 error:
5740 Py_XDECREF(rep);
5741 Py_XDECREF(errorHandler);
5742 Py_XDECREF(exc);
5743 Py_XDECREF(v);
5744 return NULL;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005745}
5746
Alexander Belopolsky40018472011-02-26 01:02:56 +00005747PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005748PyUnicode_EncodeUTF32(const Py_UNICODE *s,
5749 Py_ssize_t size,
5750 const char *errors,
5751 int byteorder)
5752{
5753 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02005754 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005755 if (tmp == NULL)
5756 return NULL;
5757 result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder);
5758 Py_DECREF(tmp);
5759 return result;
5760}
5761
5762PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005763PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005764{
Victor Stinnerb960b342011-11-20 19:12:52 +01005765 return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005766}
5767
Guido van Rossumd57fd912000-03-10 22:53:23 +00005768/* --- UTF-16 Codec ------------------------------------------------------- */
5769
Tim Peters772747b2001-08-09 22:21:55 +00005770PyObject *
5771PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005772 Py_ssize_t size,
5773 const char *errors,
5774 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005775{
Walter Dörwald69652032004-09-07 20:24:22 +00005776 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5777}
5778
5779PyObject *
5780PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005781 Py_ssize_t size,
5782 const char *errors,
5783 int *byteorder,
5784 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00005785{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005786 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005787 Py_ssize_t startinpos;
5788 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005789 _PyUnicodeWriter writer;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005790 const unsigned char *q, *e;
Tim Peters772747b2001-08-09 22:21:55 +00005791 int bo = 0; /* assume native ordering by default */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005792 int native_ordering;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00005793 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005794 PyObject *errorHandler = NULL;
5795 PyObject *exc = NULL;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005796 const char *encoding;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005797
Tim Peters772747b2001-08-09 22:21:55 +00005798 q = (unsigned char *)s;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005799 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005800
5801 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00005802 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005803
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005804 /* Check for BOM marks (U+FEFF) in the input and adjust current
5805 byte order setting accordingly. In native mode, the leading BOM
5806 mark is skipped, in all other modes, it is copied to the output
5807 stream as-is (giving a ZWNBSP character). */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005808 if (bo == 0 && size >= 2) {
5809 const Py_UCS4 bom = (q[1] << 8) | q[0];
5810 if (bom == 0xFEFF) {
5811 q += 2;
5812 bo = -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00005813 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005814 else if (bom == 0xFFFE) {
5815 q += 2;
5816 bo = 1;
5817 }
5818 if (byteorder)
5819 *byteorder = bo;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005820 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005821
Antoine Pitrou63065d72012-05-15 23:48:04 +02005822 if (q == e) {
5823 if (consumed)
5824 *consumed = size;
Serhiy Storchaka678db842013-01-26 12:16:36 +02005825 _Py_RETURN_UNICODE_EMPTY();
Tim Peters772747b2001-08-09 22:21:55 +00005826 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005827
Christian Heimes743e0cd2012-10-17 23:52:17 +02005828#if PY_LITTLE_ENDIAN
Antoine Pitrou63065d72012-05-15 23:48:04 +02005829 native_ordering = bo <= 0;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005830 encoding = bo <= 0 ? "utf-16-le" : "utf-16-be";
Antoine Pitrouab868312009-01-10 15:40:25 +00005831#else
Antoine Pitrou63065d72012-05-15 23:48:04 +02005832 native_ordering = bo >= 0;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005833 encoding = bo >= 0 ? "utf-16-be" : "utf-16-le";
Antoine Pitrouab868312009-01-10 15:40:25 +00005834#endif
Tim Peters772747b2001-08-09 22:21:55 +00005835
Antoine Pitrou63065d72012-05-15 23:48:04 +02005836 /* Note: size will always be longer than the resulting Unicode
Xiang Zhang2c7fd462018-01-31 20:48:05 +08005837 character count normally. Error handler will take care of
5838 resizing when needed. */
Victor Stinner8f674cc2013-04-17 23:02:17 +02005839 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02005840 writer.min_length = (e - q + 1) / 2;
5841 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005842 goto onError;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005843
Antoine Pitrou63065d72012-05-15 23:48:04 +02005844 while (1) {
5845 Py_UCS4 ch = 0;
5846 if (e - q >= 2) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005847 int kind = writer.kind;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005848 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005849 if (PyUnicode_IS_ASCII(writer.buffer))
Antoine Pitrou63065d72012-05-15 23:48:04 +02005850 ch = asciilib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005851 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005852 native_ordering);
5853 else
5854 ch = ucs1lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005855 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005856 native_ordering);
5857 } else if (kind == PyUnicode_2BYTE_KIND) {
5858 ch = ucs2lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005859 (Py_UCS2*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005860 native_ordering);
5861 } else {
5862 assert(kind == PyUnicode_4BYTE_KIND);
5863 ch = ucs4lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005864 (Py_UCS4*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005865 native_ordering);
Antoine Pitrouab868312009-01-10 15:40:25 +00005866 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005867 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005868
Antoine Pitrou63065d72012-05-15 23:48:04 +02005869 switch (ch)
5870 {
5871 case 0:
5872 /* remaining byte at the end? (size should be even) */
5873 if (q == e || consumed)
5874 goto End;
5875 errmsg = "truncated data";
5876 startinpos = ((const char *)q) - starts;
5877 endinpos = ((const char *)e) - starts;
5878 break;
5879 /* The remaining input chars are ignored if the callback
5880 chooses to skip the input */
5881 case 1:
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02005882 q -= 2;
5883 if (consumed)
Serhiy Storchakaae3b32a2013-01-08 23:40:52 +02005884 goto End;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005885 errmsg = "unexpected end of data";
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02005886 startinpos = ((const char *)q) - starts;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005887 endinpos = ((const char *)e) - starts;
5888 break;
5889 case 2:
5890 errmsg = "illegal encoding";
5891 startinpos = ((const char *)q) - 2 - starts;
5892 endinpos = startinpos + 2;
5893 break;
5894 case 3:
5895 errmsg = "illegal UTF-16 surrogate";
5896 startinpos = ((const char *)q) - 4 - starts;
5897 endinpos = startinpos + 2;
5898 break;
5899 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005900 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005901 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005902 continue;
5903 }
5904
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005905 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrouab868312009-01-10 15:40:25 +00005906 errors,
5907 &errorHandler,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005908 encoding, errmsg,
Antoine Pitrouab868312009-01-10 15:40:25 +00005909 &starts,
5910 (const char **)&e,
5911 &startinpos,
5912 &endinpos,
5913 &exc,
5914 (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005915 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005916 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005917 }
5918
Antoine Pitrou63065d72012-05-15 23:48:04 +02005919End:
Walter Dörwald69652032004-09-07 20:24:22 +00005920 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005921 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00005922
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005923 Py_XDECREF(errorHandler);
5924 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005925 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005926
Benjamin Peterson29060642009-01-31 22:14:21 +00005927 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005928 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005929 Py_XDECREF(errorHandler);
5930 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005931 return NULL;
5932}
5933
Tim Peters772747b2001-08-09 22:21:55 +00005934PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005935_PyUnicode_EncodeUTF16(PyObject *str,
5936 const char *errors,
5937 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005938{
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005939 enum PyUnicode_Kind kind;
5940 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005941 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005942 PyObject *v;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005943 unsigned short *out;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005944 Py_ssize_t pairs;
Christian Heimes743e0cd2012-10-17 23:52:17 +02005945#if PY_BIG_ENDIAN
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005946 int native_ordering = byteorder >= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005947#else
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005948 int native_ordering = byteorder <= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005949#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005950 const char *encoding;
5951 Py_ssize_t nsize, pos;
5952 PyObject *errorHandler = NULL;
5953 PyObject *exc = NULL;
5954 PyObject *rep = NULL;
Tim Peters772747b2001-08-09 22:21:55 +00005955
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005956 if (!PyUnicode_Check(str)) {
5957 PyErr_BadArgument();
5958 return NULL;
5959 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005960 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005961 return NULL;
5962 kind = PyUnicode_KIND(str);
5963 data = PyUnicode_DATA(str);
5964 len = PyUnicode_GET_LENGTH(str);
Victor Stinner0e368262011-11-10 20:12:49 +01005965
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005966 pairs = 0;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005967 if (kind == PyUnicode_4BYTE_KIND) {
5968 const Py_UCS4 *in = (const Py_UCS4 *)data;
5969 const Py_UCS4 *end = in + len;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005970 while (in < end) {
5971 if (*in++ >= 0x10000) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005972 pairs++;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005973 }
5974 }
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005975 }
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005976 if (len > PY_SSIZE_T_MAX / 2 - pairs - (byteorder == 0)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005977 return PyErr_NoMemory();
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005978 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005979 nsize = len + pairs + (byteorder == 0);
5980 v = PyBytes_FromStringAndSize(NULL, nsize * 2);
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005981 if (v == NULL) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005982 return NULL;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005983 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005984
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005985 /* output buffer is 2-bytes aligned */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02005986 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 2));
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005987 out = (unsigned short *)PyBytes_AS_STRING(v);
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005988 if (byteorder == 0) {
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005989 *out++ = 0xFEFF;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005990 }
5991 if (len == 0) {
Guido van Rossum98297ee2007-11-06 21:34:58 +00005992 goto done;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005993 }
Tim Peters772747b2001-08-09 22:21:55 +00005994
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005995 if (kind == PyUnicode_1BYTE_KIND) {
5996 ucs1lib_utf16_encode((const Py_UCS1 *)data, len, &out, native_ordering);
5997 goto done;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005998 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005999
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006000 if (byteorder < 0) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006001 encoding = "utf-16-le";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006002 }
6003 else if (byteorder > 0) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006004 encoding = "utf-16-be";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006005 }
6006 else {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006007 encoding = "utf-16";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006008 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006009
6010 pos = 0;
6011 while (pos < len) {
6012 Py_ssize_t repsize, moreunits;
6013
6014 if (kind == PyUnicode_2BYTE_KIND) {
6015 pos += ucs2lib_utf16_encode((const Py_UCS2 *)data + pos, len - pos,
6016 &out, native_ordering);
6017 }
6018 else {
6019 assert(kind == PyUnicode_4BYTE_KIND);
6020 pos += ucs4lib_utf16_encode((const Py_UCS4 *)data + pos, len - pos,
6021 &out, native_ordering);
6022 }
6023 if (pos == len)
6024 break;
6025
6026 rep = unicode_encode_call_errorhandler(
6027 errors, &errorHandler,
6028 encoding, "surrogates not allowed",
6029 str, &exc, pos, pos + 1, &pos);
6030 if (!rep)
6031 goto error;
6032
6033 if (PyBytes_Check(rep)) {
6034 repsize = PyBytes_GET_SIZE(rep);
6035 if (repsize & 1) {
6036 raise_encode_exception(&exc, encoding,
6037 str, pos - 1, pos,
6038 "surrogates not allowed");
6039 goto error;
6040 }
6041 moreunits = repsize / 2;
6042 }
6043 else {
6044 assert(PyUnicode_Check(rep));
6045 if (PyUnicode_READY(rep) < 0)
6046 goto error;
6047 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
6048 if (!PyUnicode_IS_ASCII(rep)) {
6049 raise_encode_exception(&exc, encoding,
6050 str, pos - 1, pos,
6051 "surrogates not allowed");
6052 goto error;
6053 }
6054 }
6055
6056 /* two bytes are reserved for each surrogate */
6057 if (moreunits > 1) {
6058 Py_ssize_t outpos = out - (unsigned short*) PyBytes_AS_STRING(v);
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03006059 if (moreunits >= (PY_SSIZE_T_MAX - PyBytes_GET_SIZE(v)) / 2) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006060 /* integer overflow */
6061 PyErr_NoMemory();
6062 goto error;
6063 }
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03006064 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + 2 * (moreunits - 1)) < 0)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006065 goto error;
6066 out = (unsigned short*) PyBytes_AS_STRING(v) + outpos;
6067 }
6068
6069 if (PyBytes_Check(rep)) {
Christian Heimesf051e432016-09-13 20:22:02 +02006070 memcpy(out, PyBytes_AS_STRING(rep), repsize);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006071 out += moreunits;
6072 } else /* rep is unicode */ {
6073 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
6074 ucs1lib_utf16_encode(PyUnicode_1BYTE_DATA(rep), repsize,
6075 &out, native_ordering);
6076 }
6077
6078 Py_CLEAR(rep);
6079 }
6080
6081 /* Cut back to size actually needed. This is necessary for, for example,
6082 encoding of a string containing isolated surrogates and the 'ignore' handler
6083 is used. */
6084 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
6085 if (nsize != PyBytes_GET_SIZE(v))
6086 _PyBytes_Resize(&v, nsize);
6087 Py_XDECREF(errorHandler);
6088 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00006089 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006090 return v;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006091 error:
6092 Py_XDECREF(rep);
6093 Py_XDECREF(errorHandler);
6094 Py_XDECREF(exc);
6095 Py_XDECREF(v);
6096 return NULL;
6097#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00006098}
6099
Alexander Belopolsky40018472011-02-26 01:02:56 +00006100PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006101PyUnicode_EncodeUTF16(const Py_UNICODE *s,
6102 Py_ssize_t size,
6103 const char *errors,
6104 int byteorder)
6105{
6106 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02006107 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006108 if (tmp == NULL)
6109 return NULL;
6110 result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder);
6111 Py_DECREF(tmp);
6112 return result;
6113}
6114
6115PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00006116PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006117{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006118 return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006119}
6120
6121/* --- Unicode Escape Codec ----------------------------------------------- */
6122
Fredrik Lundh06d12682001-01-24 07:59:11 +00006123static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00006124
Alexander Belopolsky40018472011-02-26 01:02:56 +00006125PyObject *
Eric V. Smith42454af2016-10-31 09:22:08 -04006126_PyUnicode_DecodeUnicodeEscape(const char *s,
6127 Py_ssize_t size,
6128 const char *errors,
6129 const char **first_invalid_escape)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006130{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006131 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006132 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006133 const char *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006134 PyObject *errorHandler = NULL;
6135 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006136
Eric V. Smith42454af2016-10-31 09:22:08 -04006137 // so we can remember if we've seen an invalid escape char or not
6138 *first_invalid_escape = NULL;
6139
Victor Stinner62ec3312016-09-06 17:04:34 -07006140 if (size == 0) {
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02006141 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner62ec3312016-09-06 17:04:34 -07006142 }
6143 /* Escaped strings will always be longer than the resulting
6144 Unicode string, so we start with size here and then reduce the
6145 length after conversion to the true value.
6146 (but if the error callback returns a long replacement string
6147 we'll have to allocate more space) */
Victor Stinner8f674cc2013-04-17 23:02:17 +02006148 _PyUnicodeWriter_Init(&writer);
Victor Stinner62ec3312016-09-06 17:04:34 -07006149 writer.min_length = size;
6150 if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
6151 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006152 }
6153
Guido van Rossumd57fd912000-03-10 22:53:23 +00006154 end = s + size;
6155 while (s < end) {
Victor Stinner62ec3312016-09-06 17:04:34 -07006156 unsigned char c = (unsigned char) *s++;
6157 Py_UCS4 ch;
6158 int count;
6159 Py_ssize_t startinpos;
6160 Py_ssize_t endinpos;
6161 const char *message;
6162
6163#define WRITE_ASCII_CHAR(ch) \
6164 do { \
6165 assert(ch <= 127); \
6166 assert(writer.pos < writer.size); \
6167 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6168 } while(0)
6169
6170#define WRITE_CHAR(ch) \
6171 do { \
6172 if (ch <= writer.maxchar) { \
6173 assert(writer.pos < writer.size); \
6174 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6175 } \
6176 else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
6177 goto onError; \
6178 } \
6179 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006180
6181 /* Non-escape characters are interpreted as Unicode ordinals */
Victor Stinner62ec3312016-09-06 17:04:34 -07006182 if (c != '\\') {
6183 WRITE_CHAR(c);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006184 continue;
6185 }
6186
Victor Stinner62ec3312016-09-06 17:04:34 -07006187 startinpos = s - starts - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006188 /* \ - Escapes */
Victor Stinner62ec3312016-09-06 17:04:34 -07006189 if (s >= end) {
6190 message = "\\ at end of string";
6191 goto error;
6192 }
6193 c = (unsigned char) *s++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006194
Victor Stinner62ec3312016-09-06 17:04:34 -07006195 assert(writer.pos < writer.size);
Guido van Rossum8ce8a782007-11-01 19:42:39 +00006196 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006197
Benjamin Peterson29060642009-01-31 22:14:21 +00006198 /* \x escapes */
Victor Stinner62ec3312016-09-06 17:04:34 -07006199 case '\n': continue;
6200 case '\\': WRITE_ASCII_CHAR('\\'); continue;
6201 case '\'': WRITE_ASCII_CHAR('\''); continue;
6202 case '\"': WRITE_ASCII_CHAR('\"'); continue;
6203 case 'b': WRITE_ASCII_CHAR('\b'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006204 /* FF */
Victor Stinner62ec3312016-09-06 17:04:34 -07006205 case 'f': WRITE_ASCII_CHAR('\014'); continue;
6206 case 't': WRITE_ASCII_CHAR('\t'); continue;
6207 case 'n': WRITE_ASCII_CHAR('\n'); continue;
6208 case 'r': WRITE_ASCII_CHAR('\r'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006209 /* VT */
Victor Stinner62ec3312016-09-06 17:04:34 -07006210 case 'v': WRITE_ASCII_CHAR('\013'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006211 /* BEL, not classic C */
Victor Stinner62ec3312016-09-06 17:04:34 -07006212 case 'a': WRITE_ASCII_CHAR('\007'); continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006213
Benjamin Peterson29060642009-01-31 22:14:21 +00006214 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006215 case '0': case '1': case '2': case '3':
6216 case '4': case '5': case '6': case '7':
Victor Stinner62ec3312016-09-06 17:04:34 -07006217 ch = c - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00006218 if (s < end && '0' <= *s && *s <= '7') {
Victor Stinner62ec3312016-09-06 17:04:34 -07006219 ch = (ch<<3) + *s++ - '0';
6220 if (s < end && '0' <= *s && *s <= '7') {
6221 ch = (ch<<3) + *s++ - '0';
6222 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006223 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006224 WRITE_CHAR(ch);
6225 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006226
Benjamin Peterson29060642009-01-31 22:14:21 +00006227 /* hex escapes */
6228 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006229 case 'x':
Victor Stinner62ec3312016-09-06 17:04:34 -07006230 count = 2;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006231 message = "truncated \\xXX escape";
6232 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006233
Benjamin Peterson29060642009-01-31 22:14:21 +00006234 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006235 case 'u':
Victor Stinner62ec3312016-09-06 17:04:34 -07006236 count = 4;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006237 message = "truncated \\uXXXX escape";
6238 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006239
Benjamin Peterson29060642009-01-31 22:14:21 +00006240 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00006241 case 'U':
Victor Stinner62ec3312016-09-06 17:04:34 -07006242 count = 8;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006243 message = "truncated \\UXXXXXXXX escape";
6244 hexescape:
Victor Stinner62ec3312016-09-06 17:04:34 -07006245 for (ch = 0; count && s < end; ++s, --count) {
Serhiy Storchakad6793772013-01-29 10:20:44 +02006246 c = (unsigned char)*s;
Victor Stinner62ec3312016-09-06 17:04:34 -07006247 ch <<= 4;
6248 if (c >= '0' && c <= '9') {
6249 ch += c - '0';
6250 }
6251 else if (c >= 'a' && c <= 'f') {
6252 ch += c - ('a' - 10);
6253 }
6254 else if (c >= 'A' && c <= 'F') {
6255 ch += c - ('A' - 10);
6256 }
6257 else {
6258 break;
6259 }
Fredrik Lundhdf846752000-09-03 11:29:49 +00006260 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006261 if (count) {
Serhiy Storchakad6793772013-01-29 10:20:44 +02006262 goto error;
Victor Stinner62ec3312016-09-06 17:04:34 -07006263 }
6264
6265 /* when we get here, ch is a 32-bit unicode character */
6266 if (ch > MAX_UNICODE) {
6267 message = "illegal Unicode character";
6268 goto error;
6269 }
6270
6271 WRITE_CHAR(ch);
6272 continue;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006273
Benjamin Peterson29060642009-01-31 22:14:21 +00006274 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00006275 case 'N':
Fredrik Lundhccc74732001-02-18 22:13:49 +00006276 if (ucnhash_CAPI == NULL) {
6277 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006278 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
6279 PyUnicodeData_CAPSULE_NAME, 1);
Victor Stinner62ec3312016-09-06 17:04:34 -07006280 if (ucnhash_CAPI == NULL) {
6281 PyErr_SetString(
6282 PyExc_UnicodeError,
6283 "\\N escapes not supported (can't load unicodedata module)"
6284 );
6285 goto onError;
6286 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00006287 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006288
6289 message = "malformed \\N character escape";
Gregory P. Smith746b2d32018-11-13 13:16:54 -08006290 if (s < end && *s == '{') {
Victor Stinner62ec3312016-09-06 17:04:34 -07006291 const char *start = ++s;
6292 size_t namelen;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006293 /* look for the closing brace */
Victor Stinner62ec3312016-09-06 17:04:34 -07006294 while (s < end && *s != '}')
Fredrik Lundhccc74732001-02-18 22:13:49 +00006295 s++;
Victor Stinner62ec3312016-09-06 17:04:34 -07006296 namelen = s - start;
6297 if (namelen && s < end) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00006298 /* found a name. look it up in the unicode database */
Fredrik Lundhccc74732001-02-18 22:13:49 +00006299 s++;
Victor Stinner62ec3312016-09-06 17:04:34 -07006300 ch = 0xffffffff; /* in case 'getcode' messes up */
6301 if (namelen <= INT_MAX &&
6302 ucnhash_CAPI->getcode(NULL, start, (int)namelen,
6303 &ch, 0)) {
6304 assert(ch <= MAX_UNICODE);
6305 WRITE_CHAR(ch);
6306 continue;
6307 }
6308 message = "unknown Unicode character name";
Fredrik Lundhccc74732001-02-18 22:13:49 +00006309 }
6310 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02006311 goto error;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006312
6313 default:
Eric V. Smith42454af2016-10-31 09:22:08 -04006314 if (*first_invalid_escape == NULL) {
6315 *first_invalid_escape = s-1; /* Back up one char, since we've
6316 already incremented s. */
6317 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006318 WRITE_ASCII_CHAR('\\');
6319 WRITE_CHAR(c);
6320 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006321 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02006322
6323 error:
6324 endinpos = s-starts;
Victor Stinner62ec3312016-09-06 17:04:34 -07006325 writer.min_length = end - s + writer.pos;
Serhiy Storchaka8fe5a9f2013-01-29 10:37:39 +02006326 if (unicode_decode_call_errorhandler_writer(
Serhiy Storchakad6793772013-01-29 10:20:44 +02006327 errors, &errorHandler,
6328 "unicodeescape", message,
6329 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinner62ec3312016-09-06 17:04:34 -07006330 &writer)) {
Serhiy Storchakad6793772013-01-29 10:20:44 +02006331 goto onError;
Victor Stinner62ec3312016-09-06 17:04:34 -07006332 }
Serhiy Storchakab7e2d672018-02-13 08:27:33 +02006333 assert(end - s <= writer.size - writer.pos);
Victor Stinner62ec3312016-09-06 17:04:34 -07006334
6335#undef WRITE_ASCII_CHAR
6336#undef WRITE_CHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00006337 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006338
Walter Dörwaldd4ade082003-08-15 15:00:26 +00006339 Py_XDECREF(errorHandler);
6340 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006341 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald8c077222002-03-25 11:16:18 +00006342
Benjamin Peterson29060642009-01-31 22:14:21 +00006343 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006344 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006345 Py_XDECREF(errorHandler);
6346 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006347 return NULL;
6348}
6349
Eric V. Smith42454af2016-10-31 09:22:08 -04006350PyObject *
6351PyUnicode_DecodeUnicodeEscape(const char *s,
6352 Py_ssize_t size,
6353 const char *errors)
6354{
6355 const char *first_invalid_escape;
6356 PyObject *result = _PyUnicode_DecodeUnicodeEscape(s, size, errors,
6357 &first_invalid_escape);
6358 if (result == NULL)
6359 return NULL;
6360 if (first_invalid_escape != NULL) {
6361 if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
6362 "invalid escape sequence '\\%c'",
Serhiy Storchaka56cb4652017-10-20 17:08:15 +03006363 (unsigned char)*first_invalid_escape) < 0) {
Eric V. Smith42454af2016-10-31 09:22:08 -04006364 Py_DECREF(result);
6365 return NULL;
6366 }
6367 }
6368 return result;
6369}
6370
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006371/* Return a Unicode-Escape string version of the Unicode object. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006372
Alexander Belopolsky40018472011-02-26 01:02:56 +00006373PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006374PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006375{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006376 Py_ssize_t i, len;
Victor Stinner62ec3312016-09-06 17:04:34 -07006377 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006378 char *p;
Victor Stinner62ec3312016-09-06 17:04:34 -07006379 enum PyUnicode_Kind kind;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006380 void *data;
Victor Stinner62ec3312016-09-06 17:04:34 -07006381 Py_ssize_t expandsize;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006382
Ezio Melottie7f90372012-10-05 03:33:31 +03006383 /* Initial allocation is based on the longest-possible character
Thomas Wouters89f507f2006-12-13 04:49:30 +00006384 escape.
6385
Ezio Melottie7f90372012-10-05 03:33:31 +03006386 For UCS1 strings it's '\xxx', 4 bytes per source character.
6387 For UCS2 strings it's '\uxxxx', 6 bytes per source character.
6388 For UCS4 strings it's '\U00xxxxxx', 10 bytes per source character.
Thomas Wouters89f507f2006-12-13 04:49:30 +00006389 */
6390
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006391 if (!PyUnicode_Check(unicode)) {
6392 PyErr_BadArgument();
6393 return NULL;
6394 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006395 if (PyUnicode_READY(unicode) == -1) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006396 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006397 }
Victor Stinner358af132015-10-12 22:36:57 +02006398
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006399 len = PyUnicode_GET_LENGTH(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006400 if (len == 0) {
6401 return PyBytes_FromStringAndSize(NULL, 0);
6402 }
6403
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006404 kind = PyUnicode_KIND(unicode);
6405 data = PyUnicode_DATA(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006406 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6407 bytes, and 1 byte characters 4. */
6408 expandsize = kind * 2 + 2;
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006409 if (len > PY_SSIZE_T_MAX / expandsize) {
Victor Stinner62ec3312016-09-06 17:04:34 -07006410 return PyErr_NoMemory();
6411 }
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006412 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
Victor Stinner62ec3312016-09-06 17:04:34 -07006413 if (repr == NULL) {
6414 return NULL;
6415 }
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006416
Victor Stinner62ec3312016-09-06 17:04:34 -07006417 p = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006418 for (i = 0; i < len; i++) {
Victor Stinner3326cb62011-11-10 20:15:25 +01006419 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006420
Victor Stinner62ec3312016-09-06 17:04:34 -07006421 /* U+0000-U+00ff range */
6422 if (ch < 0x100) {
6423 if (ch >= ' ' && ch < 127) {
6424 if (ch != '\\') {
6425 /* Copy printable US ASCII as-is */
6426 *p++ = (char) ch;
6427 }
6428 /* Escape backslashes */
6429 else {
6430 *p++ = '\\';
6431 *p++ = '\\';
6432 }
6433 }
Victor Stinner358af132015-10-12 22:36:57 +02006434
Victor Stinner62ec3312016-09-06 17:04:34 -07006435 /* Map special whitespace to '\t', \n', '\r' */
6436 else if (ch == '\t') {
6437 *p++ = '\\';
6438 *p++ = 't';
6439 }
6440 else if (ch == '\n') {
6441 *p++ = '\\';
6442 *p++ = 'n';
6443 }
6444 else if (ch == '\r') {
6445 *p++ = '\\';
6446 *p++ = 'r';
6447 }
6448
6449 /* Map non-printable US ASCII and 8-bit characters to '\xHH' */
6450 else {
6451 *p++ = '\\';
6452 *p++ = 'x';
6453 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6454 *p++ = Py_hexdigits[ch & 0x000F];
6455 }
Tim Petersced69f82003-09-16 20:30:58 +00006456 }
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006457 /* U+0100-U+ffff range: Map 16-bit characters to '\uHHHH' */
Victor Stinner62ec3312016-09-06 17:04:34 -07006458 else if (ch < 0x10000) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006459 *p++ = '\\';
6460 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006461 *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
6462 *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
6463 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6464 *p++ = Py_hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006465 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006466 /* U+010000-U+10ffff range: Map 21-bit characters to '\U00HHHHHH' */
6467 else {
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006468
Victor Stinner62ec3312016-09-06 17:04:34 -07006469 /* Make sure that the first two digits are zero */
6470 assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006471 *p++ = '\\';
Victor Stinner62ec3312016-09-06 17:04:34 -07006472 *p++ = 'U';
6473 *p++ = '0';
6474 *p++ = '0';
6475 *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
6476 *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
6477 *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
6478 *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
6479 *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
6480 *p++ = Py_hexdigits[ch & 0x0000000F];
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006481 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006482 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006483
Victor Stinner62ec3312016-09-06 17:04:34 -07006484 assert(p - PyBytes_AS_STRING(repr) > 0);
6485 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) {
6486 return NULL;
6487 }
6488 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006489}
6490
Alexander Belopolsky40018472011-02-26 01:02:56 +00006491PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006492PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
6493 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006494{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006495 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02006496 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Victor Stinner62ec3312016-09-06 17:04:34 -07006497 if (tmp == NULL) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006498 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006499 }
6500
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006501 result = PyUnicode_AsUnicodeEscapeString(tmp);
6502 Py_DECREF(tmp);
6503 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006504}
6505
6506/* --- Raw Unicode Escape Codec ------------------------------------------- */
6507
Alexander Belopolsky40018472011-02-26 01:02:56 +00006508PyObject *
6509PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006510 Py_ssize_t size,
6511 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006512{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006513 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006514 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006515 const char *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006516 PyObject *errorHandler = NULL;
6517 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006518
Victor Stinner62ec3312016-09-06 17:04:34 -07006519 if (size == 0) {
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02006520 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner62ec3312016-09-06 17:04:34 -07006521 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006522
Guido van Rossumd57fd912000-03-10 22:53:23 +00006523 /* Escaped strings will always be longer than the resulting
6524 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006525 length after conversion to the true value. (But decoding error
6526 handler might have to resize the string) */
Victor Stinner8f674cc2013-04-17 23:02:17 +02006527 _PyUnicodeWriter_Init(&writer);
Inada Naoki770847a2019-06-24 12:30:24 +09006528 writer.min_length = size;
Victor Stinner62ec3312016-09-06 17:04:34 -07006529 if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
6530 goto onError;
6531 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006532
Guido van Rossumd57fd912000-03-10 22:53:23 +00006533 end = s + size;
6534 while (s < end) {
Victor Stinner62ec3312016-09-06 17:04:34 -07006535 unsigned char c = (unsigned char) *s++;
6536 Py_UCS4 ch;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006537 int count;
Victor Stinner62ec3312016-09-06 17:04:34 -07006538 Py_ssize_t startinpos;
6539 Py_ssize_t endinpos;
6540 const char *message;
6541
6542#define WRITE_CHAR(ch) \
6543 do { \
6544 if (ch <= writer.maxchar) { \
6545 assert(writer.pos < writer.size); \
6546 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6547 } \
6548 else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
6549 goto onError; \
6550 } \
6551 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006552
Benjamin Peterson29060642009-01-31 22:14:21 +00006553 /* Non-escape characters are interpreted as Unicode ordinals */
Victor Stinner62ec3312016-09-06 17:04:34 -07006554 if (c != '\\' || s >= end) {
6555 WRITE_CHAR(c);
Benjamin Peterson29060642009-01-31 22:14:21 +00006556 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006557 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006558
Victor Stinner62ec3312016-09-06 17:04:34 -07006559 c = (unsigned char) *s++;
6560 if (c == 'u') {
6561 count = 4;
6562 message = "truncated \\uXXXX escape";
Benjamin Peterson29060642009-01-31 22:14:21 +00006563 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006564 else if (c == 'U') {
6565 count = 8;
6566 message = "truncated \\UXXXXXXXX escape";
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006567 }
6568 else {
Victor Stinner62ec3312016-09-06 17:04:34 -07006569 assert(writer.pos < writer.size);
6570 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, '\\');
6571 WRITE_CHAR(c);
6572 continue;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006573 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006574 startinpos = s - starts - 2;
6575
6576 /* \uHHHH with 4 hex digits, \U00HHHHHH with 8 */
6577 for (ch = 0; count && s < end; ++s, --count) {
6578 c = (unsigned char)*s;
6579 ch <<= 4;
6580 if (c >= '0' && c <= '9') {
6581 ch += c - '0';
6582 }
6583 else if (c >= 'a' && c <= 'f') {
6584 ch += c - ('a' - 10);
6585 }
6586 else if (c >= 'A' && c <= 'F') {
6587 ch += c - ('A' - 10);
6588 }
6589 else {
6590 break;
6591 }
6592 }
6593 if (!count) {
6594 if (ch <= MAX_UNICODE) {
6595 WRITE_CHAR(ch);
6596 continue;
6597 }
6598 message = "\\Uxxxxxxxx out of range";
6599 }
6600
6601 endinpos = s-starts;
6602 writer.min_length = end - s + writer.pos;
6603 if (unicode_decode_call_errorhandler_writer(
6604 errors, &errorHandler,
6605 "rawunicodeescape", message,
6606 &starts, &end, &startinpos, &endinpos, &exc, &s,
6607 &writer)) {
6608 goto onError;
6609 }
Serhiy Storchakab7e2d672018-02-13 08:27:33 +02006610 assert(end - s <= writer.size - writer.pos);
Victor Stinner62ec3312016-09-06 17:04:34 -07006611
6612#undef WRITE_CHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00006613 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006614 Py_XDECREF(errorHandler);
6615 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006616 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00006617
Benjamin Peterson29060642009-01-31 22:14:21 +00006618 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006619 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006620 Py_XDECREF(errorHandler);
6621 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006622 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006623
Guido van Rossumd57fd912000-03-10 22:53:23 +00006624}
6625
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006626
Alexander Belopolsky40018472011-02-26 01:02:56 +00006627PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006628PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006629{
Victor Stinner62ec3312016-09-06 17:04:34 -07006630 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006631 char *p;
Victor Stinner62ec3312016-09-06 17:04:34 -07006632 Py_ssize_t expandsize, pos;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006633 int kind;
6634 void *data;
6635 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006636
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006637 if (!PyUnicode_Check(unicode)) {
6638 PyErr_BadArgument();
6639 return NULL;
6640 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006641 if (PyUnicode_READY(unicode) == -1) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006642 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006643 }
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006644 kind = PyUnicode_KIND(unicode);
6645 data = PyUnicode_DATA(unicode);
6646 len = PyUnicode_GET_LENGTH(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006647 if (kind == PyUnicode_1BYTE_KIND) {
6648 return PyBytes_FromStringAndSize(data, len);
6649 }
Victor Stinner0e368262011-11-10 20:12:49 +01006650
Victor Stinner62ec3312016-09-06 17:04:34 -07006651 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6652 bytes, and 1 byte characters 4. */
6653 expandsize = kind * 2 + 2;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006654
Victor Stinner62ec3312016-09-06 17:04:34 -07006655 if (len > PY_SSIZE_T_MAX / expandsize) {
6656 return PyErr_NoMemory();
6657 }
6658 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
6659 if (repr == NULL) {
6660 return NULL;
6661 }
6662 if (len == 0) {
6663 return repr;
6664 }
6665
6666 p = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006667 for (pos = 0; pos < len; pos++) {
6668 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Victor Stinner358af132015-10-12 22:36:57 +02006669
Victor Stinner62ec3312016-09-06 17:04:34 -07006670 /* U+0000-U+00ff range: Copy 8-bit characters as-is */
6671 if (ch < 0x100) {
6672 *p++ = (char) ch;
Tim Petersced69f82003-09-16 20:30:58 +00006673 }
Xiang Zhang2b77a922018-02-13 18:33:32 +08006674 /* U+0100-U+ffff range: Map 16-bit characters to '\uHHHH' */
Victor Stinner62ec3312016-09-06 17:04:34 -07006675 else if (ch < 0x10000) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006676 *p++ = '\\';
6677 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006678 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6679 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6680 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6681 *p++ = Py_hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006682 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006683 /* U+010000-U+10ffff range: Map 32-bit characters to '\U00HHHHHH' */
6684 else {
6685 assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
6686 *p++ = '\\';
6687 *p++ = 'U';
6688 *p++ = '0';
6689 *p++ = '0';
6690 *p++ = Py_hexdigits[(ch >> 20) & 0xf];
6691 *p++ = Py_hexdigits[(ch >> 16) & 0xf];
6692 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6693 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6694 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6695 *p++ = Py_hexdigits[ch & 15];
6696 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006697 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006698
Victor Stinner62ec3312016-09-06 17:04:34 -07006699 assert(p > PyBytes_AS_STRING(repr));
6700 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) {
6701 return NULL;
6702 }
6703 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006704}
6705
Alexander Belopolsky40018472011-02-26 01:02:56 +00006706PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006707PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
6708 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006709{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006710 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02006711 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006712 if (tmp == NULL)
Walter Dörwald711005d2007-05-12 12:03:26 +00006713 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006714 result = PyUnicode_AsRawUnicodeEscapeString(tmp);
6715 Py_DECREF(tmp);
6716 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006717}
6718
6719/* --- Latin-1 Codec ------------------------------------------------------ */
6720
Alexander Belopolsky40018472011-02-26 01:02:56 +00006721PyObject *
6722PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006723 Py_ssize_t size,
6724 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006725{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006726 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Victor Stinnere57b1c02011-09-28 22:20:48 +02006727 return _PyUnicode_FromUCS1((unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006728}
6729
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006730/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006731static void
6732make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006733 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006734 PyObject *unicode,
6735 Py_ssize_t startpos, Py_ssize_t endpos,
6736 const char *reason)
6737{
6738 if (*exceptionObject == NULL) {
6739 *exceptionObject = PyObject_CallFunction(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006740 PyExc_UnicodeEncodeError, "sOnns",
Martin v. Löwis9e816682011-11-02 12:45:42 +01006741 encoding, unicode, startpos, endpos, reason);
6742 }
6743 else {
6744 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6745 goto onError;
6746 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6747 goto onError;
6748 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6749 goto onError;
6750 return;
6751 onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02006752 Py_CLEAR(*exceptionObject);
Martin v. Löwis9e816682011-11-02 12:45:42 +01006753 }
6754}
6755
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006756/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006757static void
6758raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006759 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006760 PyObject *unicode,
6761 Py_ssize_t startpos, Py_ssize_t endpos,
6762 const char *reason)
6763{
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006764 make_encode_exception(exceptionObject,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006765 encoding, unicode, startpos, endpos, reason);
6766 if (*exceptionObject != NULL)
6767 PyCodec_StrictErrors(*exceptionObject);
6768}
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006769
6770/* error handling callback helper:
6771 build arguments, call the callback and check the arguments,
6772 put the result into newpos and return the replacement string, which
6773 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006774static PyObject *
6775unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006776 PyObject **errorHandler,
6777 const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006778 PyObject *unicode, PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006779 Py_ssize_t startpos, Py_ssize_t endpos,
6780 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006781{
Serhiy Storchaka2d06e842015-12-25 19:53:18 +02006782 static const char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006783 Py_ssize_t len;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006784 PyObject *restuple;
6785 PyObject *resunicode;
6786
6787 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006788 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006789 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006790 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006791 }
6792
Benjamin Petersonbac79492012-01-14 13:34:47 -05006793 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006794 return NULL;
6795 len = PyUnicode_GET_LENGTH(unicode);
6796
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006797 make_encode_exception(exceptionObject,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006798 encoding, unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006799 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006800 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006801
Jeroen Demeyer196a5302019-07-04 12:31:34 +02006802 restuple = _PyObject_CallOneArg(*errorHandler, *exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006803 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006804 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006805 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006806 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00006807 Py_DECREF(restuple);
6808 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006809 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006810 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00006811 &resunicode, newpos)) {
6812 Py_DECREF(restuple);
6813 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006814 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006815 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6816 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6817 Py_DECREF(restuple);
6818 return NULL;
6819 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006820 if (*newpos<0)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006821 *newpos = len + *newpos;
6822 if (*newpos<0 || *newpos>len) {
Victor Stinnera33bce02014-07-04 22:47:46 +02006823 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00006824 Py_DECREF(restuple);
6825 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006826 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006827 Py_INCREF(resunicode);
6828 Py_DECREF(restuple);
6829 return resunicode;
6830}
6831
Alexander Belopolsky40018472011-02-26 01:02:56 +00006832static PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006833unicode_encode_ucs1(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006834 const char *errors,
Victor Stinner0030cd52015-09-24 14:45:00 +02006835 const Py_UCS4 limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006836{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006837 /* input state */
6838 Py_ssize_t pos=0, size;
6839 int kind;
6840 void *data;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006841 /* pointer into the output */
6842 char *str;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006843 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6844 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Victor Stinner50149202015-09-22 00:26:54 +02006845 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006846 PyObject *exc = NULL;
Victor Stinner50149202015-09-22 00:26:54 +02006847 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Victor Stinner6bd525b2015-10-09 13:10:05 +02006848 PyObject *rep = NULL;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006849 /* output object */
6850 _PyBytesWriter writer;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006851
Benjamin Petersonbac79492012-01-14 13:34:47 -05006852 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006853 return NULL;
6854 size = PyUnicode_GET_LENGTH(unicode);
6855 kind = PyUnicode_KIND(unicode);
6856 data = PyUnicode_DATA(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006857 /* allocate enough for a simple encoding without
6858 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00006859 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00006860 return PyBytes_FromStringAndSize(NULL, 0);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006861
6862 _PyBytesWriter_Init(&writer);
6863 str = _PyBytesWriter_Alloc(&writer, size);
6864 if (str == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00006865 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006866
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006867 while (pos < size) {
Victor Stinner0030cd52015-09-24 14:45:00 +02006868 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006869
Benjamin Peterson29060642009-01-31 22:14:21 +00006870 /* can we encode this? */
Victor Stinner0030cd52015-09-24 14:45:00 +02006871 if (ch < limit) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006872 /* no overflow check, because we know that the space is enough */
Victor Stinner0030cd52015-09-24 14:45:00 +02006873 *str++ = (char)ch;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006874 ++pos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006875 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006876 else {
Victor Stinner6bd525b2015-10-09 13:10:05 +02006877 Py_ssize_t newpos, i;
Benjamin Peterson29060642009-01-31 22:14:21 +00006878 /* startpos for collecting unencodable chars */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006879 Py_ssize_t collstart = pos;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006880 Py_ssize_t collend = collstart + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006881 /* find all unecodable characters */
Victor Stinner50149202015-09-22 00:26:54 +02006882
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006883 while ((collend < size) && (PyUnicode_READ(kind, data, collend) >= limit))
Benjamin Peterson29060642009-01-31 22:14:21 +00006884 ++collend;
Victor Stinner50149202015-09-22 00:26:54 +02006885
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006886 /* Only overallocate the buffer if it's not the last write */
6887 writer.overallocate = (collend < size);
6888
Benjamin Peterson29060642009-01-31 22:14:21 +00006889 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
Victor Stinner50149202015-09-22 00:26:54 +02006890 if (error_handler == _Py_ERROR_UNKNOWN)
Victor Stinner3d4226a2018-08-29 22:21:32 +02006891 error_handler = _Py_GetErrorHandler(errors);
Victor Stinner50149202015-09-22 00:26:54 +02006892
6893 switch (error_handler) {
6894 case _Py_ERROR_STRICT:
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006895 raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006896 goto onError;
Victor Stinner50149202015-09-22 00:26:54 +02006897
6898 case _Py_ERROR_REPLACE:
Victor Stinner01ada392015-10-01 21:54:51 +02006899 memset(str, '?', collend - collstart);
6900 str += (collend - collstart);
Stefan Krahf432a322017-08-21 13:09:59 +02006901 /* fall through */
Victor Stinner50149202015-09-22 00:26:54 +02006902 case _Py_ERROR_IGNORE:
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006903 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006904 break;
Victor Stinner50149202015-09-22 00:26:54 +02006905
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006906 case _Py_ERROR_BACKSLASHREPLACE:
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07006907 /* subtract preallocated bytes */
Victor Stinnerad771582015-10-09 12:38:53 +02006908 writer.min_size -= (collend - collstart);
6909 str = backslashreplace(&writer, str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006910 unicode, collstart, collend);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006911 if (str == NULL)
6912 goto onError;
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006913 pos = collend;
6914 break;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006915
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006916 case _Py_ERROR_XMLCHARREFREPLACE:
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07006917 /* subtract preallocated bytes */
Victor Stinnerad771582015-10-09 12:38:53 +02006918 writer.min_size -= (collend - collstart);
6919 str = xmlcharrefreplace(&writer, str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006920 unicode, collstart, collend);
6921 if (str == NULL)
6922 goto onError;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006923 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006924 break;
Victor Stinner50149202015-09-22 00:26:54 +02006925
Victor Stinnerc3713e92015-09-29 12:32:13 +02006926 case _Py_ERROR_SURROGATEESCAPE:
6927 for (i = collstart; i < collend; ++i) {
6928 ch = PyUnicode_READ(kind, data, i);
6929 if (ch < 0xdc80 || 0xdcff < ch) {
6930 /* Not a UTF-8b surrogate */
6931 break;
6932 }
6933 *str++ = (char)(ch - 0xdc00);
6934 ++pos;
6935 }
6936 if (i >= collend)
6937 break;
6938 collstart = pos;
6939 assert(collstart != collend);
Stefan Krahf432a322017-08-21 13:09:59 +02006940 /* fall through */
Victor Stinnerc3713e92015-09-29 12:32:13 +02006941
Benjamin Peterson29060642009-01-31 22:14:21 +00006942 default:
Victor Stinner6bd525b2015-10-09 13:10:05 +02006943 rep = unicode_encode_call_errorhandler(errors, &error_handler_obj,
6944 encoding, reason, unicode, &exc,
6945 collstart, collend, &newpos);
6946 if (rep == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006947 goto onError;
Victor Stinner0030cd52015-09-24 14:45:00 +02006948
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07006949 /* subtract preallocated bytes */
Xiang Zhangd04d8472016-11-23 19:34:01 +08006950 writer.min_size -= newpos - collstart;
Victor Stinnerad771582015-10-09 12:38:53 +02006951
Victor Stinner6bd525b2015-10-09 13:10:05 +02006952 if (PyBytes_Check(rep)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00006953 /* Directly copy bytes result to output. */
Victor Stinnerce179bf2015-10-09 12:57:22 +02006954 str = _PyBytesWriter_WriteBytes(&writer, str,
Victor Stinner6bd525b2015-10-09 13:10:05 +02006955 PyBytes_AS_STRING(rep),
6956 PyBytes_GET_SIZE(rep));
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006957 }
Victor Stinner6bd525b2015-10-09 13:10:05 +02006958 else {
6959 assert(PyUnicode_Check(rep));
Victor Stinner0030cd52015-09-24 14:45:00 +02006960
Victor Stinner6bd525b2015-10-09 13:10:05 +02006961 if (PyUnicode_READY(rep) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006962 goto onError;
Victor Stinner6bd525b2015-10-09 13:10:05 +02006963
Serhiy Storchaka99250d52016-11-23 15:13:00 +02006964 if (limit == 256 ?
6965 PyUnicode_KIND(rep) != PyUnicode_1BYTE_KIND :
6966 !PyUnicode_IS_ASCII(rep))
6967 {
6968 /* Not all characters are smaller than limit */
6969 raise_encode_exception(&exc, encoding, unicode,
6970 collstart, collend, reason);
6971 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00006972 }
Serhiy Storchaka99250d52016-11-23 15:13:00 +02006973 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
6974 str = _PyBytesWriter_WriteBytes(&writer, str,
6975 PyUnicode_DATA(rep),
6976 PyUnicode_GET_LENGTH(rep));
Benjamin Peterson29060642009-01-31 22:14:21 +00006977 }
Alexey Izbyshev74a307d2018-08-19 21:52:04 +03006978 if (str == NULL)
6979 goto onError;
6980
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006981 pos = newpos;
Victor Stinner6bd525b2015-10-09 13:10:05 +02006982 Py_CLEAR(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006983 }
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006984
6985 /* If overallocation was disabled, ensure that it was the last
6986 write. Otherwise, we missed an optimization */
6987 assert(writer.overallocate || pos == size);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006988 }
6989 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006990
Victor Stinner50149202015-09-22 00:26:54 +02006991 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006992 Py_XDECREF(exc);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006993 return _PyBytesWriter_Finish(&writer, str);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006994
6995 onError:
Victor Stinner6bd525b2015-10-09 13:10:05 +02006996 Py_XDECREF(rep);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006997 _PyBytesWriter_Dealloc(&writer);
Victor Stinner50149202015-09-22 00:26:54 +02006998 Py_XDECREF(error_handler_obj);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006999 Py_XDECREF(exc);
7000 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007001}
7002
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007003/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007004PyObject *
7005PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03007006 Py_ssize_t size,
7007 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007008{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007009 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02007010 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007011 if (unicode == NULL)
7012 return NULL;
7013 result = unicode_encode_ucs1(unicode, errors, 256);
7014 Py_DECREF(unicode);
7015 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007016}
7017
Alexander Belopolsky40018472011-02-26 01:02:56 +00007018PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007019_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007020{
7021 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007022 PyErr_BadArgument();
7023 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007024 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007025 if (PyUnicode_READY(unicode) == -1)
7026 return NULL;
7027 /* Fast path: if it is a one-byte string, construct
7028 bytes object directly. */
7029 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
7030 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
7031 PyUnicode_GET_LENGTH(unicode));
7032 /* Non-Latin-1 characters present. Defer to above function to
7033 raise the exception. */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007034 return unicode_encode_ucs1(unicode, errors, 256);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007035}
7036
7037PyObject*
7038PyUnicode_AsLatin1String(PyObject *unicode)
7039{
7040 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007041}
7042
7043/* --- 7-bit ASCII Codec -------------------------------------------------- */
7044
Alexander Belopolsky40018472011-02-26 01:02:56 +00007045PyObject *
7046PyUnicode_DecodeASCII(const char *s,
7047 Py_ssize_t size,
7048 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007049{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007050 const char *starts = s;
Inada Naoki770847a2019-06-24 12:30:24 +09007051 const char *e = s + size;
Victor Stinnerf96418d2015-09-21 23:06:27 +02007052 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007053 PyObject *exc = NULL;
Victor Stinnerf96418d2015-09-21 23:06:27 +02007054 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Tim Petersced69f82003-09-16 20:30:58 +00007055
Guido van Rossumd57fd912000-03-10 22:53:23 +00007056 if (size == 0)
Serhiy Storchaka678db842013-01-26 12:16:36 +02007057 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01007058
Guido van Rossumd57fd912000-03-10 22:53:23 +00007059 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Victor Stinner702c7342011-10-05 13:50:52 +02007060 if (size == 1 && (unsigned char)s[0] < 128)
7061 return get_latin1_char((unsigned char)s[0]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007062
Inada Naoki770847a2019-06-24 12:30:24 +09007063 // Shortcut for simple case
7064 PyObject *u = PyUnicode_New(size, 127);
7065 if (u == NULL) {
Victor Stinner8f674cc2013-04-17 23:02:17 +02007066 return NULL;
Inada Naoki770847a2019-06-24 12:30:24 +09007067 }
7068 Py_ssize_t outpos = ascii_decode(s, e, PyUnicode_DATA(u));
7069 if (outpos == size) {
7070 return u;
7071 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02007072
Inada Naoki770847a2019-06-24 12:30:24 +09007073 _PyUnicodeWriter writer;
7074 _PyUnicodeWriter_InitWithBuffer(&writer, u);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007075 writer.pos = outpos;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02007076
Inada Naoki770847a2019-06-24 12:30:24 +09007077 s += outpos;
7078 int kind = writer.kind;
7079 void *data = writer.data;
7080 Py_ssize_t startinpos, endinpos;
7081
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007082 while (s < e) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02007083 unsigned char c = (unsigned char)*s;
Benjamin Peterson29060642009-01-31 22:14:21 +00007084 if (c < 128) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007085 PyUnicode_WRITE(kind, data, writer.pos, c);
7086 writer.pos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00007087 ++s;
Victor Stinnerf96418d2015-09-21 23:06:27 +02007088 continue;
Benjamin Peterson29060642009-01-31 22:14:21 +00007089 }
Victor Stinnerf96418d2015-09-21 23:06:27 +02007090
7091 /* byte outsize range 0x00..0x7f: call the error handler */
7092
7093 if (error_handler == _Py_ERROR_UNKNOWN)
Victor Stinner3d4226a2018-08-29 22:21:32 +02007094 error_handler = _Py_GetErrorHandler(errors);
Victor Stinnerf96418d2015-09-21 23:06:27 +02007095
7096 switch (error_handler)
7097 {
7098 case _Py_ERROR_REPLACE:
7099 case _Py_ERROR_SURROGATEESCAPE:
7100 /* Fast-path: the error handler only writes one character,
Victor Stinnerca9381e2015-09-22 00:58:32 +02007101 but we may switch to UCS2 at the first write */
7102 if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
7103 goto onError;
7104 kind = writer.kind;
7105 data = writer.data;
Victor Stinnerf96418d2015-09-21 23:06:27 +02007106
7107 if (error_handler == _Py_ERROR_REPLACE)
7108 PyUnicode_WRITE(kind, data, writer.pos, 0xfffd);
7109 else
7110 PyUnicode_WRITE(kind, data, writer.pos, c + 0xdc00);
7111 writer.pos++;
7112 ++s;
7113 break;
7114
7115 case _Py_ERROR_IGNORE:
7116 ++s;
7117 break;
7118
7119 default:
Benjamin Peterson29060642009-01-31 22:14:21 +00007120 startinpos = s-starts;
7121 endinpos = startinpos + 1;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007122 if (unicode_decode_call_errorhandler_writer(
Victor Stinnerf96418d2015-09-21 23:06:27 +02007123 errors, &error_handler_obj,
Benjamin Peterson29060642009-01-31 22:14:21 +00007124 "ascii", "ordinal not in range(128)",
7125 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007126 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00007127 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007128 kind = writer.kind;
7129 data = writer.data;
Benjamin Peterson29060642009-01-31 22:14:21 +00007130 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007131 }
Victor Stinnerf96418d2015-09-21 23:06:27 +02007132 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007133 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007134 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00007135
Benjamin Peterson29060642009-01-31 22:14:21 +00007136 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007137 _PyUnicodeWriter_Dealloc(&writer);
Victor Stinnerf96418d2015-09-21 23:06:27 +02007138 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007139 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007140 return NULL;
7141}
7142
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007143/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007144PyObject *
7145PyUnicode_EncodeASCII(const Py_UNICODE *p,
7146 Py_ssize_t size,
7147 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007148{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007149 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02007150 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007151 if (unicode == NULL)
7152 return NULL;
7153 result = unicode_encode_ucs1(unicode, errors, 128);
7154 Py_DECREF(unicode);
7155 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007156}
7157
Alexander Belopolsky40018472011-02-26 01:02:56 +00007158PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007159_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007160{
7161 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007162 PyErr_BadArgument();
7163 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007164 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007165 if (PyUnicode_READY(unicode) == -1)
7166 return NULL;
7167 /* Fast path: if it is an ASCII-only string, construct bytes object
7168 directly. Else defer to above function to raise the exception. */
Victor Stinneraf037572013-04-14 18:44:10 +02007169 if (PyUnicode_IS_ASCII(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007170 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
7171 PyUnicode_GET_LENGTH(unicode));
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007172 return unicode_encode_ucs1(unicode, errors, 128);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007173}
7174
7175PyObject *
7176PyUnicode_AsASCIIString(PyObject *unicode)
7177{
7178 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007179}
7180
Steve Dowercc16be82016-09-08 10:35:16 -07007181#ifdef MS_WINDOWS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007182
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007183/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007184
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00007185#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007186#define NEED_RETRY
7187#endif
7188
/* INT_MAX is the theoretical largest chunk (or INT_MAX / 2 when
   transcoding from UTF-16), but INT_MAX / 4 performs better in
   both cases and also avoids partial characters overrunning the
   length limit in MultiByteToWideChar on Windows */
7193#define DECODING_CHUNK_SIZE (INT_MAX/4)
7194
Victor Stinner3a50e702011-10-18 21:21:00 +02007195#ifndef WC_ERR_INVALID_CHARS
7196# define WC_ERR_INVALID_CHARS 0x0080
7197#endif
7198
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007199static const char*
Victor Stinner3a50e702011-10-18 21:21:00 +02007200code_page_name(UINT code_page, PyObject **obj)
7201{
7202 *obj = NULL;
7203 if (code_page == CP_ACP)
7204 return "mbcs";
7205 if (code_page == CP_UTF7)
7206 return "CP_UTF7";
7207 if (code_page == CP_UTF8)
7208 return "CP_UTF8";
7209
7210 *obj = PyBytes_FromFormat("cp%u", code_page);
7211 if (*obj == NULL)
7212 return NULL;
7213 return PyBytes_AS_STRING(*obj);
7214}
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007215
Victor Stinner3a50e702011-10-18 21:21:00 +02007216static DWORD
7217decode_code_page_flags(UINT code_page)
7218{
7219 if (code_page == CP_UTF7) {
7220 /* The CP_UTF7 decoder only supports flags=0 */
7221 return 0;
7222 }
7223 else
7224 return MB_ERR_INVALID_CHARS;
7225}
7226
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007227/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007228 * Decode a byte string from a Windows code page into unicode object in strict
7229 * mode.
7230 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007231 * Returns consumed size if succeed, returns -2 on decode error, or raise an
7232 * OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007233 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007234static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007235decode_code_page_strict(UINT code_page,
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007236 wchar_t **buf,
7237 Py_ssize_t *bufsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02007238 const char *in,
7239 int insize)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007240{
Serhiy Storchakac1e2c282019-03-20 21:45:18 +02007241 DWORD flags = MB_ERR_INVALID_CHARS;
Victor Stinner24729f32011-11-10 20:31:37 +01007242 wchar_t *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007243 DWORD outsize;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007244
7245 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007246 assert(insize > 0);
Serhiy Storchakac1e2c282019-03-20 21:45:18 +02007247 while ((outsize = MultiByteToWideChar(code_page, flags,
7248 in, insize, NULL, 0)) <= 0)
7249 {
7250 if (!flags || GetLastError() != ERROR_INVALID_FLAGS) {
7251 goto error;
7252 }
7253 /* For some code pages (e.g. UTF-7) flags must be set to 0. */
7254 flags = 0;
7255 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007256
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007257 /* Extend a wchar_t* buffer */
7258 Py_ssize_t n = *bufsize; /* Get the current length */
7259 if (widechar_resize(buf, bufsize, n + outsize) < 0) {
7260 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007261 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007262 out = *buf + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007263
7264 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007265 outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
7266 if (outsize <= 0)
7267 goto error;
7268 return insize;
Victor Stinner554f3f02010-06-16 23:33:54 +00007269
Victor Stinner3a50e702011-10-18 21:21:00 +02007270error:
7271 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7272 return -2;
7273 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007274 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007275}
7276
Victor Stinner3a50e702011-10-18 21:21:00 +02007277/*
7278 * Decode a byte string from a code page into unicode object with an error
7279 * handler.
7280 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007281 * Returns consumed size if succeed, or raise an OSError or
Victor Stinner3a50e702011-10-18 21:21:00 +02007282 * UnicodeDecodeError exception and returns -1 on error.
7283 */
7284static int
7285decode_code_page_errors(UINT code_page,
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007286 wchar_t **buf,
7287 Py_ssize_t *bufsize,
Victor Stinner76a31a62011-11-04 00:05:13 +01007288 const char *in, const int size,
Victor Stinner7d00cc12014-03-17 23:08:06 +01007289 const char *errors, int final)
Victor Stinner3a50e702011-10-18 21:21:00 +02007290{
7291 const char *startin = in;
7292 const char *endin = in + size;
Serhiy Storchakac1e2c282019-03-20 21:45:18 +02007293 DWORD flags = MB_ERR_INVALID_CHARS;
Victor Stinner3a50e702011-10-18 21:21:00 +02007294 /* Ideally, we should get reason from FormatMessage. This is the Windows
7295 2000 English version of the message. */
7296 const char *reason = "No mapping for the Unicode character exists "
7297 "in the target code page.";
7298 /* each step cannot decode more than 1 character, but a character can be
7299 represented as a surrogate pair */
Serhiy Storchaka4013c172018-12-03 10:36:45 +02007300 wchar_t buffer[2], *out;
Victor Stinner9f067f42013-06-05 00:21:31 +02007301 int insize;
7302 Py_ssize_t outsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007303 PyObject *errorHandler = NULL;
7304 PyObject *exc = NULL;
7305 PyObject *encoding_obj = NULL;
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007306 const char *encoding;
Victor Stinner3a50e702011-10-18 21:21:00 +02007307 DWORD err;
7308 int ret = -1;
7309
7310 assert(size > 0);
7311
7312 encoding = code_page_name(code_page, &encoding_obj);
7313 if (encoding == NULL)
7314 return -1;
7315
Victor Stinner7d00cc12014-03-17 23:08:06 +01007316 if ((errors == NULL || strcmp(errors, "strict") == 0) && final) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007317 /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
7318 UnicodeDecodeError. */
7319 make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
7320 if (exc != NULL) {
7321 PyCodec_StrictErrors(exc);
7322 Py_CLEAR(exc);
7323 }
7324 goto error;
7325 }
7326
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007327 /* Extend a wchar_t* buffer */
7328 Py_ssize_t n = *bufsize; /* Get the current length */
7329 if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7330 PyErr_NoMemory();
7331 goto error;
Victor Stinner3a50e702011-10-18 21:21:00 +02007332 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007333 if (widechar_resize(buf, bufsize, n + size * Py_ARRAY_LENGTH(buffer)) < 0) {
7334 goto error;
Victor Stinner3a50e702011-10-18 21:21:00 +02007335 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007336 out = *buf + n;
Victor Stinner3a50e702011-10-18 21:21:00 +02007337
7338 /* Decode the byte string character per character */
Victor Stinner3a50e702011-10-18 21:21:00 +02007339 while (in < endin)
7340 {
7341 /* Decode a character */
7342 insize = 1;
7343 do
7344 {
7345 outsize = MultiByteToWideChar(code_page, flags,
7346 in, insize,
7347 buffer, Py_ARRAY_LENGTH(buffer));
7348 if (outsize > 0)
7349 break;
7350 err = GetLastError();
Serhiy Storchakac1e2c282019-03-20 21:45:18 +02007351 if (err == ERROR_INVALID_FLAGS && flags) {
7352 /* For some code pages (e.g. UTF-7) flags must be set to 0. */
7353 flags = 0;
7354 continue;
7355 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007356 if (err != ERROR_NO_UNICODE_TRANSLATION
7357 && err != ERROR_INSUFFICIENT_BUFFER)
7358 {
7359 PyErr_SetFromWindowsErr(0);
7360 goto error;
7361 }
7362 insize++;
7363 }
7364 /* 4=maximum length of a UTF-8 sequence */
7365 while (insize <= 4 && (in + insize) <= endin);
7366
7367 if (outsize <= 0) {
7368 Py_ssize_t startinpos, endinpos, outpos;
7369
Victor Stinner7d00cc12014-03-17 23:08:06 +01007370 /* last character in partial decode? */
7371 if (in + insize >= endin && !final)
7372 break;
7373
Victor Stinner3a50e702011-10-18 21:21:00 +02007374 startinpos = in - startin;
7375 endinpos = startinpos + 1;
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007376 outpos = out - *buf;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007377 if (unicode_decode_call_errorhandler_wchar(
Victor Stinner3a50e702011-10-18 21:21:00 +02007378 errors, &errorHandler,
7379 encoding, reason,
7380 &startin, &endin, &startinpos, &endinpos, &exc, &in,
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007381 buf, bufsize, &outpos))
Victor Stinner3a50e702011-10-18 21:21:00 +02007382 {
7383 goto error;
7384 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007385 out = *buf + outpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007386 }
7387 else {
7388 in += insize;
7389 memcpy(out, buffer, outsize * sizeof(wchar_t));
7390 out += outsize;
7391 }
7392 }
7393
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007394 /* Shrink the buffer */
7395 assert(out - *buf <= *bufsize);
7396 *bufsize = out - *buf;
Victor Stinnere1f17c62014-07-25 14:03:03 +02007397 /* (in - startin) <= size and size is an int */
7398 ret = Py_SAFE_DOWNCAST(in - startin, Py_ssize_t, int);
Victor Stinner3a50e702011-10-18 21:21:00 +02007399
7400error:
7401 Py_XDECREF(encoding_obj);
7402 Py_XDECREF(errorHandler);
7403 Py_XDECREF(exc);
7404 return ret;
7405}
7406
Victor Stinner3a50e702011-10-18 21:21:00 +02007407static PyObject *
7408decode_code_page_stateful(int code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007409 const char *s, Py_ssize_t size,
7410 const char *errors, Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007411{
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007412 wchar_t *buf = NULL;
7413 Py_ssize_t bufsize = 0;
Victor Stinner76a31a62011-11-04 00:05:13 +01007414 int chunk_size, final, converted, done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007415
Victor Stinner3a50e702011-10-18 21:21:00 +02007416 if (code_page < 0) {
7417 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7418 return NULL;
7419 }
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03007420 if (size < 0) {
7421 PyErr_BadInternalCall();
7422 return NULL;
7423 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007424
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007425 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00007426 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007427
Victor Stinner76a31a62011-11-04 00:05:13 +01007428 do
7429 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007430#ifdef NEED_RETRY
Steve Dower7ebdda02019-08-21 16:22:33 -07007431 if (size > DECODING_CHUNK_SIZE) {
7432 chunk_size = DECODING_CHUNK_SIZE;
Victor Stinner76a31a62011-11-04 00:05:13 +01007433 final = 0;
7434 done = 0;
7435 }
7436 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007437#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007438 {
7439 chunk_size = (int)size;
7440 final = (consumed == NULL);
7441 done = 1;
7442 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007443
Victor Stinner76a31a62011-11-04 00:05:13 +01007444 if (chunk_size == 0 && done) {
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007445 if (buf != NULL)
Victor Stinner76a31a62011-11-04 00:05:13 +01007446 break;
Serhiy Storchaka678db842013-01-26 12:16:36 +02007447 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner76a31a62011-11-04 00:05:13 +01007448 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007449
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007450 converted = decode_code_page_strict(code_page, &buf, &bufsize,
Victor Stinner76a31a62011-11-04 00:05:13 +01007451 s, chunk_size);
7452 if (converted == -2)
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007453 converted = decode_code_page_errors(code_page, &buf, &bufsize,
Victor Stinner76a31a62011-11-04 00:05:13 +01007454 s, chunk_size,
Victor Stinner7d00cc12014-03-17 23:08:06 +01007455 errors, final);
7456 assert(converted != 0 || done);
Victor Stinner76a31a62011-11-04 00:05:13 +01007457
7458 if (converted < 0) {
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007459 PyMem_Free(buf);
Victor Stinner76a31a62011-11-04 00:05:13 +01007460 return NULL;
7461 }
7462
7463 if (consumed)
7464 *consumed += converted;
7465
7466 s += converted;
7467 size -= converted;
7468 } while (!done);
Victor Stinner3a50e702011-10-18 21:21:00 +02007469
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007470 PyObject *v = PyUnicode_FromWideChar(buf, bufsize);
7471 PyMem_Free(buf);
7472 return v;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007473}
7474
Alexander Belopolsky40018472011-02-26 01:02:56 +00007475PyObject *
Victor Stinner3a50e702011-10-18 21:21:00 +02007476PyUnicode_DecodeCodePageStateful(int code_page,
7477 const char *s,
7478 Py_ssize_t size,
7479 const char *errors,
7480 Py_ssize_t *consumed)
7481{
7482 return decode_code_page_stateful(code_page, s, size, errors, consumed);
7483}
7484
7485PyObject *
7486PyUnicode_DecodeMBCSStateful(const char *s,
7487 Py_ssize_t size,
7488 const char *errors,
7489 Py_ssize_t *consumed)
7490{
7491 return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
7492}
7493
7494PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00007495PyUnicode_DecodeMBCS(const char *s,
7496 Py_ssize_t size,
7497 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007498{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007499 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
7500}
7501
Victor Stinner3a50e702011-10-18 21:21:00 +02007502static DWORD
7503encode_code_page_flags(UINT code_page, const char *errors)
7504{
7505 if (code_page == CP_UTF8) {
Steve Dower3e96f322015-03-02 08:01:10 -08007506 return WC_ERR_INVALID_CHARS;
Victor Stinner3a50e702011-10-18 21:21:00 +02007507 }
7508 else if (code_page == CP_UTF7) {
7509 /* CP_UTF7 only supports flags=0 */
7510 return 0;
7511 }
7512 else {
7513 if (errors != NULL && strcmp(errors, "replace") == 0)
7514 return 0;
7515 else
7516 return WC_NO_BEST_FIT_CHARS;
7517 }
7518}
7519
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007520/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007521 * Encode a Unicode string to a Windows code page into a byte string in strict
7522 * mode.
7523 *
7524 * Returns consumed characters if succeed, returns -2 on encode error, or raise
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007525 * an OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007526 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007527static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007528encode_code_page_strict(UINT code_page, PyObject **outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007529 PyObject *unicode, Py_ssize_t offset, int len,
Victor Stinner3a50e702011-10-18 21:21:00 +02007530 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007531{
Victor Stinner554f3f02010-06-16 23:33:54 +00007532 BOOL usedDefaultChar = FALSE;
Victor Stinner3a50e702011-10-18 21:21:00 +02007533 BOOL *pusedDefaultChar = &usedDefaultChar;
7534 int outsize;
Victor Stinner24729f32011-11-10 20:31:37 +01007535 wchar_t *p;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007536 Py_ssize_t size;
Victor Stinner3a50e702011-10-18 21:21:00 +02007537 const DWORD flags = encode_code_page_flags(code_page, NULL);
7538 char *out;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007539 /* Create a substring so that we can get the UTF-16 representation
7540 of just the slice under consideration. */
7541 PyObject *substring;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007542
Martin v. Löwis3d325192011-11-04 18:23:06 +01007543 assert(len > 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007544
Victor Stinner3a50e702011-10-18 21:21:00 +02007545 if (code_page != CP_UTF8 && code_page != CP_UTF7)
Victor Stinner554f3f02010-06-16 23:33:54 +00007546 pusedDefaultChar = &usedDefaultChar;
Victor Stinner3a50e702011-10-18 21:21:00 +02007547 else
Victor Stinner554f3f02010-06-16 23:33:54 +00007548 pusedDefaultChar = NULL;
Victor Stinner554f3f02010-06-16 23:33:54 +00007549
Victor Stinner2fc507f2011-11-04 20:06:39 +01007550 substring = PyUnicode_Substring(unicode, offset, offset+len);
7551 if (substring == NULL)
7552 return -1;
7553 p = PyUnicode_AsUnicodeAndSize(substring, &size);
7554 if (p == NULL) {
7555 Py_DECREF(substring);
7556 return -1;
7557 }
Victor Stinner9f067f42013-06-05 00:21:31 +02007558 assert(size <= INT_MAX);
Martin v. Löwis3d325192011-11-04 18:23:06 +01007559
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007560 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007561 outsize = WideCharToMultiByte(code_page, flags,
Victor Stinner9f067f42013-06-05 00:21:31 +02007562 p, (int)size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007563 NULL, 0,
7564 NULL, pusedDefaultChar);
7565 if (outsize <= 0)
7566 goto error;
7567 /* If we used a default char, then we failed! */
Victor Stinner2fc507f2011-11-04 20:06:39 +01007568 if (pusedDefaultChar && *pusedDefaultChar) {
7569 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007570 return -2;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007571 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007572
Victor Stinner3a50e702011-10-18 21:21:00 +02007573 if (*outbytes == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007574 /* Create string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007575 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007576 if (*outbytes == NULL) {
7577 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007578 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007579 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007580 out = PyBytes_AS_STRING(*outbytes);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007581 }
7582 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007583 /* Extend string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007584 const Py_ssize_t n = PyBytes_Size(*outbytes);
7585 if (outsize > PY_SSIZE_T_MAX - n) {
7586 PyErr_NoMemory();
Victor Stinner2fc507f2011-11-04 20:06:39 +01007587 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007588 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007589 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007590 if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
7591 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007592 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007593 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007594 out = PyBytes_AS_STRING(*outbytes) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007595 }
7596
7597 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007598 outsize = WideCharToMultiByte(code_page, flags,
Victor Stinner9f067f42013-06-05 00:21:31 +02007599 p, (int)size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007600 out, outsize,
7601 NULL, pusedDefaultChar);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007602 Py_CLEAR(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007603 if (outsize <= 0)
7604 goto error;
7605 if (pusedDefaultChar && *pusedDefaultChar)
7606 return -2;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007607 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00007608
Victor Stinner3a50e702011-10-18 21:21:00 +02007609error:
Victor Stinner2fc507f2011-11-04 20:06:39 +01007610 Py_XDECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007611 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7612 return -2;
7613 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007614 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007615}
7616
Victor Stinner3a50e702011-10-18 21:21:00 +02007617/*
Serhiy Storchakad65c9492015-11-02 14:10:23 +02007618 * Encode a Unicode string to a Windows code page into a byte string using an
Victor Stinner3a50e702011-10-18 21:21:00 +02007619 * error handler.
7620 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007621 * Returns consumed characters if succeed, or raise an OSError and returns
Victor Stinner3a50e702011-10-18 21:21:00 +02007622 * -1 on other error.
7623 */
7624static int
7625encode_code_page_errors(UINT code_page, PyObject **outbytes,
Victor Stinner7581cef2011-11-03 22:32:33 +01007626 PyObject *unicode, Py_ssize_t unicode_offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007627 Py_ssize_t insize, const char* errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007628{
Victor Stinner3a50e702011-10-18 21:21:00 +02007629 const DWORD flags = encode_code_page_flags(code_page, errors);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007630 Py_ssize_t pos = unicode_offset;
7631 Py_ssize_t endin = unicode_offset + insize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007632 /* Ideally, we should get reason from FormatMessage. This is the Windows
7633 2000 English version of the message. */
7634 const char *reason = "invalid character";
7635 /* 4=maximum length of a UTF-8 sequence */
7636 char buffer[4];
7637 BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
7638 Py_ssize_t outsize;
7639 char *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007640 PyObject *errorHandler = NULL;
7641 PyObject *exc = NULL;
7642 PyObject *encoding_obj = NULL;
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007643 const char *encoding;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007644 Py_ssize_t newpos, newoutsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007645 PyObject *rep;
7646 int ret = -1;
7647
7648 assert(insize > 0);
7649
7650 encoding = code_page_name(code_page, &encoding_obj);
7651 if (encoding == NULL)
7652 return -1;
7653
7654 if (errors == NULL || strcmp(errors, "strict") == 0) {
7655 /* The last error was ERROR_NO_UNICODE_TRANSLATION,
7656 then we raise a UnicodeEncodeError. */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007657 make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
Victor Stinner3a50e702011-10-18 21:21:00 +02007658 if (exc != NULL) {
7659 PyCodec_StrictErrors(exc);
7660 Py_DECREF(exc);
7661 }
7662 Py_XDECREF(encoding_obj);
7663 return -1;
7664 }
7665
7666 if (code_page != CP_UTF8 && code_page != CP_UTF7)
7667 pusedDefaultChar = &usedDefaultChar;
7668 else
7669 pusedDefaultChar = NULL;
7670
7671 if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
7672 PyErr_NoMemory();
7673 goto error;
7674 }
7675 outsize = insize * Py_ARRAY_LENGTH(buffer);
7676
7677 if (*outbytes == NULL) {
7678 /* Create string object */
7679 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7680 if (*outbytes == NULL)
7681 goto error;
7682 out = PyBytes_AS_STRING(*outbytes);
7683 }
7684 else {
7685 /* Extend string object */
7686 Py_ssize_t n = PyBytes_Size(*outbytes);
7687 if (n > PY_SSIZE_T_MAX - outsize) {
7688 PyErr_NoMemory();
7689 goto error;
7690 }
7691 if (_PyBytes_Resize(outbytes, n + outsize) < 0)
7692 goto error;
7693 out = PyBytes_AS_STRING(*outbytes) + n;
7694 }
7695
7696 /* Encode the string character per character */
Martin v. Löwis3d325192011-11-04 18:23:06 +01007697 while (pos < endin)
Victor Stinner3a50e702011-10-18 21:21:00 +02007698 {
Victor Stinner2fc507f2011-11-04 20:06:39 +01007699 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
7700 wchar_t chars[2];
7701 int charsize;
7702 if (ch < 0x10000) {
7703 chars[0] = (wchar_t)ch;
7704 charsize = 1;
7705 }
7706 else {
Victor Stinner76df43d2012-10-30 01:42:39 +01007707 chars[0] = Py_UNICODE_HIGH_SURROGATE(ch);
7708 chars[1] = Py_UNICODE_LOW_SURROGATE(ch);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007709 charsize = 2;
7710 }
7711
Victor Stinner3a50e702011-10-18 21:21:00 +02007712 outsize = WideCharToMultiByte(code_page, flags,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007713 chars, charsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02007714 buffer, Py_ARRAY_LENGTH(buffer),
7715 NULL, pusedDefaultChar);
7716 if (outsize > 0) {
7717 if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
7718 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007719 pos++;
Victor Stinner3a50e702011-10-18 21:21:00 +02007720 memcpy(out, buffer, outsize);
7721 out += outsize;
7722 continue;
7723 }
7724 }
7725 else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
7726 PyErr_SetFromWindowsErr(0);
7727 goto error;
7728 }
7729
Victor Stinner3a50e702011-10-18 21:21:00 +02007730 rep = unicode_encode_call_errorhandler(
7731 errors, &errorHandler, encoding, reason,
Victor Stinner7581cef2011-11-03 22:32:33 +01007732 unicode, &exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007733 pos, pos + 1, &newpos);
Victor Stinner3a50e702011-10-18 21:21:00 +02007734 if (rep == NULL)
7735 goto error;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007736 pos = newpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007737
7738 if (PyBytes_Check(rep)) {
7739 outsize = PyBytes_GET_SIZE(rep);
7740 if (outsize != 1) {
7741 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7742 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7743 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7744 Py_DECREF(rep);
7745 goto error;
7746 }
7747 out = PyBytes_AS_STRING(*outbytes) + offset;
7748 }
7749 memcpy(out, PyBytes_AS_STRING(rep), outsize);
7750 out += outsize;
7751 }
7752 else {
7753 Py_ssize_t i;
7754 enum PyUnicode_Kind kind;
7755 void *data;
7756
Benjamin Petersonbac79492012-01-14 13:34:47 -05007757 if (PyUnicode_READY(rep) == -1) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007758 Py_DECREF(rep);
7759 goto error;
7760 }
7761
7762 outsize = PyUnicode_GET_LENGTH(rep);
7763 if (outsize != 1) {
7764 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7765 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7766 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7767 Py_DECREF(rep);
7768 goto error;
7769 }
7770 out = PyBytes_AS_STRING(*outbytes) + offset;
7771 }
7772 kind = PyUnicode_KIND(rep);
7773 data = PyUnicode_DATA(rep);
7774 for (i=0; i < outsize; i++) {
7775 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7776 if (ch > 127) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007777 raise_encode_exception(&exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007778 encoding, unicode,
7779 pos, pos + 1,
Victor Stinner3a50e702011-10-18 21:21:00 +02007780 "unable to encode error handler result to ASCII");
7781 Py_DECREF(rep);
7782 goto error;
7783 }
7784 *out = (unsigned char)ch;
7785 out++;
7786 }
7787 }
7788 Py_DECREF(rep);
7789 }
7790 /* write a NUL byte */
7791 *out = 0;
7792 outsize = out - PyBytes_AS_STRING(*outbytes);
7793 assert(outsize <= PyBytes_GET_SIZE(*outbytes));
7794 if (_PyBytes_Resize(outbytes, outsize) < 0)
7795 goto error;
7796 ret = 0;
7797
7798error:
7799 Py_XDECREF(encoding_obj);
7800 Py_XDECREF(errorHandler);
7801 Py_XDECREF(exc);
7802 return ret;
7803}
7804
Victor Stinner3a50e702011-10-18 21:21:00 +02007805static PyObject *
7806encode_code_page(int code_page,
Victor Stinner7581cef2011-11-03 22:32:33 +01007807 PyObject *unicode,
Victor Stinner3a50e702011-10-18 21:21:00 +02007808 const char *errors)
7809{
Martin v. Löwis3d325192011-11-04 18:23:06 +01007810 Py_ssize_t len;
Victor Stinner3a50e702011-10-18 21:21:00 +02007811 PyObject *outbytes = NULL;
Victor Stinner7581cef2011-11-03 22:32:33 +01007812 Py_ssize_t offset;
Victor Stinner76a31a62011-11-04 00:05:13 +01007813 int chunk_len, ret, done;
Victor Stinner7581cef2011-11-03 22:32:33 +01007814
Victor Stinner29dacf22015-01-26 16:41:32 +01007815 if (!PyUnicode_Check(unicode)) {
7816 PyErr_BadArgument();
7817 return NULL;
7818 }
7819
Benjamin Petersonbac79492012-01-14 13:34:47 -05007820 if (PyUnicode_READY(unicode) == -1)
Victor Stinner2fc507f2011-11-04 20:06:39 +01007821 return NULL;
7822 len = PyUnicode_GET_LENGTH(unicode);
Guido van Rossum03e29f12000-05-04 15:52:20 +00007823
Victor Stinner3a50e702011-10-18 21:21:00 +02007824 if (code_page < 0) {
7825 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7826 return NULL;
7827 }
7828
Martin v. Löwis3d325192011-11-04 18:23:06 +01007829 if (len == 0)
Victor Stinner76a31a62011-11-04 00:05:13 +01007830 return PyBytes_FromStringAndSize(NULL, 0);
7831
Victor Stinner7581cef2011-11-03 22:32:33 +01007832 offset = 0;
7833 do
7834 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007835#ifdef NEED_RETRY
Steve Dower7ebdda02019-08-21 16:22:33 -07007836 if (len > DECODING_CHUNK_SIZE) {
7837 chunk_len = DECODING_CHUNK_SIZE;
Victor Stinner76a31a62011-11-04 00:05:13 +01007838 done = 0;
7839 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007840 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007841#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007842 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007843 chunk_len = (int)len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007844 done = 1;
7845 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007846
Victor Stinner76a31a62011-11-04 00:05:13 +01007847 ret = encode_code_page_strict(code_page, &outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007848 unicode, offset, chunk_len,
Victor Stinner76a31a62011-11-04 00:05:13 +01007849 errors);
7850 if (ret == -2)
7851 ret = encode_code_page_errors(code_page, &outbytes,
7852 unicode, offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007853 chunk_len, errors);
Victor Stinner7581cef2011-11-03 22:32:33 +01007854 if (ret < 0) {
7855 Py_XDECREF(outbytes);
7856 return NULL;
7857 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007858
Victor Stinner7581cef2011-11-03 22:32:33 +01007859 offset += chunk_len;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007860 len -= chunk_len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007861 } while (!done);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007862
Victor Stinner3a50e702011-10-18 21:21:00 +02007863 return outbytes;
7864}
7865
7866PyObject *
7867PyUnicode_EncodeMBCS(const Py_UNICODE *p,
7868 Py_ssize_t size,
7869 const char *errors)
7870{
Victor Stinner7581cef2011-11-03 22:32:33 +01007871 PyObject *unicode, *res;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02007872 unicode = PyUnicode_FromWideChar(p, size);
Victor Stinner7581cef2011-11-03 22:32:33 +01007873 if (unicode == NULL)
7874 return NULL;
7875 res = encode_code_page(CP_ACP, unicode, errors);
7876 Py_DECREF(unicode);
7877 return res;
Victor Stinner3a50e702011-10-18 21:21:00 +02007878}
7879
7880PyObject *
7881PyUnicode_EncodeCodePage(int code_page,
7882 PyObject *unicode,
7883 const char *errors)
7884{
Victor Stinner7581cef2011-11-03 22:32:33 +01007885 return encode_code_page(code_page, unicode, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007886}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007887
Alexander Belopolsky40018472011-02-26 01:02:56 +00007888PyObject *
7889PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007890{
Victor Stinner7581cef2011-11-03 22:32:33 +01007891 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007892}
7893
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007894#undef NEED_RETRY
7895
Steve Dowercc16be82016-09-08 10:35:16 -07007896#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007897
Guido van Rossumd57fd912000-03-10 22:53:23 +00007898/* --- Character Mapping Codec -------------------------------------------- */
7899
Victor Stinnerfb161b12013-04-18 01:44:27 +02007900static int
7901charmap_decode_string(const char *s,
7902 Py_ssize_t size,
7903 PyObject *mapping,
7904 const char *errors,
7905 _PyUnicodeWriter *writer)
7906{
7907 const char *starts = s;
7908 const char *e;
7909 Py_ssize_t startinpos, endinpos;
7910 PyObject *errorHandler = NULL, *exc = NULL;
7911 Py_ssize_t maplen;
7912 enum PyUnicode_Kind mapkind;
7913 void *mapdata;
7914 Py_UCS4 x;
7915 unsigned char ch;
7916
7917 if (PyUnicode_READY(mapping) == -1)
7918 return -1;
7919
7920 maplen = PyUnicode_GET_LENGTH(mapping);
7921 mapdata = PyUnicode_DATA(mapping);
7922 mapkind = PyUnicode_KIND(mapping);
7923
7924 e = s + size;
7925
7926 if (mapkind == PyUnicode_1BYTE_KIND && maplen >= 256) {
7927 /* fast-path for cp037, cp500 and iso8859_1 encodings. iso8859_1
7928 * is disabled in encoding aliases, latin1 is preferred because
7929 * its implementation is faster. */
7930 Py_UCS1 *mapdata_ucs1 = (Py_UCS1 *)mapdata;
7931 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
7932 Py_UCS4 maxchar = writer->maxchar;
7933
7934 assert (writer->kind == PyUnicode_1BYTE_KIND);
7935 while (s < e) {
7936 ch = *s;
7937 x = mapdata_ucs1[ch];
7938 if (x > maxchar) {
7939 if (_PyUnicodeWriter_Prepare(writer, 1, 0xff) == -1)
7940 goto onError;
7941 maxchar = writer->maxchar;
7942 outdata = (Py_UCS1 *)writer->data;
7943 }
7944 outdata[writer->pos] = x;
7945 writer->pos++;
7946 ++s;
7947 }
7948 return 0;
7949 }
7950
7951 while (s < e) {
7952 if (mapkind == PyUnicode_2BYTE_KIND && maplen >= 256) {
7953 enum PyUnicode_Kind outkind = writer->kind;
7954 Py_UCS2 *mapdata_ucs2 = (Py_UCS2 *)mapdata;
7955 if (outkind == PyUnicode_1BYTE_KIND) {
7956 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
7957 Py_UCS4 maxchar = writer->maxchar;
7958 while (s < e) {
7959 ch = *s;
7960 x = mapdata_ucs2[ch];
7961 if (x > maxchar)
7962 goto Error;
7963 outdata[writer->pos] = x;
7964 writer->pos++;
7965 ++s;
7966 }
7967 break;
7968 }
7969 else if (outkind == PyUnicode_2BYTE_KIND) {
7970 Py_UCS2 *outdata = (Py_UCS2 *)writer->data;
7971 while (s < e) {
7972 ch = *s;
7973 x = mapdata_ucs2[ch];
7974 if (x == 0xFFFE)
7975 goto Error;
7976 outdata[writer->pos] = x;
7977 writer->pos++;
7978 ++s;
7979 }
7980 break;
7981 }
7982 }
7983 ch = *s;
7984
7985 if (ch < maplen)
7986 x = PyUnicode_READ(mapkind, mapdata, ch);
7987 else
7988 x = 0xfffe; /* invalid value */
7989Error:
7990 if (x == 0xfffe)
7991 {
7992 /* undefined mapping */
7993 startinpos = s-starts;
7994 endinpos = startinpos+1;
7995 if (unicode_decode_call_errorhandler_writer(
7996 errors, &errorHandler,
7997 "charmap", "character maps to <undefined>",
7998 &starts, &e, &startinpos, &endinpos, &exc, &s,
7999 writer)) {
8000 goto onError;
8001 }
8002 continue;
8003 }
8004
8005 if (_PyUnicodeWriter_WriteCharInline(writer, x) < 0)
8006 goto onError;
8007 ++s;
8008 }
8009 Py_XDECREF(errorHandler);
8010 Py_XDECREF(exc);
8011 return 0;
8012
8013onError:
8014 Py_XDECREF(errorHandler);
8015 Py_XDECREF(exc);
8016 return -1;
8017}
8018
8019static int
8020charmap_decode_mapping(const char *s,
8021 Py_ssize_t size,
8022 PyObject *mapping,
8023 const char *errors,
8024 _PyUnicodeWriter *writer)
8025{
8026 const char *starts = s;
8027 const char *e;
8028 Py_ssize_t startinpos, endinpos;
8029 PyObject *errorHandler = NULL, *exc = NULL;
8030 unsigned char ch;
Victor Stinnerf4f24242013-05-07 01:01:31 +02008031 PyObject *key, *item = NULL;
Victor Stinnerfb161b12013-04-18 01:44:27 +02008032
8033 e = s + size;
8034
8035 while (s < e) {
8036 ch = *s;
8037
8038 /* Get mapping (char ordinal -> integer, Unicode char or None) */
8039 key = PyLong_FromLong((long)ch);
8040 if (key == NULL)
8041 goto onError;
8042
8043 item = PyObject_GetItem(mapping, key);
8044 Py_DECREF(key);
8045 if (item == NULL) {
8046 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8047 /* No mapping found means: mapping is undefined. */
8048 PyErr_Clear();
8049 goto Undefined;
8050 } else
8051 goto onError;
8052 }
8053
8054 /* Apply mapping */
8055 if (item == Py_None)
8056 goto Undefined;
8057 if (PyLong_Check(item)) {
8058 long value = PyLong_AS_LONG(item);
8059 if (value == 0xFFFE)
8060 goto Undefined;
8061 if (value < 0 || value > MAX_UNICODE) {
8062 PyErr_Format(PyExc_TypeError,
8063 "character mapping must be in range(0x%lx)",
8064 (unsigned long)MAX_UNICODE + 1);
8065 goto onError;
8066 }
8067
8068 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
8069 goto onError;
8070 }
8071 else if (PyUnicode_Check(item)) {
8072 if (PyUnicode_READY(item) == -1)
8073 goto onError;
8074 if (PyUnicode_GET_LENGTH(item) == 1) {
8075 Py_UCS4 value = PyUnicode_READ_CHAR(item, 0);
8076 if (value == 0xFFFE)
8077 goto Undefined;
8078 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
8079 goto onError;
8080 }
8081 else {
8082 writer->overallocate = 1;
8083 if (_PyUnicodeWriter_WriteStr(writer, item) == -1)
8084 goto onError;
8085 }
8086 }
8087 else {
8088 /* wrong return value */
8089 PyErr_SetString(PyExc_TypeError,
8090 "character mapping must return integer, None or str");
8091 goto onError;
8092 }
8093 Py_CLEAR(item);
8094 ++s;
8095 continue;
8096
8097Undefined:
8098 /* undefined mapping */
8099 Py_CLEAR(item);
8100 startinpos = s-starts;
8101 endinpos = startinpos+1;
8102 if (unicode_decode_call_errorhandler_writer(
8103 errors, &errorHandler,
8104 "charmap", "character maps to <undefined>",
8105 &starts, &e, &startinpos, &endinpos, &exc, &s,
8106 writer)) {
8107 goto onError;
8108 }
8109 }
8110 Py_XDECREF(errorHandler);
8111 Py_XDECREF(exc);
8112 return 0;
8113
8114onError:
8115 Py_XDECREF(item);
8116 Py_XDECREF(errorHandler);
8117 Py_XDECREF(exc);
8118 return -1;
8119}
8120
Alexander Belopolsky40018472011-02-26 01:02:56 +00008121PyObject *
8122PyUnicode_DecodeCharmap(const char *s,
8123 Py_ssize_t size,
8124 PyObject *mapping,
8125 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008126{
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008127 _PyUnicodeWriter writer;
Tim Petersced69f82003-09-16 20:30:58 +00008128
Guido van Rossumd57fd912000-03-10 22:53:23 +00008129 /* Default to Latin-1 */
8130 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008131 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008132
Guido van Rossumd57fd912000-03-10 22:53:23 +00008133 if (size == 0)
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02008134 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner8f674cc2013-04-17 23:02:17 +02008135 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02008136 writer.min_length = size;
8137 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008138 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008139
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00008140 if (PyUnicode_CheckExact(mapping)) {
Victor Stinnerfb161b12013-04-18 01:44:27 +02008141 if (charmap_decode_string(s, size, mapping, errors, &writer) < 0)
8142 goto onError;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00008143 }
8144 else {
Victor Stinnerfb161b12013-04-18 01:44:27 +02008145 if (charmap_decode_mapping(s, size, mapping, errors, &writer) < 0)
8146 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008147 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008148 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00008149
Benjamin Peterson29060642009-01-31 22:14:21 +00008150 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008151 _PyUnicodeWriter_Dealloc(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008152 return NULL;
8153}
8154
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008155/* Charmap encoding: the lookup table */
8156
Alexander Belopolsky40018472011-02-26 01:02:56 +00008157struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00008158 PyObject_HEAD
8159 unsigned char level1[32];
8160 int count2, count3;
8161 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008162};
8163
8164static PyObject*
8165encoding_map_size(PyObject *obj, PyObject* args)
8166{
8167 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008168 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00008169 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008170}
8171
8172static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008173 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00008174 PyDoc_STR("Return the size (in bytes) of this object") },
8175 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008176};
8177
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008178static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008179 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008180 "EncodingMap", /*tp_name*/
8181 sizeof(struct encoding_map), /*tp_basicsize*/
8182 0, /*tp_itemsize*/
8183 /* methods */
Inada Naoki7d408692019-05-29 17:23:27 +09008184 0, /*tp_dealloc*/
Jeroen Demeyer530f5062019-05-31 04:13:39 +02008185 0, /*tp_vectorcall_offset*/
Benjamin Peterson29060642009-01-31 22:14:21 +00008186 0, /*tp_getattr*/
8187 0, /*tp_setattr*/
Jeroen Demeyer530f5062019-05-31 04:13:39 +02008188 0, /*tp_as_async*/
Benjamin Peterson29060642009-01-31 22:14:21 +00008189 0, /*tp_repr*/
8190 0, /*tp_as_number*/
8191 0, /*tp_as_sequence*/
8192 0, /*tp_as_mapping*/
8193 0, /*tp_hash*/
8194 0, /*tp_call*/
8195 0, /*tp_str*/
8196 0, /*tp_getattro*/
8197 0, /*tp_setattro*/
8198 0, /*tp_as_buffer*/
8199 Py_TPFLAGS_DEFAULT, /*tp_flags*/
8200 0, /*tp_doc*/
8201 0, /*tp_traverse*/
8202 0, /*tp_clear*/
8203 0, /*tp_richcompare*/
8204 0, /*tp_weaklistoffset*/
8205 0, /*tp_iter*/
8206 0, /*tp_iternext*/
8207 encoding_map_methods, /*tp_methods*/
8208 0, /*tp_members*/
8209 0, /*tp_getset*/
8210 0, /*tp_base*/
8211 0, /*tp_dict*/
8212 0, /*tp_descr_get*/
8213 0, /*tp_descr_set*/
8214 0, /*tp_dictoffset*/
8215 0, /*tp_init*/
8216 0, /*tp_alloc*/
8217 0, /*tp_new*/
8218 0, /*tp_free*/
8219 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008220};
8221
8222PyObject*
8223PyUnicode_BuildEncodingMap(PyObject* string)
8224{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008225 PyObject *result;
8226 struct encoding_map *mresult;
8227 int i;
8228 int need_dict = 0;
8229 unsigned char level1[32];
8230 unsigned char level2[512];
8231 unsigned char *mlevel1, *mlevel2, *mlevel3;
8232 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008233 int kind;
8234 void *data;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008235 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008236 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008237
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008238 if (!PyUnicode_Check(string) || !PyUnicode_GET_LENGTH(string)) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008239 PyErr_BadArgument();
8240 return NULL;
8241 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008242 kind = PyUnicode_KIND(string);
8243 data = PyUnicode_DATA(string);
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008244 length = PyUnicode_GET_LENGTH(string);
8245 length = Py_MIN(length, 256);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008246 memset(level1, 0xFF, sizeof level1);
8247 memset(level2, 0xFF, sizeof level2);
8248
8249 /* If there isn't a one-to-one mapping of NULL to \0,
8250 or if there are non-BMP characters, we need to use
8251 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008252 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008253 need_dict = 1;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008254 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008255 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008256 ch = PyUnicode_READ(kind, data, i);
8257 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008258 need_dict = 1;
8259 break;
8260 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008261 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008262 /* unmapped character */
8263 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008264 l1 = ch >> 11;
8265 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008266 if (level1[l1] == 0xFF)
8267 level1[l1] = count2++;
8268 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00008269 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008270 }
8271
8272 if (count2 >= 0xFF || count3 >= 0xFF)
8273 need_dict = 1;
8274
8275 if (need_dict) {
8276 PyObject *result = PyDict_New();
8277 PyObject *key, *value;
8278 if (!result)
8279 return NULL;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008280 for (i = 0; i < length; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008281 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00008282 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008283 if (!key || !value)
8284 goto failed1;
8285 if (PyDict_SetItem(result, key, value) == -1)
8286 goto failed1;
8287 Py_DECREF(key);
8288 Py_DECREF(value);
8289 }
8290 return result;
8291 failed1:
8292 Py_XDECREF(key);
8293 Py_XDECREF(value);
8294 Py_DECREF(result);
8295 return NULL;
8296 }
8297
8298 /* Create a three-level trie */
8299 result = PyObject_MALLOC(sizeof(struct encoding_map) +
8300 16*count2 + 128*count3 - 1);
8301 if (!result)
8302 return PyErr_NoMemory();
8303 PyObject_Init(result, &EncodingMapType);
8304 mresult = (struct encoding_map*)result;
8305 mresult->count2 = count2;
8306 mresult->count3 = count3;
8307 mlevel1 = mresult->level1;
8308 mlevel2 = mresult->level23;
8309 mlevel3 = mresult->level23 + 16*count2;
8310 memcpy(mlevel1, level1, 32);
8311 memset(mlevel2, 0xFF, 16*count2);
8312 memset(mlevel3, 0, 128*count3);
8313 count3 = 0;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008314 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008315 int o1, o2, o3, i2, i3;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008316 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8317 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008318 /* unmapped character */
8319 continue;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008320 o1 = ch>>11;
8321 o2 = (ch>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008322 i2 = 16*mlevel1[o1] + o2;
8323 if (mlevel2[i2] == 0xFF)
8324 mlevel2[i2] = count3++;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008325 o3 = ch & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008326 i3 = 128*mlevel2[i2] + o3;
8327 mlevel3[i3] = i;
8328 }
8329 return result;
8330}
8331
8332static int
Victor Stinner22168992011-11-20 17:09:18 +01008333encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008334{
8335 struct encoding_map *map = (struct encoding_map*)mapping;
8336 int l1 = c>>11;
8337 int l2 = (c>>7) & 0xF;
8338 int l3 = c & 0x7F;
8339 int i;
8340
Victor Stinner22168992011-11-20 17:09:18 +01008341 if (c > 0xFFFF)
Benjamin Peterson29060642009-01-31 22:14:21 +00008342 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008343 if (c == 0)
8344 return 0;
8345 /* level 1*/
8346 i = map->level1[l1];
8347 if (i == 0xFF) {
8348 return -1;
8349 }
8350 /* level 2*/
8351 i = map->level23[16*i+l2];
8352 if (i == 0xFF) {
8353 return -1;
8354 }
8355 /* level 3 */
8356 i = map->level23[16*map->count2 + 128*i + l3];
8357 if (i == 0) {
8358 return -1;
8359 }
8360 return i;
8361}
8362
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008363/* Lookup the character ch in the mapping. If the character
8364 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00008365 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008366static PyObject *
Victor Stinner22168992011-11-20 17:09:18 +01008367charmapencode_lookup(Py_UCS4 c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008368{
Christian Heimes217cfd12007-12-02 14:31:20 +00008369 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008370 PyObject *x;
8371
8372 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008373 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008374 x = PyObject_GetItem(mapping, w);
8375 Py_DECREF(w);
8376 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008377 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8378 /* No mapping found means: mapping is undefined. */
8379 PyErr_Clear();
Serhiy Storchaka228b12e2017-01-23 09:47:21 +02008380 Py_RETURN_NONE;
Benjamin Peterson29060642009-01-31 22:14:21 +00008381 } else
8382 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008383 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00008384 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008385 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00008386 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008387 long value = PyLong_AS_LONG(x);
8388 if (value < 0 || value > 255) {
8389 PyErr_SetString(PyExc_TypeError,
8390 "character mapping must be in range(256)");
8391 Py_DECREF(x);
8392 return NULL;
8393 }
8394 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008395 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008396 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008397 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008398 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008399 /* wrong return value */
8400 PyErr_Format(PyExc_TypeError,
8401 "character mapping must return integer, bytes or None, not %.400s",
8402 x->ob_type->tp_name);
8403 Py_DECREF(x);
8404 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008405 }
8406}
8407
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008408static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00008409charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008410{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008411 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
8412 /* exponentially overallocate to minimize reallocations */
8413 if (requiredsize < 2*outsize)
8414 requiredsize = 2*outsize;
8415 if (_PyBytes_Resize(outobj, requiredsize))
8416 return -1;
8417 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008418}
8419
/* Outcome of charmapencode_output() */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was encoded and written */
    enc_FAILED,     /* the mapping is undefined for the character */
    enc_EXCEPTION   /* an error occurred */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008423/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00008424 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008425 space is available. Return a new reference to the object that
8426 was put in the output buffer, or Py_None, if the mapping was undefined
8427 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00008428 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008429static charmapencode_result
Victor Stinner22168992011-11-20 17:09:18 +01008430charmapencode_output(Py_UCS4 c, PyObject *mapping,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008431 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008432{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008433 PyObject *rep;
8434 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00008435 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008436
Christian Heimes90aa7642007-12-19 02:45:37 +00008437 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008438 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008439 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008440 if (res == -1)
8441 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00008442 if (outsize<requiredsize)
8443 if (charmapencode_resize(outobj, outpos, requiredsize))
8444 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00008445 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008446 outstart[(*outpos)++] = (char)res;
8447 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008448 }
8449
8450 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008451 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008452 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008453 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008454 Py_DECREF(rep);
8455 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008456 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008457 if (PyLong_Check(rep)) {
8458 Py_ssize_t requiredsize = *outpos+1;
8459 if (outsize<requiredsize)
8460 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8461 Py_DECREF(rep);
8462 return enc_EXCEPTION;
8463 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008464 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008465 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008466 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008467 else {
8468 const char *repchars = PyBytes_AS_STRING(rep);
8469 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
8470 Py_ssize_t requiredsize = *outpos+repsize;
8471 if (outsize<requiredsize)
8472 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8473 Py_DECREF(rep);
8474 return enc_EXCEPTION;
8475 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008476 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008477 memcpy(outstart + *outpos, repchars, repsize);
8478 *outpos += repsize;
8479 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008480 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008481 Py_DECREF(rep);
8482 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008483}
8484
8485/* handle an error in PyUnicode_EncodeCharmap
8486 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008487static int
8488charmap_encoding_error(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008489 PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008490 PyObject **exceptionObject,
Victor Stinner50149202015-09-22 00:26:54 +02008491 _Py_error_handler *error_handler, PyObject **error_handler_obj, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00008492 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008493{
8494 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008495 Py_ssize_t size, repsize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008496 Py_ssize_t newpos;
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008497 enum PyUnicode_Kind kind;
8498 void *data;
8499 Py_ssize_t index;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008500 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008501 Py_ssize_t collstartpos = *inpos;
8502 Py_ssize_t collendpos = *inpos+1;
8503 Py_ssize_t collpos;
Serhiy Storchakae2f92de2017-11-11 13:06:26 +02008504 const char *encoding = "charmap";
8505 const char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008506 charmapencode_result x;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008507 Py_UCS4 ch;
Brian Curtin2787ea42011-11-02 15:09:37 -05008508 int val;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008509
Benjamin Petersonbac79492012-01-14 13:34:47 -05008510 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008511 return -1;
8512 size = PyUnicode_GET_LENGTH(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008513 /* find all unencodable characters */
8514 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008515 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00008516 if (Py_TYPE(mapping) == &EncodingMapType) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008517 ch = PyUnicode_READ_CHAR(unicode, collendpos);
Brian Curtin2787ea42011-11-02 15:09:37 -05008518 val = encoding_map_lookup(ch, mapping);
8519 if (val != -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008520 break;
8521 ++collendpos;
8522 continue;
8523 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008524
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008525 ch = PyUnicode_READ_CHAR(unicode, collendpos);
8526 rep = charmapencode_lookup(ch, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008527 if (rep==NULL)
8528 return -1;
8529 else if (rep!=Py_None) {
8530 Py_DECREF(rep);
8531 break;
8532 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008533 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00008534 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008535 }
8536 /* cache callback name lookup
8537 * (if not done yet, i.e. it's the first error) */
Victor Stinner50149202015-09-22 00:26:54 +02008538 if (*error_handler == _Py_ERROR_UNKNOWN)
Victor Stinner3d4226a2018-08-29 22:21:32 +02008539 *error_handler = _Py_GetErrorHandler(errors);
Victor Stinner50149202015-09-22 00:26:54 +02008540
8541 switch (*error_handler) {
8542 case _Py_ERROR_STRICT:
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008543 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008544 return -1;
Victor Stinner50149202015-09-22 00:26:54 +02008545
8546 case _Py_ERROR_REPLACE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008547 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008548 x = charmapencode_output('?', mapping, res, respos);
8549 if (x==enc_EXCEPTION) {
8550 return -1;
8551 }
8552 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008553 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008554 return -1;
8555 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008556 }
8557 /* fall through */
Victor Stinner50149202015-09-22 00:26:54 +02008558 case _Py_ERROR_IGNORE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008559 *inpos = collendpos;
8560 break;
Victor Stinner50149202015-09-22 00:26:54 +02008561
8562 case _Py_ERROR_XMLCHARREFREPLACE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008563 /* generate replacement (temporarily (mis)uses p) */
8564 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008565 char buffer[2+29+1+1];
8566 char *cp;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008567 sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
Benjamin Peterson29060642009-01-31 22:14:21 +00008568 for (cp = buffer; *cp; ++cp) {
8569 x = charmapencode_output(*cp, mapping, res, respos);
8570 if (x==enc_EXCEPTION)
8571 return -1;
8572 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008573 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008574 return -1;
8575 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008576 }
8577 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008578 *inpos = collendpos;
8579 break;
Victor Stinner50149202015-09-22 00:26:54 +02008580
Benjamin Peterson14339b62009-01-31 16:36:08 +00008581 default:
Victor Stinner50149202015-09-22 00:26:54 +02008582 repunicode = unicode_encode_call_errorhandler(errors, error_handler_obj,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008583 encoding, reason, unicode, exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00008584 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008585 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008586 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00008587 if (PyBytes_Check(repunicode)) {
8588 /* Directly copy bytes result to output. */
8589 Py_ssize_t outsize = PyBytes_Size(*res);
8590 Py_ssize_t requiredsize;
8591 repsize = PyBytes_Size(repunicode);
8592 requiredsize = *respos + repsize;
8593 if (requiredsize > outsize)
8594 /* Make room for all additional bytes. */
8595 if (charmapencode_resize(res, respos, requiredsize)) {
8596 Py_DECREF(repunicode);
8597 return -1;
8598 }
8599 memcpy(PyBytes_AsString(*res) + *respos,
8600 PyBytes_AsString(repunicode), repsize);
8601 *respos += repsize;
8602 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008603 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00008604 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008605 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008606 /* generate replacement */
Benjamin Petersonbac79492012-01-14 13:34:47 -05008607 if (PyUnicode_READY(repunicode) == -1) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008608 Py_DECREF(repunicode);
8609 return -1;
8610 }
Victor Stinner9e30aa52011-11-21 02:49:52 +01008611 repsize = PyUnicode_GET_LENGTH(repunicode);
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008612 data = PyUnicode_DATA(repunicode);
8613 kind = PyUnicode_KIND(repunicode);
8614 for (index = 0; index < repsize; index++) {
8615 Py_UCS4 repch = PyUnicode_READ(kind, data, index);
8616 x = charmapencode_output(repch, mapping, res, respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008617 if (x==enc_EXCEPTION) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008618 Py_DECREF(repunicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00008619 return -1;
8620 }
8621 else if (x==enc_FAILED) {
8622 Py_DECREF(repunicode);
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008623 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008624 return -1;
8625 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008626 }
8627 *inpos = newpos;
8628 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008629 }
8630 return 0;
8631}
8632
Alexander Belopolsky40018472011-02-26 01:02:56 +00008633PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008634_PyUnicode_EncodeCharmap(PyObject *unicode,
8635 PyObject *mapping,
8636 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008637{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008638 /* output object */
8639 PyObject *res = NULL;
8640 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008641 Py_ssize_t inpos = 0;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008642 Py_ssize_t size;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008643 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008644 Py_ssize_t respos = 0;
Victor Stinner50149202015-09-22 00:26:54 +02008645 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008646 PyObject *exc = NULL;
Victor Stinner50149202015-09-22 00:26:54 +02008647 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Victor Stinner69ed0f42013-04-09 21:48:24 +02008648 void *data;
8649 int kind;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008650
Benjamin Petersonbac79492012-01-14 13:34:47 -05008651 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008652 return NULL;
8653 size = PyUnicode_GET_LENGTH(unicode);
Victor Stinner69ed0f42013-04-09 21:48:24 +02008654 data = PyUnicode_DATA(unicode);
8655 kind = PyUnicode_KIND(unicode);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008656
Guido van Rossumd57fd912000-03-10 22:53:23 +00008657 /* Default to Latin-1 */
8658 if (mapping == NULL)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008659 return unicode_encode_ucs1(unicode, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008660
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008661 /* allocate enough for a simple encoding without
8662 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00008663 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008664 if (res == NULL)
8665 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00008666 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008667 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008668
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008669 while (inpos<size) {
Victor Stinner69ed0f42013-04-09 21:48:24 +02008670 Py_UCS4 ch = PyUnicode_READ(kind, data, inpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008671 /* try to encode it */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008672 charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008673 if (x==enc_EXCEPTION) /* error */
8674 goto onError;
8675 if (x==enc_FAILED) { /* unencodable character */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008676 if (charmap_encoding_error(unicode, &inpos, mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00008677 &exc,
Victor Stinner50149202015-09-22 00:26:54 +02008678 &error_handler, &error_handler_obj, errors,
Benjamin Peterson29060642009-01-31 22:14:21 +00008679 &res, &respos)) {
8680 goto onError;
8681 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008682 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008683 else
8684 /* done with this character => adjust input position */
8685 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008686 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008687
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008688 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00008689 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00008690 if (_PyBytes_Resize(&res, respos) < 0)
8691 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00008692
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008693 Py_XDECREF(exc);
Victor Stinner50149202015-09-22 00:26:54 +02008694 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008695 return res;
8696
Benjamin Peterson29060642009-01-31 22:14:21 +00008697 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008698 Py_XDECREF(res);
8699 Py_XDECREF(exc);
Victor Stinner50149202015-09-22 00:26:54 +02008700 Py_XDECREF(error_handler_obj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008701 return NULL;
8702}
8703
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008704/* Deprecated */
8705PyObject *
8706PyUnicode_EncodeCharmap(const Py_UNICODE *p,
8707 Py_ssize_t size,
8708 PyObject *mapping,
8709 const char *errors)
8710{
8711 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02008712 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008713 if (unicode == NULL)
8714 return NULL;
8715 result = _PyUnicode_EncodeCharmap(unicode, mapping, errors);
8716 Py_DECREF(unicode);
Victor Stinnerfc026c92011-11-04 00:24:51 +01008717 return result;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008718}
8719
Alexander Belopolsky40018472011-02-26 01:02:56 +00008720PyObject *
8721PyUnicode_AsCharmapString(PyObject *unicode,
8722 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008723{
8724 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008725 PyErr_BadArgument();
8726 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008727 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008728 return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008729}
8730
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008731/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008732static void
8733make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008734 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008735 Py_ssize_t startpos, Py_ssize_t endpos,
8736 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008737{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008738 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008739 *exceptionObject = _PyUnicodeTranslateError_Create(
8740 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008741 }
8742 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008743 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
8744 goto onError;
8745 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
8746 goto onError;
8747 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
8748 goto onError;
8749 return;
8750 onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02008751 Py_CLEAR(*exceptionObject);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008752 }
8753}
8754
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008755/* error handling callback helper:
8756 build arguments, call the callback and check the arguments,
8757 put the result into newpos and return the replacement string, which
8758 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008759static PyObject *
8760unicode_translate_call_errorhandler(const char *errors,
8761 PyObject **errorHandler,
8762 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008763 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008764 Py_ssize_t startpos, Py_ssize_t endpos,
8765 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008766{
Serhiy Storchakaf8d7d412016-10-23 15:12:25 +03008767 static const char *argparse = "Un;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008768
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008769 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008770 PyObject *restuple;
8771 PyObject *resunicode;
8772
8773 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008774 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008775 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008776 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008777 }
8778
8779 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008780 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008781 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008782 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008783
Jeroen Demeyer196a5302019-07-04 12:31:34 +02008784 restuple = _PyObject_CallOneArg(*errorHandler, *exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008785 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008786 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008787 if (!PyTuple_Check(restuple)) {
Serhiy Storchakaf8d7d412016-10-23 15:12:25 +03008788 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00008789 Py_DECREF(restuple);
8790 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008791 }
Serhiy Storchakaf8d7d412016-10-23 15:12:25 +03008792 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00008793 &resunicode, &i_newpos)) {
8794 Py_DECREF(restuple);
8795 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008796 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00008797 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008798 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008799 else
8800 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008801 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnera33bce02014-07-04 22:47:46 +02008802 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008803 Py_DECREF(restuple);
8804 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00008805 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008806 Py_INCREF(resunicode);
8807 Py_DECREF(restuple);
8808 return resunicode;
8809}
8810
8811/* Lookup the character ch in the mapping and put the result in result,
8812 which must be decrefed by the caller.
8813 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008814static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008815charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008816{
Christian Heimes217cfd12007-12-02 14:31:20 +00008817 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008818 PyObject *x;
8819
8820 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008821 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008822 x = PyObject_GetItem(mapping, w);
8823 Py_DECREF(w);
8824 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008825 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8826 /* No mapping found means: use 1:1 mapping. */
8827 PyErr_Clear();
8828 *result = NULL;
8829 return 0;
8830 } else
8831 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008832 }
8833 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008834 *result = x;
8835 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008836 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008837 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008838 long value = PyLong_AS_LONG(x);
Victor Stinner4ff33af2014-04-05 11:56:37 +02008839 if (value < 0 || value > MAX_UNICODE) {
8840 PyErr_Format(PyExc_ValueError,
8841 "character mapping must be in range(0x%x)",
8842 MAX_UNICODE+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00008843 Py_DECREF(x);
8844 return -1;
8845 }
8846 *result = x;
8847 return 0;
8848 }
8849 else if (PyUnicode_Check(x)) {
8850 *result = x;
8851 return 0;
8852 }
8853 else {
8854 /* wrong return value */
8855 PyErr_SetString(PyExc_TypeError,
8856 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00008857 Py_DECREF(x);
8858 return -1;
8859 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008860}
Victor Stinner1194ea02014-04-04 19:37:40 +02008861
8862/* lookup the character, write the result into the writer.
8863 Return 1 if the result was written into the writer, return 0 if the mapping
8864 was undefined, raise an exception return -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008865static int
Victor Stinner1194ea02014-04-04 19:37:40 +02008866charmaptranslate_output(Py_UCS4 ch, PyObject *mapping,
8867 _PyUnicodeWriter *writer)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008868{
Victor Stinner1194ea02014-04-04 19:37:40 +02008869 PyObject *item;
8870
8871 if (charmaptranslate_lookup(ch, mapping, &item))
Benjamin Peterson29060642009-01-31 22:14:21 +00008872 return -1;
Victor Stinner1194ea02014-04-04 19:37:40 +02008873
8874 if (item == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008875 /* not found => default to 1:1 mapping */
Victor Stinner1194ea02014-04-04 19:37:40 +02008876 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008877 return -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00008878 }
Victor Stinner1194ea02014-04-04 19:37:40 +02008879 return 1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008880 }
Victor Stinner1194ea02014-04-04 19:37:40 +02008881
8882 if (item == Py_None) {
8883 Py_DECREF(item);
8884 return 0;
8885 }
8886
8887 if (PyLong_Check(item)) {
Victor Stinner4ff33af2014-04-05 11:56:37 +02008888 long ch = (Py_UCS4)PyLong_AS_LONG(item);
8889 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
8890 used it */
Victor Stinner1194ea02014-04-04 19:37:40 +02008891 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
8892 Py_DECREF(item);
8893 return -1;
8894 }
8895 Py_DECREF(item);
8896 return 1;
8897 }
8898
8899 if (!PyUnicode_Check(item)) {
8900 Py_DECREF(item);
Benjamin Peterson29060642009-01-31 22:14:21 +00008901 return -1;
Victor Stinner1194ea02014-04-04 19:37:40 +02008902 }
8903
8904 if (_PyUnicodeWriter_WriteStr(writer, item) < 0) {
8905 Py_DECREF(item);
8906 return -1;
8907 }
8908
8909 Py_DECREF(item);
8910 return 1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008911}
8912
Victor Stinner89a76ab2014-04-05 11:44:04 +02008913static int
8914unicode_fast_translate_lookup(PyObject *mapping, Py_UCS1 ch,
8915 Py_UCS1 *translate)
8916{
Benjamin Peterson1365de72014-04-07 20:15:41 -04008917 PyObject *item = NULL;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008918 int ret = 0;
8919
Victor Stinner89a76ab2014-04-05 11:44:04 +02008920 if (charmaptranslate_lookup(ch, mapping, &item)) {
8921 return -1;
8922 }
8923
8924 if (item == Py_None) {
Benjamin Peterson1365de72014-04-07 20:15:41 -04008925 /* deletion */
Victor Stinner872b2912014-04-05 14:27:07 +02008926 translate[ch] = 0xfe;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008927 }
Benjamin Peterson1365de72014-04-07 20:15:41 -04008928 else if (item == NULL) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02008929 /* not found => default to 1:1 mapping */
8930 translate[ch] = ch;
8931 return 1;
8932 }
Benjamin Peterson1365de72014-04-07 20:15:41 -04008933 else if (PyLong_Check(item)) {
Victor Stinner4dd25252014-04-08 09:14:21 +02008934 long replace = PyLong_AS_LONG(item);
Victor Stinner4ff33af2014-04-05 11:56:37 +02008935 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
8936 used it */
8937 if (127 < replace) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02008938 /* invalid character or character outside ASCII:
8939 skip the fast translate */
8940 goto exit;
8941 }
8942 translate[ch] = (Py_UCS1)replace;
8943 }
8944 else if (PyUnicode_Check(item)) {
8945 Py_UCS4 replace;
8946
8947 if (PyUnicode_READY(item) == -1) {
8948 Py_DECREF(item);
8949 return -1;
8950 }
8951 if (PyUnicode_GET_LENGTH(item) != 1)
8952 goto exit;
8953
8954 replace = PyUnicode_READ_CHAR(item, 0);
8955 if (replace > 127)
8956 goto exit;
8957 translate[ch] = (Py_UCS1)replace;
8958 }
8959 else {
Benjamin Peterson1365de72014-04-07 20:15:41 -04008960 /* not None, NULL, long or unicode */
Victor Stinner89a76ab2014-04-05 11:44:04 +02008961 goto exit;
8962 }
Victor Stinner89a76ab2014-04-05 11:44:04 +02008963 ret = 1;
8964
Benjamin Peterson1365de72014-04-07 20:15:41 -04008965 exit:
8966 Py_DECREF(item);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008967 return ret;
8968}
8969
8970/* Fast path for ascii => ascii translation. Return 1 if the whole string
8971 was translated into writer, return 0 if the input string was partially
8972 translated into writer, raise an exception and return -1 on error. */
8973static int
8974unicode_fast_translate(PyObject *input, PyObject *mapping,
Victor Stinner6c9aa8f2016-03-01 21:30:30 +01008975 _PyUnicodeWriter *writer, int ignore,
8976 Py_ssize_t *input_pos)
Victor Stinner89a76ab2014-04-05 11:44:04 +02008977{
Victor Stinner872b2912014-04-05 14:27:07 +02008978 Py_UCS1 ascii_table[128], ch, ch2;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008979 Py_ssize_t len;
8980 Py_UCS1 *in, *end, *out;
Victor Stinner872b2912014-04-05 14:27:07 +02008981 int res = 0;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008982
Victor Stinner89a76ab2014-04-05 11:44:04 +02008983 len = PyUnicode_GET_LENGTH(input);
8984
Victor Stinner872b2912014-04-05 14:27:07 +02008985 memset(ascii_table, 0xff, 128);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008986
8987 in = PyUnicode_1BYTE_DATA(input);
8988 end = in + len;
8989
8990 assert(PyUnicode_IS_ASCII(writer->buffer));
8991 assert(PyUnicode_GET_LENGTH(writer->buffer) == len);
8992 out = PyUnicode_1BYTE_DATA(writer->buffer);
8993
Victor Stinner872b2912014-04-05 14:27:07 +02008994 for (; in < end; in++) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02008995 ch = *in;
Victor Stinner872b2912014-04-05 14:27:07 +02008996 ch2 = ascii_table[ch];
Victor Stinner89a76ab2014-04-05 11:44:04 +02008997 if (ch2 == 0xff) {
Victor Stinner872b2912014-04-05 14:27:07 +02008998 int translate = unicode_fast_translate_lookup(mapping, ch,
8999 ascii_table);
9000 if (translate < 0)
Victor Stinner89a76ab2014-04-05 11:44:04 +02009001 return -1;
Victor Stinner872b2912014-04-05 14:27:07 +02009002 if (translate == 0)
9003 goto exit;
9004 ch2 = ascii_table[ch];
Victor Stinner89a76ab2014-04-05 11:44:04 +02009005 }
Victor Stinner872b2912014-04-05 14:27:07 +02009006 if (ch2 == 0xfe) {
9007 if (ignore)
9008 continue;
9009 goto exit;
9010 }
9011 assert(ch2 < 128);
Victor Stinner89a76ab2014-04-05 11:44:04 +02009012 *out = ch2;
Victor Stinner872b2912014-04-05 14:27:07 +02009013 out++;
Victor Stinner89a76ab2014-04-05 11:44:04 +02009014 }
Victor Stinner872b2912014-04-05 14:27:07 +02009015 res = 1;
9016
9017exit:
9018 writer->pos = out - PyUnicode_1BYTE_DATA(writer->buffer);
Victor Stinner6c9aa8f2016-03-01 21:30:30 +01009019 *input_pos = in - PyUnicode_1BYTE_DATA(input);
Victor Stinner872b2912014-04-05 14:27:07 +02009020 return res;
Victor Stinner89a76ab2014-04-05 11:44:04 +02009021}
9022
Victor Stinner3222da22015-10-01 22:07:32 +02009023static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009024_PyUnicode_TranslateCharmap(PyObject *input,
9025 PyObject *mapping,
9026 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009027{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009028 /* input object */
Victor Stinner1194ea02014-04-04 19:37:40 +02009029 char *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009030 Py_ssize_t size, i;
9031 int kind;
9032 /* output buffer */
Victor Stinner1194ea02014-04-04 19:37:40 +02009033 _PyUnicodeWriter writer;
9034 /* error handler */
Serhiy Storchakae2f92de2017-11-11 13:06:26 +02009035 const char *reason = "character maps to <undefined>";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009036 PyObject *errorHandler = NULL;
9037 PyObject *exc = NULL;
Victor Stinner1194ea02014-04-04 19:37:40 +02009038 int ignore;
Victor Stinner89a76ab2014-04-05 11:44:04 +02009039 int res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009040
Guido van Rossumd57fd912000-03-10 22:53:23 +00009041 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009042 PyErr_BadArgument();
9043 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009044 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009045
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009046 if (PyUnicode_READY(input) == -1)
9047 return NULL;
Victor Stinner1194ea02014-04-04 19:37:40 +02009048 data = (char*)PyUnicode_DATA(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009049 kind = PyUnicode_KIND(input);
9050 size = PyUnicode_GET_LENGTH(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009051
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009052 if (size == 0)
9053 return PyUnicode_FromObject(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009054
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009055 /* allocate enough for a simple 1:1 translation without
9056 replacements, if we need more, we'll resize */
Victor Stinner1194ea02014-04-04 19:37:40 +02009057 _PyUnicodeWriter_Init(&writer);
9058 if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00009059 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009060
Victor Stinner872b2912014-04-05 14:27:07 +02009061 ignore = (errors != NULL && strcmp(errors, "ignore") == 0);
9062
Victor Stinner33798672016-03-01 21:59:58 +01009063 if (PyUnicode_READY(input) == -1)
Victor Stinner89a76ab2014-04-05 11:44:04 +02009064 return NULL;
Victor Stinner33798672016-03-01 21:59:58 +01009065 if (PyUnicode_IS_ASCII(input)) {
9066 res = unicode_fast_translate(input, mapping, &writer, ignore, &i);
9067 if (res < 0) {
9068 _PyUnicodeWriter_Dealloc(&writer);
9069 return NULL;
9070 }
9071 if (res == 1)
9072 return _PyUnicodeWriter_Finish(&writer);
Victor Stinner89a76ab2014-04-05 11:44:04 +02009073 }
Victor Stinner33798672016-03-01 21:59:58 +01009074 else {
9075 i = 0;
9076 }
Victor Stinner89a76ab2014-04-05 11:44:04 +02009077
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009078 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009079 /* try to encode it */
Victor Stinner1194ea02014-04-04 19:37:40 +02009080 int translate;
9081 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
9082 Py_ssize_t newpos;
9083 /* startpos for collecting untranslatable chars */
9084 Py_ssize_t collstart;
9085 Py_ssize_t collend;
Victor Stinner1194ea02014-04-04 19:37:40 +02009086 Py_UCS4 ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009087
Victor Stinner1194ea02014-04-04 19:37:40 +02009088 ch = PyUnicode_READ(kind, data, i);
9089 translate = charmaptranslate_output(ch, mapping, &writer);
9090 if (translate < 0)
9091 goto onError;
9092
9093 if (translate != 0) {
9094 /* it worked => adjust input pointer */
9095 ++i;
9096 continue;
9097 }
9098
9099 /* untranslatable character */
9100 collstart = i;
9101 collend = i+1;
9102
9103 /* find all untranslatable characters */
9104 while (collend < size) {
9105 PyObject *x;
9106 ch = PyUnicode_READ(kind, data, collend);
9107 if (charmaptranslate_lookup(ch, mapping, &x))
Benjamin Peterson14339b62009-01-31 16:36:08 +00009108 goto onError;
Victor Stinner1194ea02014-04-04 19:37:40 +02009109 Py_XDECREF(x);
9110 if (x != Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00009111 break;
Victor Stinner1194ea02014-04-04 19:37:40 +02009112 ++collend;
9113 }
9114
9115 if (ignore) {
9116 i = collend;
9117 }
9118 else {
9119 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
9120 reason, input, &exc,
9121 collstart, collend, &newpos);
9122 if (repunicode == NULL)
9123 goto onError;
9124 if (_PyUnicodeWriter_WriteStr(&writer, repunicode) < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009125 Py_DECREF(repunicode);
Victor Stinner1194ea02014-04-04 19:37:40 +02009126 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009127 }
Victor Stinner1194ea02014-04-04 19:37:40 +02009128 Py_DECREF(repunicode);
9129 i = newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009130 }
9131 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009132 Py_XDECREF(exc);
9133 Py_XDECREF(errorHandler);
Victor Stinner1194ea02014-04-04 19:37:40 +02009134 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009135
Benjamin Peterson29060642009-01-31 22:14:21 +00009136 onError:
Victor Stinner1194ea02014-04-04 19:37:40 +02009137 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009138 Py_XDECREF(exc);
9139 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009140 return NULL;
9141}
9142
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009143/* Deprecated. Use PyUnicode_Translate instead. */
9144PyObject *
9145PyUnicode_TranslateCharmap(const Py_UNICODE *p,
9146 Py_ssize_t size,
9147 PyObject *mapping,
9148 const char *errors)
9149{
Christian Heimes5f520f42012-09-11 14:03:25 +02009150 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02009151 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009152 if (!unicode)
9153 return NULL;
Christian Heimes5f520f42012-09-11 14:03:25 +02009154 result = _PyUnicode_TranslateCharmap(unicode, mapping, errors);
9155 Py_DECREF(unicode);
9156 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009157}
9158
Alexander Belopolsky40018472011-02-26 01:02:56 +00009159PyObject *
9160PyUnicode_Translate(PyObject *str,
9161 PyObject *mapping,
9162 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009163{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009164 if (ensure_unicode(str) < 0)
Christian Heimes5f520f42012-09-11 14:03:25 +02009165 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009166 return _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009167}
Tim Petersced69f82003-09-16 20:30:58 +00009168
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009169PyObject *
9170_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
9171{
9172 if (!PyUnicode_Check(unicode)) {
9173 PyErr_BadInternalCall();
9174 return NULL;
9175 }
9176 if (PyUnicode_READY(unicode) == -1)
9177 return NULL;
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009178 if (PyUnicode_IS_ASCII(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009179 /* If the string is already ASCII, just return the same string */
9180 Py_INCREF(unicode);
9181 return unicode;
9182 }
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009183
9184 Py_ssize_t len = PyUnicode_GET_LENGTH(unicode);
9185 PyObject *result = PyUnicode_New(len, 127);
9186 if (result == NULL) {
9187 return NULL;
9188 }
9189
9190 Py_UCS1 *out = PyUnicode_1BYTE_DATA(result);
9191 int kind = PyUnicode_KIND(unicode);
9192 const void *data = PyUnicode_DATA(unicode);
9193 Py_ssize_t i;
9194 for (i = 0; i < len; ++i) {
9195 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9196 if (ch < 127) {
9197 out[i] = ch;
9198 }
9199 else if (Py_UNICODE_ISSPACE(ch)) {
9200 out[i] = ' ';
9201 }
9202 else {
9203 int decimal = Py_UNICODE_TODECIMAL(ch);
9204 if (decimal < 0) {
9205 out[i] = '?';
INADA Naoki16dfca42018-07-14 12:06:43 +09009206 out[i+1] = '\0';
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009207 _PyUnicode_LENGTH(result) = i + 1;
9208 break;
9209 }
9210 out[i] = '0' + decimal;
9211 }
9212 }
9213
INADA Naoki16dfca42018-07-14 12:06:43 +09009214 assert(_PyUnicode_CheckConsistency(result, 1));
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009215 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009216}
9217
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009218PyObject *
9219PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
9220 Py_ssize_t length)
9221{
Victor Stinnerf0124502011-11-21 23:12:56 +01009222 PyObject *decimal;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009223 Py_ssize_t i;
Victor Stinnerf0124502011-11-21 23:12:56 +01009224 Py_UCS4 maxchar;
9225 enum PyUnicode_Kind kind;
9226 void *data;
9227
Victor Stinner99d7ad02012-02-22 13:37:39 +01009228 maxchar = 127;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009229 for (i = 0; i < length; i++) {
Victor Stinner12174a52014-08-15 23:17:38 +02009230 Py_UCS4 ch = s[i];
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009231 if (ch > 127) {
9232 int decimal = Py_UNICODE_TODECIMAL(ch);
9233 if (decimal >= 0)
Victor Stinnerf0124502011-11-21 23:12:56 +01009234 ch = '0' + decimal;
Benjamin Peterson7e303732013-06-10 09:19:46 -07009235 maxchar = Py_MAX(maxchar, ch);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009236 }
9237 }
Victor Stinnerf0124502011-11-21 23:12:56 +01009238
9239 /* Copy to a new string */
9240 decimal = PyUnicode_New(length, maxchar);
9241 if (decimal == NULL)
9242 return decimal;
9243 kind = PyUnicode_KIND(decimal);
9244 data = PyUnicode_DATA(decimal);
9245 /* Iterate over code points */
9246 for (i = 0; i < length; i++) {
Victor Stinner12174a52014-08-15 23:17:38 +02009247 Py_UCS4 ch = s[i];
Victor Stinnerf0124502011-11-21 23:12:56 +01009248 if (ch > 127) {
9249 int decimal = Py_UNICODE_TODECIMAL(ch);
9250 if (decimal >= 0)
9251 ch = '0' + decimal;
9252 }
9253 PyUnicode_WRITE(kind, data, i, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009254 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01009255 return unicode_result(decimal);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009256}
Guido van Rossum9e896b32000-04-05 20:11:21 +00009257/* --- Decimal Encoder ---------------------------------------------------- */
9258
Alexander Belopolsky40018472011-02-26 01:02:56 +00009259int
9260PyUnicode_EncodeDecimal(Py_UNICODE *s,
9261 Py_ssize_t length,
9262 char *output,
9263 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00009264{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01009265 PyObject *unicode;
Victor Stinner6345be92011-11-25 20:09:01 +01009266 Py_ssize_t i;
Victor Stinner42bf7752011-11-21 22:52:58 +01009267 enum PyUnicode_Kind kind;
9268 void *data;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009269
9270 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009271 PyErr_BadArgument();
9272 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009273 }
9274
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02009275 unicode = PyUnicode_FromWideChar(s, length);
Victor Stinner42bf7752011-11-21 22:52:58 +01009276 if (unicode == NULL)
9277 return -1;
9278
Victor Stinner42bf7752011-11-21 22:52:58 +01009279 kind = PyUnicode_KIND(unicode);
9280 data = PyUnicode_DATA(unicode);
9281
Victor Stinnerb84d7232011-11-22 01:50:07 +01009282 for (i=0; i < length; ) {
Victor Stinner6345be92011-11-25 20:09:01 +01009283 PyObject *exc;
9284 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00009285 int decimal;
Victor Stinner6345be92011-11-25 20:09:01 +01009286 Py_ssize_t startpos;
9287
9288 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +00009289
Benjamin Peterson29060642009-01-31 22:14:21 +00009290 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009291 *output++ = ' ';
Victor Stinnerb84d7232011-11-22 01:50:07 +01009292 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009293 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009294 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009295 decimal = Py_UNICODE_TODECIMAL(ch);
9296 if (decimal >= 0) {
9297 *output++ = '0' + decimal;
Victor Stinnerb84d7232011-11-22 01:50:07 +01009298 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009299 continue;
9300 }
9301 if (0 < ch && ch < 256) {
9302 *output++ = (char)ch;
Victor Stinnerb84d7232011-11-22 01:50:07 +01009303 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009304 continue;
9305 }
Victor Stinner6345be92011-11-25 20:09:01 +01009306
Victor Stinner42bf7752011-11-21 22:52:58 +01009307 startpos = i;
Victor Stinner6345be92011-11-25 20:09:01 +01009308 exc = NULL;
9309 raise_encode_exception(&exc, "decimal", unicode,
9310 startpos, startpos+1,
9311 "invalid decimal Unicode string");
9312 Py_XDECREF(exc);
9313 Py_DECREF(unicode);
9314 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009315 }
9316 /* 0-terminate the output string */
9317 *output++ = '\0';
Victor Stinner42bf7752011-11-21 22:52:58 +01009318 Py_DECREF(unicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00009319 return 0;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009320}
9321
Guido van Rossumd57fd912000-03-10 22:53:23 +00009322/* --- Helpers ------------------------------------------------------------ */
9323
/* Helper macro to fix up start/end slice values against a length:
   `end` is clipped to `len`; a negative `end` or `start` is taken relative
   to `len` and clipped to 0 if it is still negative.  `start` and `end`
   must be modifiable lvalues; every argument may be evaluated more than
   once.  The do/while wrapper makes the expansion a single statement, so
   the macro is safe inside unbraced if/else. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len)) {                    \
            (end) = (len);                      \
        }                                       \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0) {                    \
                (end) = 0;                      \
            }                                   \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0) {                  \
                (start) = 0;                    \
            }                                   \
        }                                       \
    } while (0)
9338
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009339static Py_ssize_t
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009340any_find_slice(PyObject* s1, PyObject* s2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009341 Py_ssize_t start,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009342 Py_ssize_t end,
9343 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009344{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009345 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009346 void *buf1, *buf2;
9347 Py_ssize_t len1, len2, result;
9348
9349 kind1 = PyUnicode_KIND(s1);
9350 kind2 = PyUnicode_KIND(s2);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009351 if (kind1 < kind2)
9352 return -1;
9353
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009354 len1 = PyUnicode_GET_LENGTH(s1);
9355 len2 = PyUnicode_GET_LENGTH(s2);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009356 ADJUST_INDICES(start, end, len1);
9357 if (end - start < len2)
9358 return -1;
9359
9360 buf1 = PyUnicode_DATA(s1);
9361 buf2 = PyUnicode_DATA(s2);
9362 if (len2 == 1) {
9363 Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
9364 result = findchar((const char *)buf1 + kind1*start,
9365 kind1, end - start, ch, direction);
9366 if (result == -1)
9367 return -1;
9368 else
9369 return start + result;
9370 }
9371
9372 if (kind2 != kind1) {
9373 buf2 = _PyUnicode_AsKind(s2, kind1);
9374 if (!buf2)
9375 return -2;
9376 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009377
Victor Stinner794d5672011-10-10 03:21:36 +02009378 if (direction > 0) {
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009379 switch (kind1) {
Victor Stinner794d5672011-10-10 03:21:36 +02009380 case PyUnicode_1BYTE_KIND:
9381 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9382 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
9383 else
9384 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
9385 break;
9386 case PyUnicode_2BYTE_KIND:
9387 result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
9388 break;
9389 case PyUnicode_4BYTE_KIND:
9390 result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
9391 break;
9392 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07009393 Py_UNREACHABLE();
Victor Stinner794d5672011-10-10 03:21:36 +02009394 }
9395 }
9396 else {
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009397 switch (kind1) {
Victor Stinner794d5672011-10-10 03:21:36 +02009398 case PyUnicode_1BYTE_KIND:
9399 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9400 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
9401 else
9402 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9403 break;
9404 case PyUnicode_2BYTE_KIND:
9405 result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9406 break;
9407 case PyUnicode_4BYTE_KIND:
9408 result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9409 break;
9410 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07009411 Py_UNREACHABLE();
Victor Stinner794d5672011-10-10 03:21:36 +02009412 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009413 }
9414
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009415 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009416 PyMem_Free(buf2);
9417
9418 return result;
9419}
9420
Victor Stinner59423e32018-11-26 13:40:01 +01009421/* _PyUnicode_InsertThousandsGrouping() helper functions */
9422#include "stringlib/localeutil.h"
9423
9424/**
9425 * InsertThousandsGrouping:
9426 * @writer: Unicode writer.
9427 * @n_buffer: Number of characters in @buffer.
9428 * @digits: Digits we're reading from. If count is non-NULL, this is unused.
9429 * @d_pos: Start of digits string.
9430 * @n_digits: The number of digits in the string, in which we want
9431 * to put the grouping chars.
9432 * @min_width: The minimum width of the digits in the output string.
9433 * Output will be zero-padded on the left to fill.
9434 * @grouping: see definition in localeconv().
9435 * @thousands_sep: see definition in localeconv().
9436 *
9437 * There are 2 modes: counting and filling. If @writer is NULL,
9438 * we are in counting mode, else filling mode.
9439 * If counting, the required buffer size is returned.
9440 * If filling, we know the buffer will be large enough, so we don't
9441 * need to pass in the buffer size.
9442 * Inserts thousand grouping characters (as defined by grouping and
9443 * thousands_sep) into @writer.
9444 *
9445 * Return value: -1 on error, number of characters otherwise.
9446 **/
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009447Py_ssize_t
Victor Stinner41a863c2012-02-24 00:37:51 +01009448_PyUnicode_InsertThousandsGrouping(
Victor Stinner59423e32018-11-26 13:40:01 +01009449 _PyUnicodeWriter *writer,
Victor Stinner41a863c2012-02-24 00:37:51 +01009450 Py_ssize_t n_buffer,
Victor Stinner59423e32018-11-26 13:40:01 +01009451 PyObject *digits,
9452 Py_ssize_t d_pos,
9453 Py_ssize_t n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009454 Py_ssize_t min_width,
Victor Stinner59423e32018-11-26 13:40:01 +01009455 const char *grouping,
9456 PyObject *thousands_sep,
Victor Stinner41a863c2012-02-24 00:37:51 +01009457 Py_UCS4 *maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009458{
Xtreak3f7983a2019-01-07 20:39:14 +05309459 min_width = Py_MAX(0, min_width);
Victor Stinner59423e32018-11-26 13:40:01 +01009460 if (writer) {
9461 assert(digits != NULL);
9462 assert(maxchar == NULL);
Victor Stinner41a863c2012-02-24 00:37:51 +01009463 }
9464 else {
Victor Stinner59423e32018-11-26 13:40:01 +01009465 assert(digits == NULL);
9466 assert(maxchar != NULL);
Victor Stinner41a863c2012-02-24 00:37:51 +01009467 }
Victor Stinner59423e32018-11-26 13:40:01 +01009468 assert(0 <= d_pos);
9469 assert(0 <= n_digits);
Victor Stinner59423e32018-11-26 13:40:01 +01009470 assert(grouping != NULL);
9471
9472 if (digits != NULL) {
9473 if (PyUnicode_READY(digits) == -1) {
9474 return -1;
Victor Stinner90f50d42012-02-24 01:44:47 +01009475 }
Victor Stinner59423e32018-11-26 13:40:01 +01009476 }
9477 if (PyUnicode_READY(thousands_sep) == -1) {
9478 return -1;
Victor Stinner41a863c2012-02-24 00:37:51 +01009479 }
9480
Victor Stinner59423e32018-11-26 13:40:01 +01009481 Py_ssize_t count = 0;
9482 Py_ssize_t n_zeros;
9483 int loop_broken = 0;
9484 int use_separator = 0; /* First time through, don't append the
9485 separator. They only go between
9486 groups. */
9487 Py_ssize_t buffer_pos;
9488 Py_ssize_t digits_pos;
9489 Py_ssize_t len;
9490 Py_ssize_t n_chars;
9491 Py_ssize_t remaining = n_digits; /* Number of chars remaining to
9492 be looked at */
9493 /* A generator that returns all of the grouping widths, until it
9494 returns 0. */
9495 GroupGenerator groupgen;
9496 GroupGenerator_init(&groupgen, grouping);
9497 const Py_ssize_t thousands_sep_len = PyUnicode_GET_LENGTH(thousands_sep);
9498
9499 /* if digits are not grouped, thousands separator
9500 should be an empty string */
9501 assert(!(grouping[0] == CHAR_MAX && thousands_sep_len != 0));
9502
9503 digits_pos = d_pos + n_digits;
9504 if (writer) {
9505 buffer_pos = writer->pos + n_buffer;
9506 assert(buffer_pos <= PyUnicode_GET_LENGTH(writer->buffer));
9507 assert(digits_pos <= PyUnicode_GET_LENGTH(digits));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009508 }
Victor Stinner59423e32018-11-26 13:40:01 +01009509 else {
9510 buffer_pos = n_buffer;
Victor Stinner90f50d42012-02-24 01:44:47 +01009511 }
Victor Stinner59423e32018-11-26 13:40:01 +01009512
9513 if (!writer) {
Victor Stinner41a863c2012-02-24 00:37:51 +01009514 *maxchar = 127;
Victor Stinner41a863c2012-02-24 00:37:51 +01009515 }
Victor Stinner59423e32018-11-26 13:40:01 +01009516
9517 while ((len = GroupGenerator_next(&groupgen)) > 0) {
9518 len = Py_MIN(len, Py_MAX(Py_MAX(remaining, min_width), 1));
9519 n_zeros = Py_MAX(0, len - remaining);
9520 n_chars = Py_MAX(0, Py_MIN(remaining, len));
9521
9522 /* Use n_zero zero's and n_chars chars */
9523
9524 /* Count only, don't do anything. */
9525 count += (use_separator ? thousands_sep_len : 0) + n_zeros + n_chars;
9526
9527 /* Copy into the writer. */
9528 InsertThousandsGrouping_fill(writer, &buffer_pos,
9529 digits, &digits_pos,
9530 n_chars, n_zeros,
9531 use_separator ? thousands_sep : NULL,
9532 thousands_sep_len, maxchar);
9533
9534 /* Use a separator next time. */
9535 use_separator = 1;
9536
9537 remaining -= n_chars;
9538 min_width -= len;
9539
9540 if (remaining <= 0 && min_width <= 0) {
9541 loop_broken = 1;
9542 break;
9543 }
9544 min_width -= thousands_sep_len;
9545 }
9546 if (!loop_broken) {
9547 /* We left the loop without using a break statement. */
9548
9549 len = Py_MAX(Py_MAX(remaining, min_width), 1);
9550 n_zeros = Py_MAX(0, len - remaining);
9551 n_chars = Py_MAX(0, Py_MIN(remaining, len));
9552
9553 /* Use n_zero zero's and n_chars chars */
9554 count += (use_separator ? thousands_sep_len : 0) + n_zeros + n_chars;
9555
9556 /* Copy into the writer. */
9557 InsertThousandsGrouping_fill(writer, &buffer_pos,
9558 digits, &digits_pos,
9559 n_chars, n_zeros,
9560 use_separator ? thousands_sep : NULL,
9561 thousands_sep_len, maxchar);
9562 }
9563 return count;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009564}
9565
9566
Alexander Belopolsky40018472011-02-26 01:02:56 +00009567Py_ssize_t
9568PyUnicode_Count(PyObject *str,
9569 PyObject *substr,
9570 Py_ssize_t start,
9571 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009572{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009573 Py_ssize_t result;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009574 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009575 void *buf1 = NULL, *buf2 = NULL;
9576 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00009577
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009578 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009579 return -1;
Tim Petersced69f82003-09-16 20:30:58 +00009580
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009581 kind1 = PyUnicode_KIND(str);
9582 kind2 = PyUnicode_KIND(substr);
9583 if (kind1 < kind2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009584 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009585
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009586 len1 = PyUnicode_GET_LENGTH(str);
9587 len2 = PyUnicode_GET_LENGTH(substr);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009588 ADJUST_INDICES(start, end, len1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009589 if (end - start < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009590 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009591
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009592 buf1 = PyUnicode_DATA(str);
9593 buf2 = PyUnicode_DATA(substr);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009594 if (kind2 != kind1) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009595 buf2 = _PyUnicode_AsKind(substr, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009596 if (!buf2)
9597 goto onError;
9598 }
9599
9600 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009601 case PyUnicode_1BYTE_KIND:
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009602 if (PyUnicode_IS_ASCII(str) && PyUnicode_IS_ASCII(substr))
Victor Stinnerc3cec782011-10-05 21:24:08 +02009603 result = asciilib_count(
9604 ((Py_UCS1*)buf1) + start, end - start,
9605 buf2, len2, PY_SSIZE_T_MAX
9606 );
9607 else
9608 result = ucs1lib_count(
9609 ((Py_UCS1*)buf1) + start, end - start,
9610 buf2, len2, PY_SSIZE_T_MAX
9611 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009612 break;
9613 case PyUnicode_2BYTE_KIND:
9614 result = ucs2lib_count(
9615 ((Py_UCS2*)buf1) + start, end - start,
9616 buf2, len2, PY_SSIZE_T_MAX
9617 );
9618 break;
9619 case PyUnicode_4BYTE_KIND:
9620 result = ucs4lib_count(
9621 ((Py_UCS4*)buf1) + start, end - start,
9622 buf2, len2, PY_SSIZE_T_MAX
9623 );
9624 break;
9625 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07009626 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009627 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009628
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009629 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009630 PyMem_Free(buf2);
9631
Guido van Rossumd57fd912000-03-10 22:53:23 +00009632 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009633 onError:
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009634 if (kind2 != kind1 && buf2)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009635 PyMem_Free(buf2);
9636 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009637}
9638
Alexander Belopolsky40018472011-02-26 01:02:56 +00009639Py_ssize_t
9640PyUnicode_Find(PyObject *str,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009641 PyObject *substr,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009642 Py_ssize_t start,
9643 Py_ssize_t end,
9644 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009645{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009646 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009647 return -2;
Tim Petersced69f82003-09-16 20:30:58 +00009648
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009649 return any_find_slice(str, substr, start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009650}
9651
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009652Py_ssize_t
9653PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
9654 Py_ssize_t start, Py_ssize_t end,
9655 int direction)
9656{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009657 int kind;
Xiang Zhangb2110682016-12-20 22:52:33 +08009658 Py_ssize_t len, result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009659 if (PyUnicode_READY(str) == -1)
9660 return -2;
Xiang Zhangb2110682016-12-20 22:52:33 +08009661 len = PyUnicode_GET_LENGTH(str);
9662 ADJUST_INDICES(start, end, len);
9663 if (end - start < 1)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009664 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009665 kind = PyUnicode_KIND(str);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009666 result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
9667 kind, end-start, ch, direction);
9668 if (result == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009669 return -1;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009670 else
9671 return start + result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009672}
9673
Alexander Belopolsky40018472011-02-26 01:02:56 +00009674static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009675tailmatch(PyObject *self,
9676 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009677 Py_ssize_t start,
9678 Py_ssize_t end,
9679 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009680{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009681 int kind_self;
9682 int kind_sub;
9683 void *data_self;
9684 void *data_sub;
9685 Py_ssize_t offset;
9686 Py_ssize_t i;
9687 Py_ssize_t end_sub;
9688
9689 if (PyUnicode_READY(self) == -1 ||
9690 PyUnicode_READY(substring) == -1)
Victor Stinner18aa4472013-01-03 03:18:09 +01009691 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009692
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009693 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
9694 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009695 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00009696 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009697
Serhiy Storchakad4ea03c2015-05-31 09:15:51 +03009698 if (PyUnicode_GET_LENGTH(substring) == 0)
9699 return 1;
9700
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009701 kind_self = PyUnicode_KIND(self);
9702 data_self = PyUnicode_DATA(self);
9703 kind_sub = PyUnicode_KIND(substring);
9704 data_sub = PyUnicode_DATA(substring);
9705 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
9706
9707 if (direction > 0)
9708 offset = end;
9709 else
9710 offset = start;
9711
9712 if (PyUnicode_READ(kind_self, data_self, offset) ==
9713 PyUnicode_READ(kind_sub, data_sub, 0) &&
9714 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
9715 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
9716 /* If both are of the same kind, memcmp is sufficient */
9717 if (kind_self == kind_sub) {
9718 return ! memcmp((char *)data_self +
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009719 (offset * PyUnicode_KIND(substring)),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009720 data_sub,
9721 PyUnicode_GET_LENGTH(substring) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009722 PyUnicode_KIND(substring));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009723 }
Martin Pantere26da7c2016-06-02 10:07:09 +00009724 /* otherwise we have to compare each character by first accessing it */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009725 else {
9726 /* We do not need to compare 0 and len(substring)-1 because
9727 the if statement above ensured already that they are equal
9728 when we end up here. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009729 for (i = 1; i < end_sub; ++i) {
9730 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
9731 PyUnicode_READ(kind_sub, data_sub, i))
9732 return 0;
9733 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009734 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009735 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009736 }
9737
9738 return 0;
9739}
9740
Alexander Belopolsky40018472011-02-26 01:02:56 +00009741Py_ssize_t
9742PyUnicode_Tailmatch(PyObject *str,
9743 PyObject *substr,
9744 Py_ssize_t start,
9745 Py_ssize_t end,
9746 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009747{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009748 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009749 return -1;
Tim Petersced69f82003-09-16 20:30:58 +00009750
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009751 return tailmatch(str, substr, start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009752}
9753
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009754static PyObject *
9755ascii_upper_or_lower(PyObject *self, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009756{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009757 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9758 char *resdata, *data = PyUnicode_DATA(self);
9759 PyObject *res;
Tim Petersced69f82003-09-16 20:30:58 +00009760
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009761 res = PyUnicode_New(len, 127);
9762 if (res == NULL)
9763 return NULL;
9764 resdata = PyUnicode_DATA(res);
9765 if (lower)
9766 _Py_bytes_lower(resdata, data, len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009767 else
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009768 _Py_bytes_upper(resdata, data, len);
9769 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009770}
9771
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009772static Py_UCS4
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009773handle_capital_sigma(int kind, void *data, Py_ssize_t length, Py_ssize_t i)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009774{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009775 Py_ssize_t j;
9776 int final_sigma;
Victor Stinner0c39b1b2015-03-18 15:02:06 +01009777 Py_UCS4 c = 0; /* initialize to prevent gcc warning */
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009778 /* U+03A3 is in the Final_Sigma context when, it is found like this:
Tim Petersced69f82003-09-16 20:30:58 +00009779
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009780 \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased})
9781
9782 where ! is a negation and \p{xxx} is a character with property xxx.
9783 */
9784 for (j = i - 1; j >= 0; j--) {
9785 c = PyUnicode_READ(kind, data, j);
9786 if (!_PyUnicode_IsCaseIgnorable(c))
9787 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009788 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009789 final_sigma = j >= 0 && _PyUnicode_IsCased(c);
9790 if (final_sigma) {
9791 for (j = i + 1; j < length; j++) {
9792 c = PyUnicode_READ(kind, data, j);
9793 if (!_PyUnicode_IsCaseIgnorable(c))
9794 break;
9795 }
9796 final_sigma = j == length || !_PyUnicode_IsCased(c);
9797 }
9798 return (final_sigma) ? 0x3C2 : 0x3C3;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009799}
9800
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009801static int
9802lower_ucs4(int kind, void *data, Py_ssize_t length, Py_ssize_t i,
9803 Py_UCS4 c, Py_UCS4 *mapped)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009804{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009805 /* Obscure special case. */
9806 if (c == 0x3A3) {
9807 mapped[0] = handle_capital_sigma(kind, data, length, i);
9808 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009809 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009810 return _PyUnicode_ToLowerFull(c, mapped);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009811}
9812
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009813static Py_ssize_t
9814do_capitalize(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009815{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009816 Py_ssize_t i, k = 0;
9817 int n_res, j;
9818 Py_UCS4 c, mapped[3];
Tim Petersced69f82003-09-16 20:30:58 +00009819
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009820 c = PyUnicode_READ(kind, data, 0);
Kingsley Mb015fc82019-04-12 16:35:39 +01009821 n_res = _PyUnicode_ToTitleFull(c, mapped);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009822 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009823 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009824 res[k++] = mapped[j];
Guido van Rossumd57fd912000-03-10 22:53:23 +00009825 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009826 for (i = 1; i < length; i++) {
9827 c = PyUnicode_READ(kind, data, i);
9828 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9829 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009830 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009831 res[k++] = mapped[j];
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009832 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009833 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009834 return k;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009835}
9836
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009837static Py_ssize_t
9838do_swapcase(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) {
9839 Py_ssize_t i, k = 0;
9840
9841 for (i = 0; i < length; i++) {
9842 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9843 int n_res, j;
9844 if (Py_UNICODE_ISUPPER(c)) {
9845 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9846 }
9847 else if (Py_UNICODE_ISLOWER(c)) {
9848 n_res = _PyUnicode_ToUpperFull(c, mapped);
9849 }
9850 else {
9851 n_res = 1;
9852 mapped[0] = c;
9853 }
9854 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009855 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009856 res[k++] = mapped[j];
9857 }
9858 }
9859 return k;
9860}
9861
9862static Py_ssize_t
9863do_upper_or_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res,
9864 Py_UCS4 *maxchar, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009865{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009866 Py_ssize_t i, k = 0;
9867
9868 for (i = 0; i < length; i++) {
9869 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9870 int n_res, j;
9871 if (lower)
9872 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9873 else
9874 n_res = _PyUnicode_ToUpperFull(c, mapped);
9875 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009876 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009877 res[k++] = mapped[j];
9878 }
9879 }
9880 return k;
9881}
9882
9883static Py_ssize_t
9884do_upper(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9885{
9886 return do_upper_or_lower(kind, data, length, res, maxchar, 0);
9887}
9888
9889static Py_ssize_t
9890do_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9891{
9892 return do_upper_or_lower(kind, data, length, res, maxchar, 1);
9893}
9894
Benjamin Petersone51757f2012-01-12 21:10:29 -05009895static Py_ssize_t
Benjamin Petersond5890c82012-01-14 13:23:30 -05009896do_casefold(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9897{
9898 Py_ssize_t i, k = 0;
9899
9900 for (i = 0; i < length; i++) {
9901 Py_UCS4 c = PyUnicode_READ(kind, data, i);
9902 Py_UCS4 mapped[3];
9903 int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
9904 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009905 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersond5890c82012-01-14 13:23:30 -05009906 res[k++] = mapped[j];
9907 }
9908 }
9909 return k;
9910}
9911
9912static Py_ssize_t
Benjamin Petersone51757f2012-01-12 21:10:29 -05009913do_title(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9914{
9915 Py_ssize_t i, k = 0;
9916 int previous_is_cased;
9917
9918 previous_is_cased = 0;
9919 for (i = 0; i < length; i++) {
9920 const Py_UCS4 c = PyUnicode_READ(kind, data, i);
9921 Py_UCS4 mapped[3];
9922 int n_res, j;
9923
9924 if (previous_is_cased)
9925 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9926 else
9927 n_res = _PyUnicode_ToTitleFull(c, mapped);
9928
9929 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009930 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersone51757f2012-01-12 21:10:29 -05009931 res[k++] = mapped[j];
9932 }
9933
9934 previous_is_cased = _PyUnicode_IsCased(c);
9935 }
9936 return k;
9937}
9938
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009939static PyObject *
9940case_operation(PyObject *self,
9941 Py_ssize_t (*perform)(int, void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *))
9942{
9943 PyObject *res = NULL;
9944 Py_ssize_t length, newlength = 0;
9945 int kind, outkind;
9946 void *data, *outdata;
9947 Py_UCS4 maxchar = 0, *tmp, *tmpend;
9948
Benjamin Petersoneea48462012-01-16 14:28:50 -05009949 assert(PyUnicode_IS_READY(self));
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009950
9951 kind = PyUnicode_KIND(self);
9952 data = PyUnicode_DATA(self);
9953 length = PyUnicode_GET_LENGTH(self);
Antoine Pitrou4e334242014-10-15 23:14:53 +02009954 if ((size_t) length > PY_SSIZE_T_MAX / (3 * sizeof(Py_UCS4))) {
Benjamin Petersone1bd38c2014-10-15 11:47:36 -04009955 PyErr_SetString(PyExc_OverflowError, "string is too long");
9956 return NULL;
9957 }
Benjamin Peterson1e211ff2014-10-15 12:17:21 -04009958 tmp = PyMem_MALLOC(sizeof(Py_UCS4) * 3 * length);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009959 if (tmp == NULL)
9960 return PyErr_NoMemory();
9961 newlength = perform(kind, data, length, tmp, &maxchar);
9962 res = PyUnicode_New(newlength, maxchar);
9963 if (res == NULL)
9964 goto leave;
9965 tmpend = tmp + newlength;
9966 outdata = PyUnicode_DATA(res);
9967 outkind = PyUnicode_KIND(res);
9968 switch (outkind) {
9969 case PyUnicode_1BYTE_KIND:
9970 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata);
9971 break;
9972 case PyUnicode_2BYTE_KIND:
9973 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata);
9974 break;
9975 case PyUnicode_4BYTE_KIND:
9976 memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength);
9977 break;
9978 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07009979 Py_UNREACHABLE();
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009980 }
9981 leave:
9982 PyMem_FREE(tmp);
9983 return res;
9984}
9985
Tim Peters8ce9f162004-08-27 01:49:32 +00009986PyObject *
9987PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009988{
Serhiy Storchakaea525a22016-09-06 22:07:53 +03009989 PyObject *res;
9990 PyObject *fseq;
9991 Py_ssize_t seqlen;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009992 PyObject **items;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009993
Benjamin Peterson9743b2c2014-02-15 13:02:52 -05009994 fseq = PySequence_Fast(seq, "can only join an iterable");
Tim Peters05eba1f2004-08-27 21:32:02 +00009995 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009996 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00009997 }
9998
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009999 /* NOTE: the following code can't call back into Python code,
10000 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +000010001 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010002
Serhiy Storchakaea525a22016-09-06 22:07:53 +030010003 items = PySequence_Fast_ITEMS(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +000010004 seqlen = PySequence_Fast_GET_SIZE(fseq);
Serhiy Storchakaea525a22016-09-06 22:07:53 +030010005 res = _PyUnicode_JoinArray(separator, items, seqlen);
10006 Py_DECREF(fseq);
10007 return res;
10008}
10009
10010PyObject *
Serhiy Storchakaa5552f02017-12-15 13:11:11 +020010011_PyUnicode_JoinArray(PyObject *separator, PyObject *const *items, Py_ssize_t seqlen)
Serhiy Storchakaea525a22016-09-06 22:07:53 +030010012{
10013 PyObject *res = NULL; /* the result */
10014 PyObject *sep = NULL;
10015 Py_ssize_t seplen;
10016 PyObject *item;
10017 Py_ssize_t sz, i, res_offset;
10018 Py_UCS4 maxchar;
10019 Py_UCS4 item_maxchar;
10020 int use_memcpy;
10021 unsigned char *res_data = NULL, *sep_data = NULL;
10022 PyObject *last_obj;
10023 unsigned int kind = 0;
10024
Tim Peters05eba1f2004-08-27 21:32:02 +000010025 /* If empty sequence, return u"". */
10026 if (seqlen == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020010027 _Py_RETURN_UNICODE_EMPTY();
Tim Peters05eba1f2004-08-27 21:32:02 +000010028 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020010029
Tim Peters05eba1f2004-08-27 21:32:02 +000010030 /* If singleton sequence with an exact Unicode, return that. */
Victor Stinnerdd077322011-10-07 17:02:31 +020010031 last_obj = NULL;
Victor Stinneracf47b82011-10-06 12:32:37 +020010032 if (seqlen == 1) {
10033 if (PyUnicode_CheckExact(items[0])) {
10034 res = items[0];
10035 Py_INCREF(res);
Victor Stinneracf47b82011-10-06 12:32:37 +020010036 return res;
10037 }
Victor Stinnerdd077322011-10-07 17:02:31 +020010038 seplen = 0;
Victor Stinnerc6f0df72011-10-06 15:58:54 +020010039 maxchar = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +000010040 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010041 else {
Victor Stinneracf47b82011-10-06 12:32:37 +020010042 /* Set up sep and seplen */
10043 if (separator == NULL) {
10044 /* fall back to a blank space separator */
10045 sep = PyUnicode_FromOrdinal(' ');
10046 if (!sep)
10047 goto onError;
Victor Stinnerdd077322011-10-07 17:02:31 +020010048 seplen = 1;
Victor Stinneracf47b82011-10-06 12:32:37 +020010049 maxchar = 32;
Tim Peters05eba1f2004-08-27 21:32:02 +000010050 }
Victor Stinneracf47b82011-10-06 12:32:37 +020010051 else {
10052 if (!PyUnicode_Check(separator)) {
10053 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020010054 "separator: expected str instance,"
10055 " %.80s found",
10056 Py_TYPE(separator)->tp_name);
Victor Stinneracf47b82011-10-06 12:32:37 +020010057 goto onError;
10058 }
10059 if (PyUnicode_READY(separator))
10060 goto onError;
10061 sep = separator;
10062 seplen = PyUnicode_GET_LENGTH(separator);
10063 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
10064 /* inc refcount to keep this code path symmetric with the
10065 above case of a blank separator */
10066 Py_INCREF(sep);
10067 }
Victor Stinnerdd077322011-10-07 17:02:31 +020010068 last_obj = sep;
Tim Peters05eba1f2004-08-27 21:32:02 +000010069 }
10070
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010071 /* There are at least two things to join, or else we have a subclass
10072 * of str in the sequence.
10073 * Do a pre-pass to figure out the total amount of space we'll
10074 * need (sz), and see whether all argument are strings.
10075 */
10076 sz = 0;
Victor Stinnerdd077322011-10-07 17:02:31 +020010077#ifdef Py_DEBUG
10078 use_memcpy = 0;
10079#else
10080 use_memcpy = 1;
10081#endif
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010082 for (i = 0; i < seqlen; i++) {
Xiang Zhangb0541f42017-01-10 10:52:00 +080010083 size_t add_sz;
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010084 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +000010085 if (!PyUnicode_Check(item)) {
10086 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020010087 "sequence item %zd: expected str instance,"
10088 " %.80s found",
10089 i, Py_TYPE(item)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000010090 goto onError;
10091 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010092 if (PyUnicode_READY(item) == -1)
10093 goto onError;
Xiang Zhangb0541f42017-01-10 10:52:00 +080010094 add_sz = PyUnicode_GET_LENGTH(item);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010095 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010096 maxchar = Py_MAX(maxchar, item_maxchar);
Xiang Zhangb0541f42017-01-10 10:52:00 +080010097 if (i != 0) {
10098 add_sz += seplen;
10099 }
10100 if (add_sz > (size_t)(PY_SSIZE_T_MAX - sz)) {
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010101 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010102 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010103 goto onError;
10104 }
Xiang Zhangb0541f42017-01-10 10:52:00 +080010105 sz += add_sz;
Victor Stinnerdd077322011-10-07 17:02:31 +020010106 if (use_memcpy && last_obj != NULL) {
10107 if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
10108 use_memcpy = 0;
10109 }
10110 last_obj = item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010111 }
Tim Petersced69f82003-09-16 20:30:58 +000010112
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010113 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010114 if (res == NULL)
10115 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +000010116
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010117 /* Catenate everything. */
Victor Stinnerdd077322011-10-07 17:02:31 +020010118#ifdef Py_DEBUG
10119 use_memcpy = 0;
10120#else
10121 if (use_memcpy) {
10122 res_data = PyUnicode_1BYTE_DATA(res);
10123 kind = PyUnicode_KIND(res);
10124 if (seplen != 0)
10125 sep_data = PyUnicode_1BYTE_DATA(sep);
10126 }
10127#endif
Victor Stinner4560f9c2013-04-14 18:56:46 +020010128 if (use_memcpy) {
10129 for (i = 0; i < seqlen; ++i) {
10130 Py_ssize_t itemlen;
10131 item = items[i];
10132
10133 /* Copy item, and maybe the separator. */
10134 if (i && seplen != 0) {
Christian Heimesf051e432016-09-13 20:22:02 +020010135 memcpy(res_data,
Victor Stinnerdd077322011-10-07 17:02:31 +020010136 sep_data,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010137 kind * seplen);
10138 res_data += kind * seplen;
Victor Stinnerdd077322011-10-07 17:02:31 +020010139 }
Victor Stinner4560f9c2013-04-14 18:56:46 +020010140
10141 itemlen = PyUnicode_GET_LENGTH(item);
10142 if (itemlen != 0) {
Christian Heimesf051e432016-09-13 20:22:02 +020010143 memcpy(res_data,
Victor Stinnerdd077322011-10-07 17:02:31 +020010144 PyUnicode_DATA(item),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010145 kind * itemlen);
10146 res_data += kind * itemlen;
Victor Stinnerdd077322011-10-07 17:02:31 +020010147 }
Victor Stinner4560f9c2013-04-14 18:56:46 +020010148 }
10149 assert(res_data == PyUnicode_1BYTE_DATA(res)
10150 + kind * PyUnicode_GET_LENGTH(res));
10151 }
10152 else {
10153 for (i = 0, res_offset = 0; i < seqlen; ++i) {
10154 Py_ssize_t itemlen;
10155 item = items[i];
10156
10157 /* Copy item, and maybe the separator. */
10158 if (i && seplen != 0) {
10159 _PyUnicode_FastCopyCharacters(res, res_offset, sep, 0, seplen);
10160 res_offset += seplen;
10161 }
10162
10163 itemlen = PyUnicode_GET_LENGTH(item);
10164 if (itemlen != 0) {
Victor Stinnerd3f08822012-05-29 12:57:52 +020010165 _PyUnicode_FastCopyCharacters(res, res_offset, item, 0, itemlen);
Victor Stinnerdd077322011-10-07 17:02:31 +020010166 res_offset += itemlen;
10167 }
Victor Stinner9ce5a832011-10-03 23:36:02 +020010168 }
Victor Stinnerdd077322011-10-07 17:02:31 +020010169 assert(res_offset == PyUnicode_GET_LENGTH(res));
Victor Stinner4560f9c2013-04-14 18:56:46 +020010170 }
Tim Peters8ce9f162004-08-27 01:49:32 +000010171
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010172 Py_XDECREF(sep);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010173 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010174 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010175
Benjamin Peterson29060642009-01-31 22:14:21 +000010176 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010177 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +000010178 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010179 return NULL;
10180}
10181
Victor Stinnerd3f08822012-05-29 12:57:52 +020010182void
10183_PyUnicode_FastFill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10184 Py_UCS4 fill_char)
10185{
10186 const enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
Victor Stinner163403a2018-11-27 12:41:17 +010010187 void *data = PyUnicode_DATA(unicode);
Victor Stinnerd3f08822012-05-29 12:57:52 +020010188 assert(PyUnicode_IS_READY(unicode));
10189 assert(unicode_modifiable(unicode));
10190 assert(fill_char <= PyUnicode_MAX_CHAR_VALUE(unicode));
10191 assert(start >= 0);
10192 assert(start + length <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner59423e32018-11-26 13:40:01 +010010193 unicode_fill(kind, data, fill_char, start, length);
Victor Stinnerd3f08822012-05-29 12:57:52 +020010194}
10195
Victor Stinner3fe55312012-01-04 00:33:50 +010010196Py_ssize_t
10197PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10198 Py_UCS4 fill_char)
10199{
10200 Py_ssize_t maxlen;
Victor Stinner3fe55312012-01-04 00:33:50 +010010201
10202 if (!PyUnicode_Check(unicode)) {
10203 PyErr_BadInternalCall();
10204 return -1;
10205 }
10206 if (PyUnicode_READY(unicode) == -1)
10207 return -1;
10208 if (unicode_check_modifiable(unicode))
10209 return -1;
10210
Victor Stinnerd3f08822012-05-29 12:57:52 +020010211 if (start < 0) {
10212 PyErr_SetString(PyExc_IndexError, "string index out of range");
10213 return -1;
10214 }
Victor Stinner3fe55312012-01-04 00:33:50 +010010215 if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) {
10216 PyErr_SetString(PyExc_ValueError,
10217 "fill character is bigger than "
10218 "the string maximum character");
10219 return -1;
10220 }
10221
10222 maxlen = PyUnicode_GET_LENGTH(unicode) - start;
10223 length = Py_MIN(maxlen, length);
10224 if (length <= 0)
10225 return 0;
10226
Victor Stinnerd3f08822012-05-29 12:57:52 +020010227 _PyUnicode_FastFill(unicode, start, length, fill_char);
Victor Stinner3fe55312012-01-04 00:33:50 +010010228 return length;
10229}
10230
Victor Stinner9310abb2011-10-05 00:59:23 +020010231static PyObject *
10232pad(PyObject *self,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010233 Py_ssize_t left,
10234 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010235 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010236{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010237 PyObject *u;
10238 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010239 int kind;
10240 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010241
10242 if (left < 0)
10243 left = 0;
10244 if (right < 0)
10245 right = 0;
10246
Victor Stinnerc4b49542011-12-11 22:44:26 +010010247 if (left == 0 && right == 0)
10248 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010249
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010250 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
10251 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +000010252 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
10253 return NULL;
10254 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010255 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010256 maxchar = Py_MAX(maxchar, fill);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010257 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010258 if (!u)
10259 return NULL;
10260
10261 kind = PyUnicode_KIND(u);
10262 data = PyUnicode_DATA(u);
10263 if (left)
Victor Stinner59423e32018-11-26 13:40:01 +010010264 unicode_fill(kind, data, fill, 0, left);
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010265 if (right)
Victor Stinner59423e32018-11-26 13:40:01 +010010266 unicode_fill(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinnerd3f08822012-05-29 12:57:52 +020010267 _PyUnicode_FastCopyCharacters(u, left, self, 0, _PyUnicode_LENGTH(self));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010268 assert(_PyUnicode_CheckConsistency(u, 1));
10269 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010270}
10271
Alexander Belopolsky40018472011-02-26 01:02:56 +000010272PyObject *
10273PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010274{
Guido van Rossumd57fd912000-03-10 22:53:23 +000010275 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010276
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010277 if (ensure_unicode(string) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010278 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010279
Benjamin Petersonead6b532011-12-20 17:23:42 -060010280 switch (PyUnicode_KIND(string)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010281 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010282 if (PyUnicode_IS_ASCII(string))
10283 list = asciilib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010284 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010285 PyUnicode_GET_LENGTH(string), keepends);
10286 else
10287 list = ucs1lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010288 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010289 PyUnicode_GET_LENGTH(string), keepends);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010290 break;
10291 case PyUnicode_2BYTE_KIND:
10292 list = ucs2lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010293 string, PyUnicode_2BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010294 PyUnicode_GET_LENGTH(string), keepends);
10295 break;
10296 case PyUnicode_4BYTE_KIND:
10297 list = ucs4lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010298 string, PyUnicode_4BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010299 PyUnicode_GET_LENGTH(string), keepends);
10300 break;
10301 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010302 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010303 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010304 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010305}
10306
Alexander Belopolsky40018472011-02-26 01:02:56 +000010307static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010308split(PyObject *self,
10309 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010310 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010311{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010312 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010313 void *buf1, *buf2;
10314 Py_ssize_t len1, len2;
10315 PyObject* out;
10316
Guido van Rossumd57fd912000-03-10 22:53:23 +000010317 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010318 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010319
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010320 if (PyUnicode_READY(self) == -1)
10321 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010322
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010323 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010324 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010325 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010326 if (PyUnicode_IS_ASCII(self))
10327 return asciilib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010328 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010329 PyUnicode_GET_LENGTH(self), maxcount
10330 );
10331 else
10332 return ucs1lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010333 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010334 PyUnicode_GET_LENGTH(self), maxcount
10335 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010336 case PyUnicode_2BYTE_KIND:
10337 return ucs2lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010338 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010339 PyUnicode_GET_LENGTH(self), maxcount
10340 );
10341 case PyUnicode_4BYTE_KIND:
10342 return ucs4lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010343 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010344 PyUnicode_GET_LENGTH(self), maxcount
10345 );
10346 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010347 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010348 }
10349
10350 if (PyUnicode_READY(substring) == -1)
10351 return NULL;
10352
10353 kind1 = PyUnicode_KIND(self);
10354 kind2 = PyUnicode_KIND(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010355 len1 = PyUnicode_GET_LENGTH(self);
10356 len2 = PyUnicode_GET_LENGTH(substring);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010357 if (kind1 < kind2 || len1 < len2) {
10358 out = PyList_New(1);
10359 if (out == NULL)
10360 return NULL;
10361 Py_INCREF(self);
10362 PyList_SET_ITEM(out, 0, self);
10363 return out;
10364 }
10365 buf1 = PyUnicode_DATA(self);
10366 buf2 = PyUnicode_DATA(substring);
10367 if (kind2 != kind1) {
10368 buf2 = _PyUnicode_AsKind(substring, kind1);
10369 if (!buf2)
10370 return NULL;
10371 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010372
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010373 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010374 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010375 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10376 out = asciilib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010377 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010378 else
10379 out = ucs1lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010380 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010381 break;
10382 case PyUnicode_2BYTE_KIND:
10383 out = ucs2lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010384 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010385 break;
10386 case PyUnicode_4BYTE_KIND:
10387 out = ucs4lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010388 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010389 break;
10390 default:
10391 out = NULL;
10392 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010393 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010394 PyMem_Free(buf2);
10395 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010396}
10397
Alexander Belopolsky40018472011-02-26 01:02:56 +000010398static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010399rsplit(PyObject *self,
10400 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010401 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010402{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010403 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010404 void *buf1, *buf2;
10405 Py_ssize_t len1, len2;
10406 PyObject* out;
10407
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010408 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010409 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010410
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010411 if (PyUnicode_READY(self) == -1)
10412 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010413
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010414 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010415 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010416 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010417 if (PyUnicode_IS_ASCII(self))
10418 return asciilib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010419 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010420 PyUnicode_GET_LENGTH(self), maxcount
10421 );
10422 else
10423 return ucs1lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010424 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010425 PyUnicode_GET_LENGTH(self), maxcount
10426 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010427 case PyUnicode_2BYTE_KIND:
10428 return ucs2lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010429 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010430 PyUnicode_GET_LENGTH(self), maxcount
10431 );
10432 case PyUnicode_4BYTE_KIND:
10433 return ucs4lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010434 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010435 PyUnicode_GET_LENGTH(self), maxcount
10436 );
10437 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010438 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010439 }
10440
10441 if (PyUnicode_READY(substring) == -1)
10442 return NULL;
10443
10444 kind1 = PyUnicode_KIND(self);
10445 kind2 = PyUnicode_KIND(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010446 len1 = PyUnicode_GET_LENGTH(self);
10447 len2 = PyUnicode_GET_LENGTH(substring);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010448 if (kind1 < kind2 || len1 < len2) {
10449 out = PyList_New(1);
10450 if (out == NULL)
10451 return NULL;
10452 Py_INCREF(self);
10453 PyList_SET_ITEM(out, 0, self);
10454 return out;
10455 }
10456 buf1 = PyUnicode_DATA(self);
10457 buf2 = PyUnicode_DATA(substring);
10458 if (kind2 != kind1) {
10459 buf2 = _PyUnicode_AsKind(substring, kind1);
10460 if (!buf2)
10461 return NULL;
10462 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010463
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010464 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010465 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010466 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10467 out = asciilib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010468 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010469 else
10470 out = ucs1lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010471 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010472 break;
10473 case PyUnicode_2BYTE_KIND:
10474 out = ucs2lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010475 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010476 break;
10477 case PyUnicode_4BYTE_KIND:
10478 out = ucs4lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010479 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010480 break;
10481 default:
10482 out = NULL;
10483 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010484 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010485 PyMem_Free(buf2);
10486 return out;
10487}
10488
10489static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010490anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1,
10491 PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010492{
Benjamin Petersonead6b532011-12-20 17:23:42 -060010493 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010494 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010495 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
10496 return asciilib_find(buf1, len1, buf2, len2, offset);
10497 else
10498 return ucs1lib_find(buf1, len1, buf2, len2, offset);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010499 case PyUnicode_2BYTE_KIND:
10500 return ucs2lib_find(buf1, len1, buf2, len2, offset);
10501 case PyUnicode_4BYTE_KIND:
10502 return ucs4lib_find(buf1, len1, buf2, len2, offset);
10503 }
Barry Warsawb2e57942017-09-14 18:13:16 -070010504 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010505}
10506
10507static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010508anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen,
10509 PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010510{
Benjamin Petersonc0b95d12011-12-20 17:24:05 -060010511 switch (kind) {
10512 case PyUnicode_1BYTE_KIND:
10513 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
10514 return asciilib_count(sbuf, slen, buf1, len1, maxcount);
10515 else
10516 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
10517 case PyUnicode_2BYTE_KIND:
10518 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
10519 case PyUnicode_4BYTE_KIND:
10520 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
10521 }
Barry Warsawb2e57942017-09-14 18:13:16 -070010522 Py_UNREACHABLE();
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010523}
10524
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010525static void
10526replace_1char_inplace(PyObject *u, Py_ssize_t pos,
10527 Py_UCS4 u1, Py_UCS4 u2, Py_ssize_t maxcount)
10528{
10529 int kind = PyUnicode_KIND(u);
10530 void *data = PyUnicode_DATA(u);
10531 Py_ssize_t len = PyUnicode_GET_LENGTH(u);
10532 if (kind == PyUnicode_1BYTE_KIND) {
10533 ucs1lib_replace_1char_inplace((Py_UCS1 *)data + pos,
10534 (Py_UCS1 *)data + len,
10535 u1, u2, maxcount);
10536 }
10537 else if (kind == PyUnicode_2BYTE_KIND) {
10538 ucs2lib_replace_1char_inplace((Py_UCS2 *)data + pos,
10539 (Py_UCS2 *)data + len,
10540 u1, u2, maxcount);
10541 }
10542 else {
10543 assert(kind == PyUnicode_4BYTE_KIND);
10544 ucs4lib_replace_1char_inplace((Py_UCS4 *)data + pos,
10545 (Py_UCS4 *)data + len,
10546 u1, u2, maxcount);
10547 }
10548}
10549
Alexander Belopolsky40018472011-02-26 01:02:56 +000010550static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010551replace(PyObject *self, PyObject *str1,
10552 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010553{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010554 PyObject *u;
10555 char *sbuf = PyUnicode_DATA(self);
10556 char *buf1 = PyUnicode_DATA(str1);
10557 char *buf2 = PyUnicode_DATA(str2);
10558 int srelease = 0, release1 = 0, release2 = 0;
10559 int skind = PyUnicode_KIND(self);
10560 int kind1 = PyUnicode_KIND(str1);
10561 int kind2 = PyUnicode_KIND(str2);
10562 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
10563 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
10564 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010565 int mayshrink;
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010566 Py_UCS4 maxchar, maxchar_str1, maxchar_str2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010567
10568 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010569 maxcount = PY_SSIZE_T_MAX;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010570 else if (maxcount == 0 || slen == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010571 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010572
Victor Stinner59de0ee2011-10-07 10:01:28 +020010573 if (str1 == str2)
10574 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010575
Victor Stinner49a0a212011-10-12 23:46:10 +020010576 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010577 maxchar_str1 = PyUnicode_MAX_CHAR_VALUE(str1);
10578 if (maxchar < maxchar_str1)
10579 /* substring too wide to be present */
10580 goto nothing;
Victor Stinner49a0a212011-10-12 23:46:10 +020010581 maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
10582 /* Replacing str1 with str2 may cause a maxchar reduction in the
10583 result string. */
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010584 mayshrink = (maxchar_str2 < maxchar_str1) && (maxchar == maxchar_str1);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010585 maxchar = Py_MAX(maxchar, maxchar_str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010586
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010587 if (len1 == len2) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010588 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010589 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010590 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010591 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010592 /* replace characters */
Victor Stinner49a0a212011-10-12 23:46:10 +020010593 Py_UCS4 u1, u2;
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010594 Py_ssize_t pos;
Victor Stinnerf6441102011-12-18 02:43:08 +010010595
Victor Stinner69ed0f42013-04-09 21:48:24 +020010596 u1 = PyUnicode_READ(kind1, buf1, 0);
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010597 pos = findchar(sbuf, skind, slen, u1, 1);
Victor Stinnerf6441102011-12-18 02:43:08 +010010598 if (pos < 0)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010599 goto nothing;
Victor Stinner69ed0f42013-04-09 21:48:24 +020010600 u2 = PyUnicode_READ(kind2, buf2, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010601 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010602 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010603 goto error;
Victor Stinnerf6441102011-12-18 02:43:08 +010010604
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010605 _PyUnicode_FastCopyCharacters(u, 0, self, 0, slen);
10606 replace_1char_inplace(u, pos, u1, u2, maxcount);
Victor Stinner49a0a212011-10-12 23:46:10 +020010607 }
10608 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010609 int rkind = skind;
10610 char *res;
Victor Stinnerf6441102011-12-18 02:43:08 +010010611 Py_ssize_t i;
Victor Stinner25a4b292011-10-06 12:31:55 +020010612
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010613 if (kind1 < rkind) {
10614 /* widen substring */
10615 buf1 = _PyUnicode_AsKind(str1, rkind);
10616 if (!buf1) goto error;
10617 release1 = 1;
10618 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010619 i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010620 if (i < 0)
10621 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010622 if (rkind > kind2) {
10623 /* widen replacement */
10624 buf2 = _PyUnicode_AsKind(str2, rkind);
10625 if (!buf2) goto error;
10626 release2 = 1;
10627 }
10628 else if (rkind < kind2) {
10629 /* widen self and buf1 */
10630 rkind = kind2;
10631 if (release1) PyMem_Free(buf1);
Antoine Pitrou6d5ad222012-11-17 23:28:17 +010010632 release1 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010633 sbuf = _PyUnicode_AsKind(self, rkind);
10634 if (!sbuf) goto error;
10635 srelease = 1;
10636 buf1 = _PyUnicode_AsKind(str1, rkind);
10637 if (!buf1) goto error;
10638 release1 = 1;
10639 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010640 u = PyUnicode_New(slen, maxchar);
10641 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010642 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010643 assert(PyUnicode_KIND(u) == rkind);
10644 res = PyUnicode_DATA(u);
Victor Stinner25a4b292011-10-06 12:31:55 +020010645
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010646 memcpy(res, sbuf, rkind * slen);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010647 /* change everything in-place, starting with this one */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010648 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010649 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010650 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010651 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010652
10653 while ( --maxcount > 0) {
Victor Stinnerc3cec782011-10-05 21:24:08 +020010654 i = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010655 sbuf+rkind*i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010656 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010657 if (i == -1)
10658 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010659 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010660 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010661 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010662 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010663 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010664 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010665 }
10666 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010667 Py_ssize_t n, i, j, ires;
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010668 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010669 int rkind = skind;
10670 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010671
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010672 if (kind1 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010673 /* widen substring */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010674 buf1 = _PyUnicode_AsKind(str1, rkind);
10675 if (!buf1) goto error;
10676 release1 = 1;
10677 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010678 n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010679 if (n == 0)
10680 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010681 if (kind2 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010682 /* widen replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010683 buf2 = _PyUnicode_AsKind(str2, rkind);
10684 if (!buf2) goto error;
10685 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010686 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010687 else if (kind2 > rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010688 /* widen self and buf1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010689 rkind = kind2;
10690 sbuf = _PyUnicode_AsKind(self, rkind);
10691 if (!sbuf) goto error;
10692 srelease = 1;
10693 if (release1) PyMem_Free(buf1);
Antoine Pitrou6d5ad222012-11-17 23:28:17 +010010694 release1 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010695 buf1 = _PyUnicode_AsKind(str1, rkind);
10696 if (!buf1) goto error;
10697 release1 = 1;
10698 }
10699 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10700 PyUnicode_GET_LENGTH(str1))); */
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010701 if (len1 < len2 && len2 - len1 > (PY_SSIZE_T_MAX - slen) / n) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010702 PyErr_SetString(PyExc_OverflowError,
10703 "replace string is too long");
10704 goto error;
10705 }
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010706 new_size = slen + n * (len2 - len1);
Victor Stinner49a0a212011-10-12 23:46:10 +020010707 if (new_size == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020010708 _Py_INCREF_UNICODE_EMPTY();
10709 if (!unicode_empty)
10710 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010711 u = unicode_empty;
10712 goto done;
10713 }
Xiang Zhangb0541f42017-01-10 10:52:00 +080010714 if (new_size > (PY_SSIZE_T_MAX / rkind)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010715 PyErr_SetString(PyExc_OverflowError,
10716 "replace string is too long");
10717 goto error;
10718 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010719 u = PyUnicode_New(new_size, maxchar);
10720 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010721 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010722 assert(PyUnicode_KIND(u) == rkind);
10723 res = PyUnicode_DATA(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010724 ires = i = 0;
10725 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010726 while (n-- > 0) {
10727 /* look for next match */
Victor Stinnerc3cec782011-10-05 21:24:08 +020010728 j = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010729 sbuf + rkind * i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010730 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010731 if (j == -1)
10732 break;
10733 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010734 /* copy unchanged part [i:j] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010735 memcpy(res + rkind * ires,
10736 sbuf + rkind * i,
10737 rkind * (j-i));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010738 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010739 }
10740 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010741 if (len2 > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010742 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010743 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010744 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010745 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010746 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010747 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010748 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010749 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010750 /* copy tail [i:] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010751 memcpy(res + rkind * ires,
10752 sbuf + rkind * i,
10753 rkind * (slen-i));
Victor Stinner49a0a212011-10-12 23:46:10 +020010754 }
10755 else {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010756 /* interleave */
10757 while (n > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010758 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010759 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010760 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010761 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010762 if (--n <= 0)
10763 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010764 memcpy(res + rkind * ires,
10765 sbuf + rkind * i,
10766 rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010767 ires++;
10768 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010769 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010770 memcpy(res + rkind * ires,
10771 sbuf + rkind * i,
10772 rkind * (slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +000010773 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010774 }
10775
10776 if (mayshrink) {
Victor Stinner25a4b292011-10-06 12:31:55 +020010777 unicode_adjust_maxchar(&u);
10778 if (u == NULL)
10779 goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010780 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010781
10782 done:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010783 if (srelease)
10784 PyMem_FREE(sbuf);
10785 if (release1)
10786 PyMem_FREE(buf1);
10787 if (release2)
10788 PyMem_FREE(buf2);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010789 assert(_PyUnicode_CheckConsistency(u, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010790 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010791
Benjamin Peterson29060642009-01-31 22:14:21 +000010792 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +000010793 /* nothing to replace; return original string (when possible) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010794 if (srelease)
10795 PyMem_FREE(sbuf);
10796 if (release1)
10797 PyMem_FREE(buf1);
10798 if (release2)
10799 PyMem_FREE(buf2);
Victor Stinnerc4b49542011-12-11 22:44:26 +010010800 return unicode_result_unchanged(self);
10801
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010802 error:
10803 if (srelease && sbuf)
10804 PyMem_FREE(sbuf);
10805 if (release1 && buf1)
10806 PyMem_FREE(buf1);
10807 if (release2 && buf2)
10808 PyMem_FREE(buf2);
10809 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010810}
10811
10812/* --- Unicode Object Methods --------------------------------------------- */
10813
INADA Naoki3ae20562017-01-16 20:41:20 +090010814/*[clinic input]
10815str.title as unicode_title
Guido van Rossumd57fd912000-03-10 22:53:23 +000010816
INADA Naoki3ae20562017-01-16 20:41:20 +090010817Return a version of the string where each word is titlecased.
10818
10819More specifically, words start with uppercased characters and all remaining
10820cased characters have lower case.
10821[clinic start generated code]*/
10822
10823static PyObject *
10824unicode_title_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090010825/*[clinic end generated code: output=c75ae03809574902 input=fa945d669b26e683]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000010826{
Benjamin Petersoneea48462012-01-16 14:28:50 -050010827 if (PyUnicode_READY(self) == -1)
10828 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010829 return case_operation(self, do_title);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010830}
10831
INADA Naoki3ae20562017-01-16 20:41:20 +090010832/*[clinic input]
10833str.capitalize as unicode_capitalize
Guido van Rossumd57fd912000-03-10 22:53:23 +000010834
INADA Naoki3ae20562017-01-16 20:41:20 +090010835Return a capitalized version of the string.
10836
10837More specifically, make the first character have upper case and the rest lower
10838case.
10839[clinic start generated code]*/
10840
10841static PyObject *
10842unicode_capitalize_impl(PyObject *self)
10843/*[clinic end generated code: output=e49a4c333cdb7667 input=f4cbf1016938da6d]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000010844{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010845 if (PyUnicode_READY(self) == -1)
10846 return NULL;
10847 if (PyUnicode_GET_LENGTH(self) == 0)
10848 return unicode_result_unchanged(self);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010849 return case_operation(self, do_capitalize);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010850}
10851
INADA Naoki3ae20562017-01-16 20:41:20 +090010852/*[clinic input]
10853str.casefold as unicode_casefold
10854
10855Return a version of the string suitable for caseless comparisons.
10856[clinic start generated code]*/
Benjamin Petersond5890c82012-01-14 13:23:30 -050010857
10858static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090010859unicode_casefold_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090010860/*[clinic end generated code: output=0120daf657ca40af input=384d66cc2ae30daf]*/
Benjamin Petersond5890c82012-01-14 13:23:30 -050010861{
10862 if (PyUnicode_READY(self) == -1)
10863 return NULL;
10864 if (PyUnicode_IS_ASCII(self))
10865 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010866 return case_operation(self, do_casefold);
Benjamin Petersond5890c82012-01-14 13:23:30 -050010867}
10868
10869
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010870/* Argument converter. Accepts a single Unicode character. */
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010871
10872static int
10873convert_uc(PyObject *obj, void *addr)
10874{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010875 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010876
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010877 if (!PyUnicode_Check(obj)) {
10878 PyErr_Format(PyExc_TypeError,
10879 "The fill character must be a unicode character, "
Victor Stinner998b8062018-09-12 00:23:25 +020010880 "not %.100s", Py_TYPE(obj)->tp_name);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010881 return 0;
10882 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010883 if (PyUnicode_READY(obj) < 0)
10884 return 0;
10885 if (PyUnicode_GET_LENGTH(obj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010886 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010887 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010888 return 0;
10889 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010890 *fillcharloc = PyUnicode_READ_CHAR(obj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010891 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010892}
10893
INADA Naoki3ae20562017-01-16 20:41:20 +090010894/*[clinic input]
10895str.center as unicode_center
10896
10897 width: Py_ssize_t
10898 fillchar: Py_UCS4 = ' '
10899 /
10900
10901Return a centered string of length width.
10902
10903Padding is done using the specified fill character (default is a space).
10904[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000010905
10906static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090010907unicode_center_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
10908/*[clinic end generated code: output=420c8859effc7c0c input=b42b247eb26e6519]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000010909{
Martin v. Löwis18e16552006-02-15 17:27:45 +000010910 Py_ssize_t marg, left;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010911
Benjamin Petersonbac79492012-01-14 13:34:47 -050010912 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010913 return NULL;
10914
Victor Stinnerc4b49542011-12-11 22:44:26 +010010915 if (PyUnicode_GET_LENGTH(self) >= width)
10916 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010917
Victor Stinnerc4b49542011-12-11 22:44:26 +010010918 marg = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010919 left = marg / 2 + (marg & width & 1);
10920
Victor Stinner9310abb2011-10-05 00:59:23 +020010921 return pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010922}
10923
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010924/* This function assumes that str1 and str2 are readied by the caller. */
10925
Marc-André Lemburge5034372000-08-08 08:04:29 +000010926static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010927unicode_compare(PyObject *str1, PyObject *str2)
Marc-André Lemburge5034372000-08-08 08:04:29 +000010928{
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010929#define COMPARE(TYPE1, TYPE2) \
10930 do { \
10931 TYPE1* p1 = (TYPE1 *)data1; \
10932 TYPE2* p2 = (TYPE2 *)data2; \
10933 TYPE1* end = p1 + len; \
10934 Py_UCS4 c1, c2; \
10935 for (; p1 != end; p1++, p2++) { \
10936 c1 = *p1; \
10937 c2 = *p2; \
10938 if (c1 != c2) \
10939 return (c1 < c2) ? -1 : 1; \
10940 } \
10941 } \
10942 while (0)
10943
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010944 int kind1, kind2;
10945 void *data1, *data2;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010946 Py_ssize_t len1, len2, len;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010947
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010948 kind1 = PyUnicode_KIND(str1);
10949 kind2 = PyUnicode_KIND(str2);
10950 data1 = PyUnicode_DATA(str1);
10951 data2 = PyUnicode_DATA(str2);
10952 len1 = PyUnicode_GET_LENGTH(str1);
10953 len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner770e19e2012-10-04 22:59:45 +020010954 len = Py_MIN(len1, len2);
Marc-André Lemburge5034372000-08-08 08:04:29 +000010955
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010956 switch(kind1) {
10957 case PyUnicode_1BYTE_KIND:
10958 {
10959 switch(kind2) {
10960 case PyUnicode_1BYTE_KIND:
10961 {
10962 int cmp = memcmp(data1, data2, len);
10963 /* normalize result of memcmp() into the range [-1; 1] */
10964 if (cmp < 0)
10965 return -1;
10966 if (cmp > 0)
10967 return 1;
10968 break;
Victor Stinner770e19e2012-10-04 22:59:45 +020010969 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010970 case PyUnicode_2BYTE_KIND:
10971 COMPARE(Py_UCS1, Py_UCS2);
10972 break;
10973 case PyUnicode_4BYTE_KIND:
10974 COMPARE(Py_UCS1, Py_UCS4);
10975 break;
10976 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010977 Py_UNREACHABLE();
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010978 }
10979 break;
10980 }
10981 case PyUnicode_2BYTE_KIND:
10982 {
10983 switch(kind2) {
10984 case PyUnicode_1BYTE_KIND:
10985 COMPARE(Py_UCS2, Py_UCS1);
10986 break;
10987 case PyUnicode_2BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020010988 {
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010989 COMPARE(Py_UCS2, Py_UCS2);
10990 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020010991 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010992 case PyUnicode_4BYTE_KIND:
10993 COMPARE(Py_UCS2, Py_UCS4);
10994 break;
10995 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010996 Py_UNREACHABLE();
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010997 }
10998 break;
10999 }
11000 case PyUnicode_4BYTE_KIND:
11001 {
11002 switch(kind2) {
11003 case PyUnicode_1BYTE_KIND:
11004 COMPARE(Py_UCS4, Py_UCS1);
11005 break;
11006 case PyUnicode_2BYTE_KIND:
11007 COMPARE(Py_UCS4, Py_UCS2);
11008 break;
11009 case PyUnicode_4BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020011010 {
11011#if defined(HAVE_WMEMCMP) && SIZEOF_WCHAR_T == 4
11012 int cmp = wmemcmp((wchar_t *)data1, (wchar_t *)data2, len);
11013 /* normalize result of wmemcmp() into the range [-1; 1] */
11014 if (cmp < 0)
11015 return -1;
11016 if (cmp > 0)
11017 return 1;
11018#else
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011019 COMPARE(Py_UCS4, Py_UCS4);
Victor Stinnercd777ea2013-04-08 22:43:44 +020011020#endif
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011021 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020011022 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011023 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011024 Py_UNREACHABLE();
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011025 }
11026 break;
11027 }
11028 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011029 Py_UNREACHABLE();
Marc-André Lemburge5034372000-08-08 08:04:29 +000011030 }
11031
Victor Stinner770e19e2012-10-04 22:59:45 +020011032 if (len1 == len2)
11033 return 0;
11034 if (len1 < len2)
11035 return -1;
11036 else
11037 return 1;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011038
11039#undef COMPARE
Marc-André Lemburge5034372000-08-08 08:04:29 +000011040}
11041
Benjamin Peterson621b4302016-09-09 13:54:34 -070011042static int
Victor Stinnere5567ad2012-10-23 02:48:49 +020011043unicode_compare_eq(PyObject *str1, PyObject *str2)
11044{
11045 int kind;
11046 void *data1, *data2;
11047 Py_ssize_t len;
11048 int cmp;
11049
Victor Stinnere5567ad2012-10-23 02:48:49 +020011050 len = PyUnicode_GET_LENGTH(str1);
11051 if (PyUnicode_GET_LENGTH(str2) != len)
11052 return 0;
11053 kind = PyUnicode_KIND(str1);
11054 if (PyUnicode_KIND(str2) != kind)
11055 return 0;
11056 data1 = PyUnicode_DATA(str1);
11057 data2 = PyUnicode_DATA(str2);
11058
11059 cmp = memcmp(data1, data2, len * kind);
11060 return (cmp == 0);
11061}
11062
11063
Alexander Belopolsky40018472011-02-26 01:02:56 +000011064int
11065PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011066{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011067 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
11068 if (PyUnicode_READY(left) == -1 ||
11069 PyUnicode_READY(right) == -1)
11070 return -1;
Victor Stinnerf0c7b2a2013-11-04 11:27:14 +010011071
11072 /* a string is equal to itself */
11073 if (left == right)
11074 return 0;
11075
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011076 return unicode_compare(left, right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011077 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +000011078 PyErr_Format(PyExc_TypeError,
11079 "Can't compare %.100s and %.100s",
11080 left->ob_type->tp_name,
11081 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011082 return -1;
11083}
11084
Martin v. Löwis5b222132007-06-10 09:51:05 +000011085int
11086PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
11087{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011088 Py_ssize_t i;
11089 int kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011090 Py_UCS4 chr;
Serhiy Storchaka419967b2016-12-06 00:13:34 +020011091 const unsigned char *ustr = (const unsigned char *)str;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011092
Victor Stinner910337b2011-10-03 03:20:16 +020011093 assert(_PyUnicode_CHECK(uni));
Serhiy Storchaka419967b2016-12-06 00:13:34 +020011094 if (!PyUnicode_IS_READY(uni)) {
11095 const wchar_t *ws = _PyUnicode_WSTR(uni);
11096 /* Compare Unicode string and source character set string */
11097 for (i = 0; (chr = ws[i]) && ustr[i]; i++) {
11098 if (chr != ustr[i])
11099 return (chr < ustr[i]) ? -1 : 1;
11100 }
11101 /* This check keeps Python strings that end in '\0' from comparing equal
11102 to C strings identical up to that point. */
11103 if (_PyUnicode_WSTR_LENGTH(uni) != i || chr)
11104 return 1; /* uni is longer */
11105 if (ustr[i])
11106 return -1; /* str is longer */
11107 return 0;
11108 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011109 kind = PyUnicode_KIND(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010011110 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnera6b9b072013-10-30 18:27:13 +010011111 const void *data = PyUnicode_1BYTE_DATA(uni);
Victor Stinnere1b15922013-11-03 13:53:12 +010011112 size_t len1 = (size_t)PyUnicode_GET_LENGTH(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010011113 size_t len, len2 = strlen(str);
11114 int cmp;
11115
11116 len = Py_MIN(len1, len2);
11117 cmp = memcmp(data, str, len);
Victor Stinner21ea21e2013-11-04 11:28:26 +010011118 if (cmp != 0) {
11119 if (cmp < 0)
11120 return -1;
11121 else
11122 return 1;
11123 }
Victor Stinner602f7cf2013-10-29 23:31:50 +010011124 if (len1 > len2)
11125 return 1; /* uni is longer */
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011126 if (len1 < len2)
Victor Stinner602f7cf2013-10-29 23:31:50 +010011127 return -1; /* str is longer */
11128 return 0;
11129 }
11130 else {
11131 void *data = PyUnicode_DATA(uni);
11132 /* Compare Unicode string and source character set string */
11133 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
Victor Stinner12174a52014-08-15 23:17:38 +020011134 if (chr != (unsigned char)str[i])
Victor Stinner602f7cf2013-10-29 23:31:50 +010011135 return (chr < (unsigned char)(str[i])) ? -1 : 1;
11136 /* This check keeps Python strings that end in '\0' from comparing equal
11137 to C strings identical up to that point. */
11138 if (PyUnicode_GET_LENGTH(uni) != i || chr)
11139 return 1; /* uni is longer */
11140 if (str[i])
11141 return -1; /* str is longer */
11142 return 0;
11143 }
Martin v. Löwis5b222132007-06-10 09:51:05 +000011144}
11145
Serhiy Storchakaf4934ea2016-11-16 10:17:58 +020011146static int
11147non_ready_unicode_equal_to_ascii_string(PyObject *unicode, const char *str)
11148{
11149 size_t i, len;
11150 const wchar_t *p;
11151 len = (size_t)_PyUnicode_WSTR_LENGTH(unicode);
11152 if (strlen(str) != len)
11153 return 0;
11154 p = _PyUnicode_WSTR(unicode);
11155 assert(p);
11156 for (i = 0; i < len; i++) {
11157 unsigned char c = (unsigned char)str[i];
Serhiy Storchaka292dd1b2016-11-16 16:12:34 +020011158 if (c >= 128 || p[i] != (wchar_t)c)
Serhiy Storchakaf4934ea2016-11-16 10:17:58 +020011159 return 0;
11160 }
11161 return 1;
11162}
11163
11164int
11165_PyUnicode_EqualToASCIIString(PyObject *unicode, const char *str)
11166{
11167 size_t len;
11168 assert(_PyUnicode_CHECK(unicode));
Serhiy Storchakaa83a6a32016-11-16 20:02:44 +020011169 assert(str);
11170#ifndef NDEBUG
11171 for (const char *p = str; *p; p++) {
11172 assert((unsigned char)*p < 128);
11173 }
11174#endif
Serhiy Storchakaf4934ea2016-11-16 10:17:58 +020011175 if (PyUnicode_READY(unicode) == -1) {
11176 /* Memory error or bad data */
11177 PyErr_Clear();
11178 return non_ready_unicode_equal_to_ascii_string(unicode, str);
11179 }
11180 if (!PyUnicode_IS_ASCII(unicode))
11181 return 0;
11182 len = (size_t)PyUnicode_GET_LENGTH(unicode);
11183 return strlen(str) == len &&
11184 memcmp(PyUnicode_1BYTE_DATA(unicode), str, len) == 0;
11185}
11186
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011187int
11188_PyUnicode_EqualToASCIIId(PyObject *left, _Py_Identifier *right)
11189{
11190 PyObject *right_uni;
11191 Py_hash_t hash;
11192
11193 assert(_PyUnicode_CHECK(left));
11194 assert(right->string);
Serhiy Storchakaa83a6a32016-11-16 20:02:44 +020011195#ifndef NDEBUG
11196 for (const char *p = right->string; *p; p++) {
11197 assert((unsigned char)*p < 128);
11198 }
11199#endif
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011200
11201 if (PyUnicode_READY(left) == -1) {
11202 /* memory error or bad data */
11203 PyErr_Clear();
11204 return non_ready_unicode_equal_to_ascii_string(left, right->string);
11205 }
11206
11207 if (!PyUnicode_IS_ASCII(left))
11208 return 0;
11209
11210 right_uni = _PyUnicode_FromId(right); /* borrowed */
11211 if (right_uni == NULL) {
11212 /* memory error or bad data */
11213 PyErr_Clear();
11214 return _PyUnicode_EqualToASCIIString(left, right->string);
11215 }
11216
11217 if (left == right_uni)
11218 return 1;
11219
11220 if (PyUnicode_CHECK_INTERNED(left))
11221 return 0;
11222
INADA Naoki7cc95f52018-01-28 02:07:09 +090011223 assert(_PyUnicode_HASH(right_uni) != -1);
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011224 hash = _PyUnicode_HASH(left);
11225 if (hash != -1 && hash != _PyUnicode_HASH(right_uni))
11226 return 0;
11227
11228 return unicode_compare_eq(left, right_uni);
11229}
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000011230
Alexander Belopolsky40018472011-02-26 01:02:56 +000011231PyObject *
11232PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011233{
11234 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011235
Victor Stinnere5567ad2012-10-23 02:48:49 +020011236 if (!PyUnicode_Check(left) || !PyUnicode_Check(right))
11237 Py_RETURN_NOTIMPLEMENTED;
11238
11239 if (PyUnicode_READY(left) == -1 ||
11240 PyUnicode_READY(right) == -1)
11241 return NULL;
11242
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010011243 if (left == right) {
11244 switch (op) {
11245 case Py_EQ:
11246 case Py_LE:
11247 case Py_GE:
11248 /* a string is equal to itself */
stratakise8b19652017-11-02 11:32:54 +010011249 Py_RETURN_TRUE;
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010011250 case Py_NE:
11251 case Py_LT:
11252 case Py_GT:
stratakise8b19652017-11-02 11:32:54 +010011253 Py_RETURN_FALSE;
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010011254 default:
11255 PyErr_BadArgument();
11256 return NULL;
11257 }
11258 }
11259 else if (op == Py_EQ || op == Py_NE) {
Victor Stinnere5567ad2012-10-23 02:48:49 +020011260 result = unicode_compare_eq(left, right);
Victor Stinnerc8bc5372013-11-04 11:08:10 +010011261 result ^= (op == Py_NE);
stratakise8b19652017-11-02 11:32:54 +010011262 return PyBool_FromLong(result);
Victor Stinnere5567ad2012-10-23 02:48:49 +020011263 }
11264 else {
Victor Stinner90db9c42012-10-04 21:53:50 +020011265 result = unicode_compare(left, right);
stratakise8b19652017-11-02 11:32:54 +010011266 Py_RETURN_RICHCOMPARE(result, 0, op);
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011267 }
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011268}
11269
Alexander Belopolsky40018472011-02-26 01:02:56 +000011270int
Raymond Hettingerac2ef652015-07-04 16:04:44 -070011271_PyUnicode_EQ(PyObject *aa, PyObject *bb)
11272{
11273 return unicode_eq(aa, bb);
11274}
11275
11276int
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011277PyUnicode_Contains(PyObject *str, PyObject *substr)
Guido van Rossum403d68b2000-03-13 15:55:09 +000011278{
Victor Stinner77282cb2013-04-14 19:22:47 +020011279 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011280 void *buf1, *buf2;
11281 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011282 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011283
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011284 if (!PyUnicode_Check(substr)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011285 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020011286 "'in <string>' requires string as left operand, not %.100s",
11287 Py_TYPE(substr)->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011288 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011289 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011290 if (PyUnicode_READY(substr) == -1)
Thomas Wouters477c8d52006-05-27 19:21:47 +000011291 return -1;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011292 if (ensure_unicode(str) < 0)
11293 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011294
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011295 kind1 = PyUnicode_KIND(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011296 kind2 = PyUnicode_KIND(substr);
11297 if (kind1 < kind2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011298 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011299 len1 = PyUnicode_GET_LENGTH(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011300 len2 = PyUnicode_GET_LENGTH(substr);
11301 if (len1 < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011302 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011303 buf1 = PyUnicode_DATA(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011304 buf2 = PyUnicode_DATA(substr);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011305 if (len2 == 1) {
11306 Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
11307 result = findchar((const char *)buf1, kind1, len1, ch, 1) != -1;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011308 return result;
11309 }
11310 if (kind2 != kind1) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011311 buf2 = _PyUnicode_AsKind(substr, kind1);
11312 if (!buf2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011313 return -1;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011314 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011315
Victor Stinner77282cb2013-04-14 19:22:47 +020011316 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011317 case PyUnicode_1BYTE_KIND:
11318 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
11319 break;
11320 case PyUnicode_2BYTE_KIND:
11321 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
11322 break;
11323 case PyUnicode_4BYTE_KIND:
11324 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
11325 break;
11326 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011327 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011328 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011329
Victor Stinner77282cb2013-04-14 19:22:47 +020011330 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011331 PyMem_Free(buf2);
11332
Guido van Rossum403d68b2000-03-13 15:55:09 +000011333 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011334}
11335
Guido van Rossumd57fd912000-03-10 22:53:23 +000011336/* Concat to string or Unicode object giving a new Unicode object. */
11337
Alexander Belopolsky40018472011-02-26 01:02:56 +000011338PyObject *
11339PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011340{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011341 PyObject *result;
Victor Stinner127226b2011-10-13 01:12:34 +020011342 Py_UCS4 maxchar, maxchar2;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011343 Py_ssize_t left_len, right_len, new_len;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011344
Serhiy Storchaka004e03f2017-03-19 19:38:42 +020011345 if (ensure_unicode(left) < 0)
11346 return NULL;
11347
11348 if (!PyUnicode_Check(right)) {
11349 PyErr_Format(PyExc_TypeError,
11350 "can only concatenate str (not \"%.200s\") to str",
11351 right->ob_type->tp_name);
11352 return NULL;
11353 }
11354 if (PyUnicode_READY(right) < 0)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011355 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011356
11357 /* Shortcuts */
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011358 if (left == unicode_empty)
11359 return PyUnicode_FromObject(right);
11360 if (right == unicode_empty)
11361 return PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011362
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011363 left_len = PyUnicode_GET_LENGTH(left);
11364 right_len = PyUnicode_GET_LENGTH(right);
11365 if (left_len > PY_SSIZE_T_MAX - right_len) {
Victor Stinner488fa492011-12-12 00:01:39 +010011366 PyErr_SetString(PyExc_OverflowError,
11367 "strings are too large to concat");
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011368 return NULL;
Victor Stinner488fa492011-12-12 00:01:39 +010011369 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011370 new_len = left_len + right_len;
Victor Stinner488fa492011-12-12 00:01:39 +010011371
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011372 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11373 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Benjamin Peterson7e303732013-06-10 09:19:46 -070011374 maxchar = Py_MAX(maxchar, maxchar2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011375
Guido van Rossumd57fd912000-03-10 22:53:23 +000011376 /* Concat the two Unicode strings */
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011377 result = PyUnicode_New(new_len, maxchar);
11378 if (result == NULL)
11379 return NULL;
11380 _PyUnicode_FastCopyCharacters(result, 0, left, 0, left_len);
11381 _PyUnicode_FastCopyCharacters(result, left_len, right, 0, right_len);
11382 assert(_PyUnicode_CheckConsistency(result, 1));
11383 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011384}
11385
Walter Dörwald1ab83302007-05-18 17:15:44 +000011386void
Victor Stinner23e56682011-10-03 03:54:37 +020011387PyUnicode_Append(PyObject **p_left, PyObject *right)
Walter Dörwald1ab83302007-05-18 17:15:44 +000011388{
Victor Stinner23e56682011-10-03 03:54:37 +020011389 PyObject *left, *res;
Victor Stinner488fa492011-12-12 00:01:39 +010011390 Py_UCS4 maxchar, maxchar2;
11391 Py_ssize_t left_len, right_len, new_len;
Victor Stinner23e56682011-10-03 03:54:37 +020011392
11393 if (p_left == NULL) {
11394 if (!PyErr_Occurred())
11395 PyErr_BadInternalCall();
Benjamin Peterson14339b62009-01-31 16:36:08 +000011396 return;
11397 }
Victor Stinner23e56682011-10-03 03:54:37 +020011398 left = *p_left;
Victor Stinnerf0335102013-04-14 19:13:03 +020011399 if (right == NULL || left == NULL
11400 || !PyUnicode_Check(left) || !PyUnicode_Check(right)) {
Victor Stinner23e56682011-10-03 03:54:37 +020011401 if (!PyErr_Occurred())
11402 PyErr_BadInternalCall();
11403 goto error;
11404 }
11405
Benjamin Petersonbac79492012-01-14 13:34:47 -050011406 if (PyUnicode_READY(left) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011407 goto error;
Benjamin Petersonbac79492012-01-14 13:34:47 -050011408 if (PyUnicode_READY(right) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011409 goto error;
11410
Victor Stinner488fa492011-12-12 00:01:39 +010011411 /* Shortcuts */
11412 if (left == unicode_empty) {
11413 Py_DECREF(left);
11414 Py_INCREF(right);
11415 *p_left = right;
11416 return;
11417 }
11418 if (right == unicode_empty)
11419 return;
11420
11421 left_len = PyUnicode_GET_LENGTH(left);
11422 right_len = PyUnicode_GET_LENGTH(right);
11423 if (left_len > PY_SSIZE_T_MAX - right_len) {
11424 PyErr_SetString(PyExc_OverflowError,
11425 "strings are too large to concat");
11426 goto error;
11427 }
11428 new_len = left_len + right_len;
11429
11430 if (unicode_modifiable(left)
11431 && PyUnicode_CheckExact(right)
11432 && PyUnicode_KIND(right) <= PyUnicode_KIND(left)
Victor Stinnerb0923652011-10-04 01:17:31 +020011433 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
11434 to change the structure size, but characters are stored just after
Georg Brandl7597add2011-10-05 16:36:47 +020011435 the structure, and so it requires to move all characters which is
Victor Stinnerb0923652011-10-04 01:17:31 +020011436 not so different than duplicating the string. */
Victor Stinner488fa492011-12-12 00:01:39 +010011437 && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
11438 {
11439 /* append inplace */
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011440 if (unicode_resize(p_left, new_len) != 0)
Victor Stinner488fa492011-12-12 00:01:39 +010011441 goto error;
Victor Stinnerf0335102013-04-14 19:13:03 +020011442
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011443 /* copy 'right' into the newly allocated area of 'left' */
11444 _PyUnicode_FastCopyCharacters(*p_left, left_len, right, 0, right_len);
Victor Stinner23e56682011-10-03 03:54:37 +020011445 }
Victor Stinner488fa492011-12-12 00:01:39 +010011446 else {
11447 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11448 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Benjamin Peterson7e303732013-06-10 09:19:46 -070011449 maxchar = Py_MAX(maxchar, maxchar2);
Victor Stinner23e56682011-10-03 03:54:37 +020011450
Victor Stinner488fa492011-12-12 00:01:39 +010011451 /* Concat the two Unicode strings */
11452 res = PyUnicode_New(new_len, maxchar);
11453 if (res == NULL)
11454 goto error;
Victor Stinnerd3f08822012-05-29 12:57:52 +020011455 _PyUnicode_FastCopyCharacters(res, 0, left, 0, left_len);
11456 _PyUnicode_FastCopyCharacters(res, left_len, right, 0, right_len);
Victor Stinner488fa492011-12-12 00:01:39 +010011457 Py_DECREF(left);
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011458 *p_left = res;
Victor Stinner488fa492011-12-12 00:01:39 +010011459 }
11460 assert(_PyUnicode_CheckConsistency(*p_left, 1));
Victor Stinner23e56682011-10-03 03:54:37 +020011461 return;
11462
11463error:
Victor Stinner488fa492011-12-12 00:01:39 +010011464 Py_CLEAR(*p_left);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011465}
11466
11467void
11468PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
11469{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011470 PyUnicode_Append(pleft, right);
11471 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011472}
11473
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011474/*
11475Wraps stringlib_parse_args_finds() and additionally ensures that the
11476first argument is a unicode object.
11477*/
11478
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070011479static inline int
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011480parse_args_finds_unicode(const char * function_name, PyObject *args,
11481 PyObject **substring,
11482 Py_ssize_t *start, Py_ssize_t *end)
11483{
11484 if(stringlib_parse_args_finds(function_name, args, substring,
11485 start, end)) {
11486 if (ensure_unicode(*substring) < 0)
11487 return 0;
11488 return 1;
11489 }
11490 return 0;
11491}
11492
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011493PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011494 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011495\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011496Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011497string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011498interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011499
11500static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011501unicode_count(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011502{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011503 PyObject *substring = NULL; /* initialize to fix a compiler warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +000011504 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011505 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011506 PyObject *result;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011507 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011508 void *buf1, *buf2;
11509 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011510
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011511 if (!parse_args_finds_unicode("count", args, &substring, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000011512 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000011513
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011514 kind1 = PyUnicode_KIND(self);
11515 kind2 = PyUnicode_KIND(substring);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011516 if (kind1 < kind2)
Benjamin Petersonb63f49f2012-05-03 18:31:07 -040011517 return PyLong_FromLong(0);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011518
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011519 len1 = PyUnicode_GET_LENGTH(self);
11520 len2 = PyUnicode_GET_LENGTH(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011521 ADJUST_INDICES(start, end, len1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011522 if (end - start < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011523 return PyLong_FromLong(0);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011524
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011525 buf1 = PyUnicode_DATA(self);
11526 buf2 = PyUnicode_DATA(substring);
11527 if (kind2 != kind1) {
11528 buf2 = _PyUnicode_AsKind(substring, kind1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011529 if (!buf2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011530 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011531 }
11532 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011533 case PyUnicode_1BYTE_KIND:
11534 iresult = ucs1lib_count(
11535 ((Py_UCS1*)buf1) + start, end - start,
11536 buf2, len2, PY_SSIZE_T_MAX
11537 );
11538 break;
11539 case PyUnicode_2BYTE_KIND:
11540 iresult = ucs2lib_count(
11541 ((Py_UCS2*)buf1) + start, end - start,
11542 buf2, len2, PY_SSIZE_T_MAX
11543 );
11544 break;
11545 case PyUnicode_4BYTE_KIND:
11546 iresult = ucs4lib_count(
11547 ((Py_UCS4*)buf1) + start, end - start,
11548 buf2, len2, PY_SSIZE_T_MAX
11549 );
11550 break;
11551 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011552 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011553 }
11554
11555 result = PyLong_FromSsize_t(iresult);
11556
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011557 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011558 PyMem_Free(buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011559
Guido van Rossumd57fd912000-03-10 22:53:23 +000011560 return result;
11561}
11562
INADA Naoki3ae20562017-01-16 20:41:20 +090011563/*[clinic input]
11564str.encode as unicode_encode
11565
11566 encoding: str(c_default="NULL") = 'utf-8'
11567 The encoding in which to encode the string.
11568 errors: str(c_default="NULL") = 'strict'
11569 The error handling scheme to use for encoding errors.
11570 The default is 'strict' meaning that encoding errors raise a
11571 UnicodeEncodeError. Other possible values are 'ignore', 'replace' and
11572 'xmlcharrefreplace' as well as any other name registered with
11573 codecs.register_error that can handle UnicodeEncodeErrors.
11574
11575Encode the string using the codec registered for encoding.
11576[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011577
11578static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090011579unicode_encode_impl(PyObject *self, const char *encoding, const char *errors)
INADA Naoki15f94592017-01-16 21:49:13 +090011580/*[clinic end generated code: output=bf78b6e2a9470e3c input=f0a9eb293d08fe02]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011581{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011582 return PyUnicode_AsEncodedString(self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +000011583}
11584
INADA Naoki3ae20562017-01-16 20:41:20 +090011585/*[clinic input]
11586str.expandtabs as unicode_expandtabs
Guido van Rossumd57fd912000-03-10 22:53:23 +000011587
INADA Naoki3ae20562017-01-16 20:41:20 +090011588 tabsize: int = 8
11589
11590Return a copy where all tab characters are expanded using spaces.
11591
11592If tabsize is not given, a tab size of 8 characters is assumed.
11593[clinic start generated code]*/
11594
11595static PyObject *
11596unicode_expandtabs_impl(PyObject *self, int tabsize)
11597/*[clinic end generated code: output=3457c5dcee26928f input=8a01914034af4c85]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011598{
Antoine Pitroue71d5742011-10-04 15:55:09 +020011599 Py_ssize_t i, j, line_pos, src_len, incr;
11600 Py_UCS4 ch;
11601 PyObject *u;
11602 void *src_data, *dest_data;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011603 int kind;
Antoine Pitroue19aa382011-10-04 16:04:01 +020011604 int found;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011605
Antoine Pitrou22425222011-10-04 19:10:51 +020011606 if (PyUnicode_READY(self) == -1)
11607 return NULL;
11608
Thomas Wouters7e474022000-07-16 12:04:32 +000011609 /* First pass: determine size of output string */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011610 src_len = PyUnicode_GET_LENGTH(self);
11611 i = j = line_pos = 0;
11612 kind = PyUnicode_KIND(self);
11613 src_data = PyUnicode_DATA(self);
Antoine Pitroue19aa382011-10-04 16:04:01 +020011614 found = 0;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011615 for (; i < src_len; i++) {
11616 ch = PyUnicode_READ(kind, src_data, i);
11617 if (ch == '\t') {
Antoine Pitroue19aa382011-10-04 16:04:01 +020011618 found = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000011619 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011620 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
Benjamin Peterson29060642009-01-31 22:14:21 +000011621 if (j > PY_SSIZE_T_MAX - incr)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011622 goto overflow;
11623 line_pos += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011624 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011625 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011626 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011627 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000011628 if (j > PY_SSIZE_T_MAX - 1)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011629 goto overflow;
11630 line_pos++;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011631 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011632 if (ch == '\n' || ch == '\r')
11633 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011634 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011635 }
Victor Stinnerc4b49542011-12-11 22:44:26 +010011636 if (!found)
11637 return unicode_result_unchanged(self);
Guido van Rossumcd16bf62007-06-13 18:07:49 +000011638
Guido van Rossumd57fd912000-03-10 22:53:23 +000011639 /* Second pass: create output string and fill it */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011640 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011641 if (!u)
11642 return NULL;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011643 dest_data = PyUnicode_DATA(u);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011644
Antoine Pitroue71d5742011-10-04 15:55:09 +020011645 i = j = line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011646
Antoine Pitroue71d5742011-10-04 15:55:09 +020011647 for (; i < src_len; i++) {
11648 ch = PyUnicode_READ(kind, src_data, i);
11649 if (ch == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000011650 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011651 incr = tabsize - (line_pos % tabsize);
11652 line_pos += incr;
Victor Stinner59423e32018-11-26 13:40:01 +010011653 unicode_fill(kind, dest_data, ' ', j, incr);
Victor Stinnerda79e632012-02-22 13:37:04 +010011654 j += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011655 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011656 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011657 else {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011658 line_pos++;
11659 PyUnicode_WRITE(kind, dest_data, j, ch);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011660 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011661 if (ch == '\n' || ch == '\r')
11662 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011663 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011664 }
11665 assert (j == PyUnicode_GET_LENGTH(u));
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010011666 return unicode_result(u);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011667
Antoine Pitroue71d5742011-10-04 15:55:09 +020011668 overflow:
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011669 PyErr_SetString(PyExc_OverflowError, "new string is too long");
11670 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011671}
11672
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011673PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011674 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011675\n\
11676Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080011677such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011678arguments start and end are interpreted as in slice notation.\n\
11679\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011680Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011681
11682static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011683unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011684{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011685 /* initialize variables to prevent gcc warning */
11686 PyObject *substring = NULL;
11687 Py_ssize_t start = 0;
11688 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011689 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011690
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011691 if (!parse_args_finds_unicode("find", args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011692 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011693
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011694 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011695 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011696
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011697 result = any_find_slice(self, substring, start, end, 1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011698
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011699 if (result == -2)
11700 return NULL;
11701
Christian Heimes217cfd12007-12-02 14:31:20 +000011702 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011703}
11704
11705static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011706unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011707{
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011708 void *data;
11709 enum PyUnicode_Kind kind;
11710 Py_UCS4 ch;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011711
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +030011712 if (!PyUnicode_Check(self)) {
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011713 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011714 return NULL;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011715 }
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +030011716 if (PyUnicode_READY(self) == -1) {
11717 return NULL;
11718 }
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011719 if (index < 0 || index >= PyUnicode_GET_LENGTH(self)) {
11720 PyErr_SetString(PyExc_IndexError, "string index out of range");
11721 return NULL;
11722 }
11723 kind = PyUnicode_KIND(self);
11724 data = PyUnicode_DATA(self);
11725 ch = PyUnicode_READ(kind, data, index);
Victor Stinner985a82a2014-01-03 12:53:47 +010011726 return unicode_char(ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011727}
11728
Guido van Rossumc2504932007-09-18 19:42:40 +000011729/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +010011730 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +000011731static Py_hash_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011732unicode_hash(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011733{
Gregory P. Smith27cbcd62012-12-10 18:15:46 -080011734 Py_uhash_t x; /* Unsigned for defined overflow behavior. */
Guido van Rossumc2504932007-09-18 19:42:40 +000011735
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011736#ifdef Py_DEBUG
Benjamin Peterson69e97272012-02-21 11:08:50 -050011737 assert(_Py_HashSecret_Initialized);
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011738#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011739 if (_PyUnicode_HASH(self) != -1)
11740 return _PyUnicode_HASH(self);
11741 if (PyUnicode_READY(self) == -1)
11742 return -1;
animalizea1d14252019-01-02 20:16:06 +080011743
Christian Heimes985ecdc2013-11-20 11:46:18 +010011744 x = _Py_HashBytes(PyUnicode_DATA(self),
11745 PyUnicode_GET_LENGTH(self) * PyUnicode_KIND(self));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011746 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +000011747 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011748}
11749
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011750PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011751 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011752\n\
oldkaa0735f2018-02-02 16:52:55 +080011753Return the lowest index in S where substring sub is found,\n\
Lisa Roach43ba8862017-04-04 22:36:22 -070011754such that sub is contained within S[start:end]. Optional\n\
11755arguments start and end are interpreted as in slice notation.\n\
11756\n\
11757Raises ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011758
11759static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011760unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011761{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011762 /* initialize variables to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +000011763 Py_ssize_t result;
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011764 PyObject *substring = NULL;
11765 Py_ssize_t start = 0;
11766 Py_ssize_t end = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011767
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011768 if (!parse_args_finds_unicode("index", args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011769 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011770
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011771 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011772 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011773
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011774 result = any_find_slice(self, substring, start, end, 1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011775
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011776 if (result == -2)
11777 return NULL;
11778
Guido van Rossumd57fd912000-03-10 22:53:23 +000011779 if (result < 0) {
11780 PyErr_SetString(PyExc_ValueError, "substring not found");
11781 return NULL;
11782 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011783
Christian Heimes217cfd12007-12-02 14:31:20 +000011784 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011785}
11786
INADA Naoki3ae20562017-01-16 20:41:20 +090011787/*[clinic input]
INADA Naokia49ac992018-01-27 14:06:21 +090011788str.isascii as unicode_isascii
11789
11790Return True if all characters in the string are ASCII, False otherwise.
11791
11792ASCII characters have code points in the range U+0000-U+007F.
11793Empty string is ASCII too.
11794[clinic start generated code]*/
11795
11796static PyObject *
11797unicode_isascii_impl(PyObject *self)
11798/*[clinic end generated code: output=c5910d64b5a8003f input=5a43cbc6399621d5]*/
11799{
11800 if (PyUnicode_READY(self) == -1) {
11801 return NULL;
11802 }
11803 return PyBool_FromLong(PyUnicode_IS_ASCII(self));
11804}
11805
11806/*[clinic input]
INADA Naoki3ae20562017-01-16 20:41:20 +090011807str.islower as unicode_islower
Guido van Rossumd57fd912000-03-10 22:53:23 +000011808
INADA Naoki3ae20562017-01-16 20:41:20 +090011809Return True if the string is a lowercase string, False otherwise.
11810
11811A string is lowercase if all cased characters in the string are lowercase and
11812there is at least one cased character in the string.
11813[clinic start generated code]*/
11814
11815static PyObject *
11816unicode_islower_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011817/*[clinic end generated code: output=dbd41995bd005b81 input=acec65ac6821ae47]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011818{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011819 Py_ssize_t i, length;
11820 int kind;
11821 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011822 int cased;
11823
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011824 if (PyUnicode_READY(self) == -1)
11825 return NULL;
11826 length = PyUnicode_GET_LENGTH(self);
11827 kind = PyUnicode_KIND(self);
11828 data = PyUnicode_DATA(self);
11829
Guido van Rossumd57fd912000-03-10 22:53:23 +000011830 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011831 if (length == 1)
11832 return PyBool_FromLong(
11833 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011834
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011835 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011836 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011837 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011838
Guido van Rossumd57fd912000-03-10 22:53:23 +000011839 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011840 for (i = 0; i < length; i++) {
11841 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011842
Benjamin Peterson29060642009-01-31 22:14:21 +000011843 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011844 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000011845 else if (!cased && Py_UNICODE_ISLOWER(ch))
11846 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011847 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011848 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011849}
11850
INADA Naoki3ae20562017-01-16 20:41:20 +090011851/*[clinic input]
11852str.isupper as unicode_isupper
Guido van Rossumd57fd912000-03-10 22:53:23 +000011853
INADA Naoki3ae20562017-01-16 20:41:20 +090011854Return True if the string is an uppercase string, False otherwise.
11855
11856A string is uppercase if all cased characters in the string are uppercase and
11857there is at least one cased character in the string.
11858[clinic start generated code]*/
11859
11860static PyObject *
11861unicode_isupper_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011862/*[clinic end generated code: output=049209c8e7f15f59 input=e9b1feda5d17f2d3]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011863{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011864 Py_ssize_t i, length;
11865 int kind;
11866 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011867 int cased;
11868
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011869 if (PyUnicode_READY(self) == -1)
11870 return NULL;
11871 length = PyUnicode_GET_LENGTH(self);
11872 kind = PyUnicode_KIND(self);
11873 data = PyUnicode_DATA(self);
11874
Guido van Rossumd57fd912000-03-10 22:53:23 +000011875 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011876 if (length == 1)
11877 return PyBool_FromLong(
11878 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011879
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011880 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011881 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011882 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011883
Guido van Rossumd57fd912000-03-10 22:53:23 +000011884 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011885 for (i = 0; i < length; i++) {
11886 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011887
Benjamin Peterson29060642009-01-31 22:14:21 +000011888 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011889 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000011890 else if (!cased && Py_UNICODE_ISUPPER(ch))
11891 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011892 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011893 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011894}
11895
INADA Naoki3ae20562017-01-16 20:41:20 +090011896/*[clinic input]
11897str.istitle as unicode_istitle
Guido van Rossumd57fd912000-03-10 22:53:23 +000011898
INADA Naoki3ae20562017-01-16 20:41:20 +090011899Return True if the string is a title-cased string, False otherwise.
11900
11901In a title-cased string, upper- and title-case characters may only
11902follow uncased characters and lowercase characters only cased ones.
11903[clinic start generated code]*/
11904
11905static PyObject *
11906unicode_istitle_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011907/*[clinic end generated code: output=e9bf6eb91f5d3f0e input=98d32bd2e1f06f8c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011908{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011909 Py_ssize_t i, length;
11910 int kind;
11911 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011912 int cased, previous_is_cased;
11913
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011914 if (PyUnicode_READY(self) == -1)
11915 return NULL;
11916 length = PyUnicode_GET_LENGTH(self);
11917 kind = PyUnicode_KIND(self);
11918 data = PyUnicode_DATA(self);
11919
Guido van Rossumd57fd912000-03-10 22:53:23 +000011920 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011921 if (length == 1) {
11922 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11923 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
11924 (Py_UNICODE_ISUPPER(ch) != 0));
11925 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011926
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011927 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011928 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011929 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011930
Guido van Rossumd57fd912000-03-10 22:53:23 +000011931 cased = 0;
11932 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011933 for (i = 0; i < length; i++) {
11934 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011935
Benjamin Peterson29060642009-01-31 22:14:21 +000011936 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
11937 if (previous_is_cased)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011938 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000011939 previous_is_cased = 1;
11940 cased = 1;
11941 }
11942 else if (Py_UNICODE_ISLOWER(ch)) {
11943 if (!previous_is_cased)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011944 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000011945 previous_is_cased = 1;
11946 cased = 1;
11947 }
11948 else
11949 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011950 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011951 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011952}
11953
INADA Naoki3ae20562017-01-16 20:41:20 +090011954/*[clinic input]
11955str.isspace as unicode_isspace
Guido van Rossumd57fd912000-03-10 22:53:23 +000011956
INADA Naoki3ae20562017-01-16 20:41:20 +090011957Return True if the string is a whitespace string, False otherwise.
11958
11959A string is whitespace if all characters in the string are whitespace and there
11960is at least one character in the string.
11961[clinic start generated code]*/
11962
11963static PyObject *
11964unicode_isspace_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011965/*[clinic end generated code: output=163a63bfa08ac2b9 input=fe462cb74f8437d8]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011966{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011967 Py_ssize_t i, length;
11968 int kind;
11969 void *data;
11970
11971 if (PyUnicode_READY(self) == -1)
11972 return NULL;
11973 length = PyUnicode_GET_LENGTH(self);
11974 kind = PyUnicode_KIND(self);
11975 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011976
Guido van Rossumd57fd912000-03-10 22:53:23 +000011977 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011978 if (length == 1)
11979 return PyBool_FromLong(
11980 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011981
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011982 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011983 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011984 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011985
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011986 for (i = 0; i < length; i++) {
11987 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011988 if (!Py_UNICODE_ISSPACE(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011989 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011990 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011991 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011992}
11993
INADA Naoki3ae20562017-01-16 20:41:20 +090011994/*[clinic input]
11995str.isalpha as unicode_isalpha
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011996
INADA Naoki3ae20562017-01-16 20:41:20 +090011997Return True if the string is an alphabetic string, False otherwise.
11998
11999A string is alphabetic if all characters in the string are alphabetic and there
12000is at least one character in the string.
12001[clinic start generated code]*/
12002
12003static PyObject *
12004unicode_isalpha_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012005/*[clinic end generated code: output=cc81b9ac3883ec4f input=d0fd18a96cbca5eb]*/
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012006{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012007 Py_ssize_t i, length;
12008 int kind;
12009 void *data;
12010
12011 if (PyUnicode_READY(self) == -1)
12012 return NULL;
12013 length = PyUnicode_GET_LENGTH(self);
12014 kind = PyUnicode_KIND(self);
12015 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012016
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012017 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012018 if (length == 1)
12019 return PyBool_FromLong(
12020 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012021
12022 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012023 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012024 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012025
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012026 for (i = 0; i < length; i++) {
12027 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012028 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012029 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012030 Py_RETURN_TRUE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012031}
12032
INADA Naoki3ae20562017-01-16 20:41:20 +090012033/*[clinic input]
12034str.isalnum as unicode_isalnum
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012035
INADA Naoki3ae20562017-01-16 20:41:20 +090012036Return True if the string is an alpha-numeric string, False otherwise.
12037
12038A string is alpha-numeric if all characters in the string are alpha-numeric and
12039there is at least one character in the string.
12040[clinic start generated code]*/
12041
12042static PyObject *
12043unicode_isalnum_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012044/*[clinic end generated code: output=a5a23490ffc3660c input=5c6579bf2e04758c]*/
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012045{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012046 int kind;
12047 void *data;
12048 Py_ssize_t len, i;
12049
12050 if (PyUnicode_READY(self) == -1)
12051 return NULL;
12052
12053 kind = PyUnicode_KIND(self);
12054 data = PyUnicode_DATA(self);
12055 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012056
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012057 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012058 if (len == 1) {
12059 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
12060 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
12061 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012062
12063 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012064 if (len == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012065 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012066
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012067 for (i = 0; i < len; i++) {
12068 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030012069 if (!Py_UNICODE_ISALNUM(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012070 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012071 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012072 Py_RETURN_TRUE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012073}
12074
INADA Naoki3ae20562017-01-16 20:41:20 +090012075/*[clinic input]
12076str.isdecimal as unicode_isdecimal
Guido van Rossumd57fd912000-03-10 22:53:23 +000012077
INADA Naoki3ae20562017-01-16 20:41:20 +090012078Return True if the string is a decimal string, False otherwise.
12079
12080A string is a decimal string if all characters in the string are decimal and
12081there is at least one character in the string.
12082[clinic start generated code]*/
12083
12084static PyObject *
12085unicode_isdecimal_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012086/*[clinic end generated code: output=fb2dcdb62d3fc548 input=336bc97ab4c8268f]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012087{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012088 Py_ssize_t i, length;
12089 int kind;
12090 void *data;
12091
12092 if (PyUnicode_READY(self) == -1)
12093 return NULL;
12094 length = PyUnicode_GET_LENGTH(self);
12095 kind = PyUnicode_KIND(self);
12096 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012097
Guido van Rossumd57fd912000-03-10 22:53:23 +000012098 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012099 if (length == 1)
12100 return PyBool_FromLong(
12101 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012102
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012103 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012104 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012105 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012106
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012107 for (i = 0; i < length; i++) {
12108 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012109 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012110 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012111 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012112}
12113
INADA Naoki3ae20562017-01-16 20:41:20 +090012114/*[clinic input]
12115str.isdigit as unicode_isdigit
Guido van Rossumd57fd912000-03-10 22:53:23 +000012116
INADA Naoki3ae20562017-01-16 20:41:20 +090012117Return True if the string is a digit string, False otherwise.
12118
12119A string is a digit string if all characters in the string are digits and there
12120is at least one character in the string.
12121[clinic start generated code]*/
12122
12123static PyObject *
12124unicode_isdigit_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012125/*[clinic end generated code: output=10a6985311da6858 input=901116c31deeea4c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012126{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012127 Py_ssize_t i, length;
12128 int kind;
12129 void *data;
12130
12131 if (PyUnicode_READY(self) == -1)
12132 return NULL;
12133 length = PyUnicode_GET_LENGTH(self);
12134 kind = PyUnicode_KIND(self);
12135 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012136
Guido van Rossumd57fd912000-03-10 22:53:23 +000012137 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012138 if (length == 1) {
12139 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
12140 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
12141 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012142
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012143 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012144 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012145 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012146
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012147 for (i = 0; i < length; i++) {
12148 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012149 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012150 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012151 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012152}
12153
INADA Naoki3ae20562017-01-16 20:41:20 +090012154/*[clinic input]
12155str.isnumeric as unicode_isnumeric
Guido van Rossumd57fd912000-03-10 22:53:23 +000012156
INADA Naoki3ae20562017-01-16 20:41:20 +090012157Return True if the string is a numeric string, False otherwise.
12158
12159A string is numeric if all characters in the string are numeric and there is at
12160least one character in the string.
12161[clinic start generated code]*/
12162
12163static PyObject *
12164unicode_isnumeric_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012165/*[clinic end generated code: output=9172a32d9013051a input=722507db976f826c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012166{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012167 Py_ssize_t i, length;
12168 int kind;
12169 void *data;
12170
12171 if (PyUnicode_READY(self) == -1)
12172 return NULL;
12173 length = PyUnicode_GET_LENGTH(self);
12174 kind = PyUnicode_KIND(self);
12175 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012176
Guido van Rossumd57fd912000-03-10 22:53:23 +000012177 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012178 if (length == 1)
12179 return PyBool_FromLong(
12180 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012181
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012182 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012183 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012184 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012185
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012186 for (i = 0; i < length; i++) {
12187 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012188 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012189 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012190 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012191}
12192
Martin v. Löwis47383402007-08-15 07:32:56 +000012193int
12194PyUnicode_IsIdentifier(PyObject *self)
12195{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012196 int kind;
12197 void *data;
12198 Py_ssize_t i;
Ezio Melotti93e7afc2011-08-22 14:08:38 +030012199 Py_UCS4 first;
Martin v. Löwis47383402007-08-15 07:32:56 +000012200
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012201 if (PyUnicode_READY(self) == -1) {
12202 Py_FatalError("identifier not ready");
Benjamin Peterson29060642009-01-31 22:14:21 +000012203 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012204 }
12205
12206 /* Special case for empty strings */
12207 if (PyUnicode_GET_LENGTH(self) == 0)
12208 return 0;
12209 kind = PyUnicode_KIND(self);
12210 data = PyUnicode_DATA(self);
Martin v. Löwis47383402007-08-15 07:32:56 +000012211
12212 /* PEP 3131 says that the first character must be in
12213 XID_Start and subsequent characters in XID_Continue,
12214 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000012215 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000012216 letters, digits, underscore). However, given the current
12217 definition of XID_Start and XID_Continue, it is sufficient
12218 to check just for these, except that _ must be allowed
12219 as starting an identifier. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012220 first = PyUnicode_READ(kind, data, 0);
Benjamin Petersonf413b802011-08-12 22:17:18 -050012221 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
Martin v. Löwis47383402007-08-15 07:32:56 +000012222 return 0;
12223
Benjamin Peterson9c6e6a02011-09-28 08:09:05 -040012224 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012225 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000012226 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +000012227 return 1;
12228}
12229
INADA Naoki3ae20562017-01-16 20:41:20 +090012230/*[clinic input]
12231str.isidentifier as unicode_isidentifier
Martin v. Löwis47383402007-08-15 07:32:56 +000012232
INADA Naoki3ae20562017-01-16 20:41:20 +090012233Return True if the string is a valid Python identifier, False otherwise.
12234
Sanyam Khuranaffc5a142018-10-08 12:23:32 +053012235Call keyword.iskeyword(s) to test whether string s is a reserved identifier,
Emanuele Gaifasfc8205c2018-10-08 12:44:47 +020012236such as "def" or "class".
INADA Naoki3ae20562017-01-16 20:41:20 +090012237[clinic start generated code]*/
12238
12239static PyObject *
12240unicode_isidentifier_impl(PyObject *self)
Emanuele Gaifasfc8205c2018-10-08 12:44:47 +020012241/*[clinic end generated code: output=fe585a9666572905 input=2d807a104f21c0c5]*/
Martin v. Löwis47383402007-08-15 07:32:56 +000012242{
12243 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
12244}
12245
INADA Naoki3ae20562017-01-16 20:41:20 +090012246/*[clinic input]
12247str.isprintable as unicode_isprintable
Georg Brandl559e5d72008-06-11 18:37:52 +000012248
INADA Naoki3ae20562017-01-16 20:41:20 +090012249Return True if the string is printable, False otherwise.
12250
12251A string is printable if all of its characters are considered printable in
12252repr() or if it is empty.
12253[clinic start generated code]*/
12254
12255static PyObject *
12256unicode_isprintable_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012257/*[clinic end generated code: output=3ab9626cd32dd1a0 input=98a0e1c2c1813209]*/
Georg Brandl559e5d72008-06-11 18:37:52 +000012258{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012259 Py_ssize_t i, length;
12260 int kind;
12261 void *data;
12262
12263 if (PyUnicode_READY(self) == -1)
12264 return NULL;
12265 length = PyUnicode_GET_LENGTH(self);
12266 kind = PyUnicode_KIND(self);
12267 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000012268
12269 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012270 if (length == 1)
12271 return PyBool_FromLong(
12272 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000012273
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012274 for (i = 0; i < length; i++) {
12275 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000012276 Py_RETURN_FALSE;
12277 }
12278 }
12279 Py_RETURN_TRUE;
12280}
12281
INADA Naoki3ae20562017-01-16 20:41:20 +090012282/*[clinic input]
12283str.join as unicode_join
Guido van Rossumd57fd912000-03-10 22:53:23 +000012284
INADA Naoki3ae20562017-01-16 20:41:20 +090012285 iterable: object
12286 /
12287
12288Concatenate any number of strings.
12289
Martin Panter91a88662017-01-24 00:30:06 +000012290The string whose method is called is inserted in between each given string.
INADA Naoki3ae20562017-01-16 20:41:20 +090012291The result is returned as a new string.
12292
12293Example: '.'.join(['ab', 'pq', 'rs']) -> 'ab.pq.rs'
12294[clinic start generated code]*/
12295
12296static PyObject *
12297unicode_join(PyObject *self, PyObject *iterable)
Martin Panter91a88662017-01-24 00:30:06 +000012298/*[clinic end generated code: output=6857e7cecfe7bf98 input=2f70422bfb8fa189]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012299{
INADA Naoki3ae20562017-01-16 20:41:20 +090012300 return PyUnicode_Join(self, iterable);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012301}
12302
Martin v. Löwis18e16552006-02-15 17:27:45 +000012303static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012304unicode_length(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012305{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012306 if (PyUnicode_READY(self) == -1)
12307 return -1;
12308 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012309}
12310
INADA Naoki3ae20562017-01-16 20:41:20 +090012311/*[clinic input]
12312str.ljust as unicode_ljust
12313
12314 width: Py_ssize_t
12315 fillchar: Py_UCS4 = ' '
12316 /
12317
12318Return a left-justified string of length width.
12319
12320Padding is done using the specified fill character (default is a space).
12321[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012322
12323static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012324unicode_ljust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
12325/*[clinic end generated code: output=1cce0e0e0a0b84b3 input=3ab599e335e60a32]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012326{
Benjamin Petersonbac79492012-01-14 13:34:47 -050012327 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012328 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012329
Victor Stinnerc4b49542011-12-11 22:44:26 +010012330 if (PyUnicode_GET_LENGTH(self) >= width)
12331 return unicode_result_unchanged(self);
12332
12333 return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012334}
12335
INADA Naoki3ae20562017-01-16 20:41:20 +090012336/*[clinic input]
12337str.lower as unicode_lower
Guido van Rossumd57fd912000-03-10 22:53:23 +000012338
INADA Naoki3ae20562017-01-16 20:41:20 +090012339Return a copy of the string converted to lowercase.
12340[clinic start generated code]*/
12341
12342static PyObject *
12343unicode_lower_impl(PyObject *self)
12344/*[clinic end generated code: output=84ef9ed42efad663 input=60a2984b8beff23a]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012345{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050012346 if (PyUnicode_READY(self) == -1)
12347 return NULL;
12348 if (PyUnicode_IS_ASCII(self))
12349 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010012350 return case_operation(self, do_lower);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012351}
12352
/* Selectors telling the strip helpers which end(s) of the string to trim. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Method name for each selector above; used when formatting error messages.
   Designated initializers keep every name tied to its selector even if the
   entries are reordered, and the table itself is read-only. */
static const char * const stripfuncnames[] = {
    [LEFTSTRIP] = "lstrip",
    [RIGHTSTRIP] = "rstrip",
    [BOTHSTRIP] = "strip",
};

/* Look up the method name for a strip selector (no bounds checking). */
#define STRIPNAME(i) (stripfuncnames[i])
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012361
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012362/* externally visible for str.strip(unicode) */
12363PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012364_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012365{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012366 void *data;
12367 int kind;
12368 Py_ssize_t i, j, len;
12369 BLOOM_MASK sepmask;
Victor Stinnerb3a60142013-04-09 22:19:21 +020012370 Py_ssize_t seplen;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012371
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012372 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
12373 return NULL;
12374
12375 kind = PyUnicode_KIND(self);
12376 data = PyUnicode_DATA(self);
12377 len = PyUnicode_GET_LENGTH(self);
Victor Stinnerb3a60142013-04-09 22:19:21 +020012378 seplen = PyUnicode_GET_LENGTH(sepobj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012379 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
12380 PyUnicode_DATA(sepobj),
Victor Stinnerb3a60142013-04-09 22:19:21 +020012381 seplen);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012382
Benjamin Peterson14339b62009-01-31 16:36:08 +000012383 i = 0;
12384 if (striptype != RIGHTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020012385 while (i < len) {
12386 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12387 if (!BLOOM(sepmask, ch))
12388 break;
12389 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12390 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000012391 i++;
12392 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012393 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012394
Benjamin Peterson14339b62009-01-31 16:36:08 +000012395 j = len;
12396 if (striptype != LEFTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020012397 j--;
12398 while (j >= i) {
12399 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12400 if (!BLOOM(sepmask, ch))
12401 break;
12402 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12403 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000012404 j--;
Victor Stinnerb3a60142013-04-09 22:19:21 +020012405 }
12406
Benjamin Peterson29060642009-01-31 22:14:21 +000012407 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012408 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012409
Victor Stinner7931d9a2011-11-04 00:22:48 +010012410 return PyUnicode_Substring(self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012411}
12412
12413PyObject*
12414PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
12415{
12416 unsigned char *data;
12417 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020012418 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012419
Victor Stinnerde636f32011-10-01 03:55:54 +020012420 if (PyUnicode_READY(self) == -1)
12421 return NULL;
12422
Victor Stinner684d5fd2012-05-03 02:32:34 +020012423 length = PyUnicode_GET_LENGTH(self);
12424 end = Py_MIN(end, length);
Victor Stinnerde636f32011-10-01 03:55:54 +020012425
Victor Stinner684d5fd2012-05-03 02:32:34 +020012426 if (start == 0 && end == length)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012427 return unicode_result_unchanged(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012428
Victor Stinnerde636f32011-10-01 03:55:54 +020012429 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020012430 PyErr_SetString(PyExc_IndexError, "string index out of range");
12431 return NULL;
12432 }
Serhiy Storchaka678db842013-01-26 12:16:36 +020012433 if (start >= length || end < start)
12434 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner12bab6d2011-10-01 01:53:49 +020012435
Victor Stinner684d5fd2012-05-03 02:32:34 +020012436 length = end - start;
Victor Stinnerb9275c12011-10-05 14:01:42 +020012437 if (PyUnicode_IS_ASCII(self)) {
Victor Stinnerb9275c12011-10-05 14:01:42 +020012438 data = PyUnicode_1BYTE_DATA(self);
Victor Stinnerd3f08822012-05-29 12:57:52 +020012439 return _PyUnicode_FromASCII((char*)(data + start), length);
Victor Stinnerb9275c12011-10-05 14:01:42 +020012440 }
12441 else {
12442 kind = PyUnicode_KIND(self);
12443 data = PyUnicode_1BYTE_DATA(self);
12444 return PyUnicode_FromKindAndData(kind,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012445 data + kind * start,
Victor Stinnerb9275c12011-10-05 14:01:42 +020012446 length);
12447 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012448}
Guido van Rossumd57fd912000-03-10 22:53:23 +000012449
12450static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012451do_strip(PyObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012452{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012453 Py_ssize_t len, i, j;
12454
12455 if (PyUnicode_READY(self) == -1)
12456 return NULL;
12457
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012458 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012459
Victor Stinnercc7af722013-04-09 22:39:24 +020012460 if (PyUnicode_IS_ASCII(self)) {
12461 Py_UCS1 *data = PyUnicode_1BYTE_DATA(self);
12462
12463 i = 0;
12464 if (striptype != RIGHTSTRIP) {
12465 while (i < len) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020012466 Py_UCS1 ch = data[i];
Victor Stinnercc7af722013-04-09 22:39:24 +020012467 if (!_Py_ascii_whitespace[ch])
12468 break;
12469 i++;
12470 }
12471 }
12472
12473 j = len;
12474 if (striptype != LEFTSTRIP) {
12475 j--;
12476 while (j >= i) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020012477 Py_UCS1 ch = data[j];
Victor Stinnercc7af722013-04-09 22:39:24 +020012478 if (!_Py_ascii_whitespace[ch])
12479 break;
12480 j--;
12481 }
12482 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012483 }
12484 }
Victor Stinnercc7af722013-04-09 22:39:24 +020012485 else {
12486 int kind = PyUnicode_KIND(self);
12487 void *data = PyUnicode_DATA(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012488
Victor Stinnercc7af722013-04-09 22:39:24 +020012489 i = 0;
12490 if (striptype != RIGHTSTRIP) {
12491 while (i < len) {
12492 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12493 if (!Py_UNICODE_ISSPACE(ch))
12494 break;
12495 i++;
12496 }
Victor Stinner9c79e412013-04-09 22:21:08 +020012497 }
Victor Stinnercc7af722013-04-09 22:39:24 +020012498
12499 j = len;
12500 if (striptype != LEFTSTRIP) {
12501 j--;
12502 while (j >= i) {
12503 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12504 if (!Py_UNICODE_ISSPACE(ch))
12505 break;
12506 j--;
12507 }
12508 j++;
12509 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012510 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012511
Victor Stinner7931d9a2011-11-04 00:22:48 +010012512 return PyUnicode_Substring(self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012513}
12514
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012515
12516static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012517do_argstrip(PyObject *self, int striptype, PyObject *sep)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012518{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012519 if (sep != NULL && sep != Py_None) {
12520 if (PyUnicode_Check(sep))
12521 return _PyUnicode_XStrip(self, striptype, sep);
12522 else {
12523 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000012524 "%s arg must be None or str",
12525 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000012526 return NULL;
12527 }
12528 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012529
Benjamin Peterson14339b62009-01-31 16:36:08 +000012530 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012531}
12532
12533
INADA Naoki3ae20562017-01-16 20:41:20 +090012534/*[clinic input]
12535str.strip as unicode_strip
12536
12537 chars: object = None
12538 /
12539
Return a copy of the string with leading and trailing whitespace removed.
INADA Naoki3ae20562017-01-16 20:41:20 +090012541
12542If chars is given and not None, remove characters in chars instead.
12543[clinic start generated code]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012544
12545static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012546unicode_strip_impl(PyObject *self, PyObject *chars)
Victor Stinner0c4a8282017-01-17 02:21:47 +010012547/*[clinic end generated code: output=ca19018454345d57 input=eefe24a1059c352b]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012548{
INADA Naoki3ae20562017-01-16 20:41:20 +090012549 return do_argstrip(self, BOTHSTRIP, chars);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012550}
12551
12552
INADA Naoki3ae20562017-01-16 20:41:20 +090012553/*[clinic input]
12554str.lstrip as unicode_lstrip
12555
12556 chars: object = NULL
12557 /
12558
12559Return a copy of the string with leading whitespace removed.
12560
12561If chars is given and not None, remove characters in chars instead.
12562[clinic start generated code]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012563
12564static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012565unicode_lstrip_impl(PyObject *self, PyObject *chars)
12566/*[clinic end generated code: output=3b43683251f79ca7 input=9e56f3c45f5ff4c3]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012567{
INADA Naoki3ae20562017-01-16 20:41:20 +090012568 return do_argstrip(self, LEFTSTRIP, chars);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012569}
12570
12571
INADA Naoki3ae20562017-01-16 20:41:20 +090012572/*[clinic input]
12573str.rstrip as unicode_rstrip
12574
12575 chars: object = NULL
12576 /
12577
12578Return a copy of the string with trailing whitespace removed.
12579
12580If chars is given and not None, remove characters in chars instead.
12581[clinic start generated code]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012582
12583static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012584unicode_rstrip_impl(PyObject *self, PyObject *chars)
12585/*[clinic end generated code: output=4a59230017cc3b7a input=ac89d0219cb411ee]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012586{
INADA Naoki3ae20562017-01-16 20:41:20 +090012587 return do_argstrip(self, RIGHTSTRIP, chars);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012588}
12589
12590
Guido van Rossumd57fd912000-03-10 22:53:23 +000012591static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012592unicode_repeat(PyObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012593{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012594 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012595 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012596
Serhiy Storchaka05997252013-01-26 12:14:02 +020012597 if (len < 1)
12598 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +000012599
Victor Stinnerc4b49542011-12-11 22:44:26 +010012600 /* no repeat, return original string */
12601 if (len == 1)
12602 return unicode_result_unchanged(str);
Tim Peters8f422462000-09-09 06:13:41 +000012603
Benjamin Petersonbac79492012-01-14 13:34:47 -050012604 if (PyUnicode_READY(str) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012605 return NULL;
12606
Victor Stinnerc759f3e2011-10-01 03:09:58 +020012607 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020012608 PyErr_SetString(PyExc_OverflowError,
12609 "repeated string is too long");
12610 return NULL;
12611 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012612 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012613
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012614 u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012615 if (!u)
12616 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012617 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012618
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012619 if (PyUnicode_GET_LENGTH(str) == 1) {
12620 const int kind = PyUnicode_KIND(str);
12621 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
Victor Stinner73f53b52011-12-18 03:26:31 +010012622 if (kind == PyUnicode_1BYTE_KIND) {
12623 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012624 memset(to, (unsigned char)fill_char, len);
Victor Stinner73f53b52011-12-18 03:26:31 +010012625 }
12626 else if (kind == PyUnicode_2BYTE_KIND) {
12627 Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012628 for (n = 0; n < len; ++n)
Victor Stinner73f53b52011-12-18 03:26:31 +010012629 ucs2[n] = fill_char;
12630 } else {
12631 Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u);
12632 assert(kind == PyUnicode_4BYTE_KIND);
12633 for (n = 0; n < len; ++n)
12634 ucs4[n] = fill_char;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012635 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012636 }
12637 else {
12638 /* number of characters copied this far */
12639 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012640 const Py_ssize_t char_size = PyUnicode_KIND(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012641 char *to = (char *) PyUnicode_DATA(u);
Christian Heimesf051e432016-09-13 20:22:02 +020012642 memcpy(to, PyUnicode_DATA(str),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012643 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000012644 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012645 n = (done <= nchars-done) ? done : nchars-done;
Christian Heimesf051e432016-09-13 20:22:02 +020012646 memcpy(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012647 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000012648 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012649 }
12650
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012651 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012652 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012653}
12654
Alexander Belopolsky40018472011-02-26 01:02:56 +000012655PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012656PyUnicode_Replace(PyObject *str,
12657 PyObject *substr,
12658 PyObject *replstr,
Alexander Belopolsky40018472011-02-26 01:02:56 +000012659 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012660{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012661 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0 ||
12662 ensure_unicode(replstr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012663 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012664 return replace(str, substr, replstr, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012665}
12666
INADA Naoki3ae20562017-01-16 20:41:20 +090012667/*[clinic input]
12668str.replace as unicode_replace
Guido van Rossumd57fd912000-03-10 22:53:23 +000012669
INADA Naoki3ae20562017-01-16 20:41:20 +090012670 old: unicode
12671 new: unicode
12672 count: Py_ssize_t = -1
12673 Maximum number of occurrences to replace.
12674 -1 (the default value) means replace all occurrences.
12675 /
12676
12677Return a copy with all occurrences of substring old replaced by new.
12678
12679If the optional argument count is given, only the first count occurrences are
12680replaced.
12681[clinic start generated code]*/
12682
12683static PyObject *
12684unicode_replace_impl(PyObject *self, PyObject *old, PyObject *new,
12685 Py_ssize_t count)
12686/*[clinic end generated code: output=b63f1a8b5eebf448 input=147d12206276ebeb]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012687{
Benjamin Peterson22a29702012-01-02 09:00:30 -060012688 if (PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000012689 return NULL;
INADA Naoki3ae20562017-01-16 20:41:20 +090012690 return replace(self, old, new, count);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012691}
12692
Alexander Belopolsky40018472011-02-26 01:02:56 +000012693static PyObject *
12694unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012695{
Walter Dörwald79e913e2007-05-12 11:08:06 +000012696 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012697 Py_ssize_t isize;
12698 Py_ssize_t osize, squote, dquote, i, o;
12699 Py_UCS4 max, quote;
Victor Stinner55c08782013-04-14 18:45:39 +020012700 int ikind, okind, unchanged;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012701 void *idata, *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000012702
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012703 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000012704 return NULL;
12705
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012706 isize = PyUnicode_GET_LENGTH(unicode);
12707 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012708
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012709 /* Compute length of output, quote characters, and
12710 maximum character */
Victor Stinner55c08782013-04-14 18:45:39 +020012711 osize = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012712 max = 127;
12713 squote = dquote = 0;
12714 ikind = PyUnicode_KIND(unicode);
12715 for (i = 0; i < isize; i++) {
12716 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Benjamin Peterson736b8012014-09-29 23:02:15 -040012717 Py_ssize_t incr = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012718 switch (ch) {
Benjamin Peterson736b8012014-09-29 23:02:15 -040012719 case '\'': squote++; break;
12720 case '"': dquote++; break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012721 case '\\': case '\t': case '\r': case '\n':
Benjamin Peterson736b8012014-09-29 23:02:15 -040012722 incr = 2;
12723 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012724 default:
12725 /* Fast-path ASCII */
12726 if (ch < ' ' || ch == 0x7f)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012727 incr = 4; /* \xHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012728 else if (ch < 0x7f)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012729 ;
12730 else if (Py_UNICODE_ISPRINTABLE(ch))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012731 max = ch > max ? ch : max;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012732 else if (ch < 0x100)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012733 incr = 4; /* \xHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012734 else if (ch < 0x10000)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012735 incr = 6; /* \uHHHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012736 else
Benjamin Peterson736b8012014-09-29 23:02:15 -040012737 incr = 10; /* \uHHHHHHHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012738 }
Benjamin Peterson736b8012014-09-29 23:02:15 -040012739 if (osize > PY_SSIZE_T_MAX - incr) {
12740 PyErr_SetString(PyExc_OverflowError,
12741 "string is too long to generate repr");
12742 return NULL;
12743 }
12744 osize += incr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012745 }
12746
12747 quote = '\'';
Victor Stinner55c08782013-04-14 18:45:39 +020012748 unchanged = (osize == isize);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012749 if (squote) {
Victor Stinner55c08782013-04-14 18:45:39 +020012750 unchanged = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012751 if (dquote)
12752 /* Both squote and dquote present. Use squote,
12753 and escape them */
12754 osize += squote;
12755 else
12756 quote = '"';
12757 }
Victor Stinner55c08782013-04-14 18:45:39 +020012758 osize += 2; /* quotes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012759
12760 repr = PyUnicode_New(osize, max);
12761 if (repr == NULL)
12762 return NULL;
12763 okind = PyUnicode_KIND(repr);
12764 odata = PyUnicode_DATA(repr);
12765
12766 PyUnicode_WRITE(okind, odata, 0, quote);
12767 PyUnicode_WRITE(okind, odata, osize-1, quote);
Victor Stinner55c08782013-04-14 18:45:39 +020012768 if (unchanged) {
12769 _PyUnicode_FastCopyCharacters(repr, 1,
12770 unicode, 0,
12771 isize);
12772 }
12773 else {
12774 for (i = 0, o = 1; i < isize; i++) {
12775 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012776
Victor Stinner55c08782013-04-14 18:45:39 +020012777 /* Escape quotes and backslashes */
12778 if ((ch == quote) || (ch == '\\')) {
Kristján Valur Jónsson55e5dc82012-06-06 21:58:08 +000012779 PyUnicode_WRITE(okind, odata, o++, '\\');
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012780 PyUnicode_WRITE(okind, odata, o++, ch);
Victor Stinner55c08782013-04-14 18:45:39 +020012781 continue;
12782 }
12783
12784 /* Map special whitespace to '\t', \n', '\r' */
12785 if (ch == '\t') {
12786 PyUnicode_WRITE(okind, odata, o++, '\\');
12787 PyUnicode_WRITE(okind, odata, o++, 't');
12788 }
12789 else if (ch == '\n') {
12790 PyUnicode_WRITE(okind, odata, o++, '\\');
12791 PyUnicode_WRITE(okind, odata, o++, 'n');
12792 }
12793 else if (ch == '\r') {
12794 PyUnicode_WRITE(okind, odata, o++, '\\');
12795 PyUnicode_WRITE(okind, odata, o++, 'r');
12796 }
12797
12798 /* Map non-printable US ASCII to '\xhh' */
12799 else if (ch < ' ' || ch == 0x7F) {
12800 PyUnicode_WRITE(okind, odata, o++, '\\');
12801 PyUnicode_WRITE(okind, odata, o++, 'x');
12802 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12803 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12804 }
12805
12806 /* Copy ASCII characters as-is */
12807 else if (ch < 0x7F) {
12808 PyUnicode_WRITE(okind, odata, o++, ch);
12809 }
12810
12811 /* Non-ASCII characters */
12812 else {
12813 /* Map Unicode whitespace and control characters
12814 (categories Z* and C* except ASCII space)
12815 */
12816 if (!Py_UNICODE_ISPRINTABLE(ch)) {
12817 PyUnicode_WRITE(okind, odata, o++, '\\');
12818 /* Map 8-bit characters to '\xhh' */
12819 if (ch <= 0xff) {
12820 PyUnicode_WRITE(okind, odata, o++, 'x');
12821 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12822 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12823 }
12824 /* Map 16-bit characters to '\uxxxx' */
12825 else if (ch <= 0xffff) {
12826 PyUnicode_WRITE(okind, odata, o++, 'u');
12827 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12828 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12829 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12830 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12831 }
12832 /* Map 21-bit characters to '\U00xxxxxx' */
12833 else {
12834 PyUnicode_WRITE(okind, odata, o++, 'U');
12835 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
12836 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
12837 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
12838 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
12839 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12840 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12841 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12842 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12843 }
12844 }
12845 /* Copy characters as-is */
12846 else {
12847 PyUnicode_WRITE(okind, odata, o++, ch);
12848 }
Georg Brandl559e5d72008-06-11 18:37:52 +000012849 }
12850 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000012851 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012852 /* Closing quote already added at the beginning */
Victor Stinner05d11892011-10-06 01:13:58 +020012853 assert(_PyUnicode_CheckConsistency(repr, 1));
Walter Dörwald79e913e2007-05-12 11:08:06 +000012854 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012855}
12856
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012857PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012858 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012859\n\
12860Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080012861such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012862arguments start and end are interpreted as in slice notation.\n\
12863\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012864Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012865
12866static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012867unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012868{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010012869 /* initialize variables to prevent gcc warning */
12870 PyObject *substring = NULL;
12871 Py_ssize_t start = 0;
12872 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012873 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012874
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030012875 if (!parse_args_finds_unicode("rfind", args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012876 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012877
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012878 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012879 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012880
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012881 result = any_find_slice(self, substring, start, end, -1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012882
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012883 if (result == -2)
12884 return NULL;
12885
Christian Heimes217cfd12007-12-02 14:31:20 +000012886 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012887}
12888
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012889PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012890 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012891\n\
Lisa Roach43ba8862017-04-04 22:36:22 -070012892Return the highest index in S where substring sub is found,\n\
12893such that sub is contained within S[start:end]. Optional\n\
12894arguments start and end are interpreted as in slice notation.\n\
12895\n\
12896Raises ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012897
12898static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012899unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012900{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010012901 /* initialize variables to prevent gcc warning */
12902 PyObject *substring = NULL;
12903 Py_ssize_t start = 0;
12904 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012905 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012906
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030012907 if (!parse_args_finds_unicode("rindex", args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012908 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012909
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012910 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012911 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012912
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012913 result = any_find_slice(self, substring, start, end, -1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012914
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012915 if (result == -2)
12916 return NULL;
12917
Guido van Rossumd57fd912000-03-10 22:53:23 +000012918 if (result < 0) {
12919 PyErr_SetString(PyExc_ValueError, "substring not found");
12920 return NULL;
12921 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012922
Christian Heimes217cfd12007-12-02 14:31:20 +000012923 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012924}
12925
INADA Naoki3ae20562017-01-16 20:41:20 +090012926/*[clinic input]
12927str.rjust as unicode_rjust
12928
12929 width: Py_ssize_t
12930 fillchar: Py_UCS4 = ' '
12931 /
12932
12933Return a right-justified string of length width.
12934
12935Padding is done using the specified fill character (default is a space).
12936[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012937
12938static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012939unicode_rjust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
12940/*[clinic end generated code: output=804a1a57fbe8d5cf input=d05f550b5beb1f72]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012941{
Benjamin Petersonbac79492012-01-14 13:34:47 -050012942 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012943 return NULL;
12944
Victor Stinnerc4b49542011-12-11 22:44:26 +010012945 if (PyUnicode_GET_LENGTH(self) >= width)
12946 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012947
Victor Stinnerc4b49542011-12-11 22:44:26 +010012948 return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012949}
12950
Alexander Belopolsky40018472011-02-26 01:02:56 +000012951PyObject *
12952PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012953{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012954 if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012955 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012956
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012957 return split(s, sep, maxsplit);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012958}
12959
INADA Naoki3ae20562017-01-16 20:41:20 +090012960/*[clinic input]
12961str.split as unicode_split
Guido van Rossumd57fd912000-03-10 22:53:23 +000012962
INADA Naoki3ae20562017-01-16 20:41:20 +090012963 sep: object = None
12964 The delimiter according which to split the string.
12965 None (the default value) means split according to any whitespace,
12966 and discard empty strings from the result.
12967 maxsplit: Py_ssize_t = -1
12968 Maximum number of splits to do.
12969 -1 (the default value) means no limit.
12970
12971Return a list of the words in the string, using sep as the delimiter string.
12972[clinic start generated code]*/
12973
12974static PyObject *
12975unicode_split_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
12976/*[clinic end generated code: output=3a65b1db356948dc input=606e750488a82359]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012977{
INADA Naoki3ae20562017-01-16 20:41:20 +090012978 if (sep == Py_None)
12979 return split(self, NULL, maxsplit);
12980 if (PyUnicode_Check(sep))
12981 return split(self, sep, maxsplit);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012982
Victor Stinner998b8062018-09-12 00:23:25 +020012983 PyErr_Format(PyExc_TypeError,
12984 "must be str or None, not %.100s",
12985 Py_TYPE(sep)->tp_name);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012986 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012987}
12988
Thomas Wouters477c8d52006-05-27 19:21:47 +000012989PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012990PyUnicode_Partition(PyObject *str_obj, PyObject *sep_obj)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012991{
Thomas Wouters477c8d52006-05-27 19:21:47 +000012992 PyObject* out;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012993 int kind1, kind2;
12994 void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012995 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012996
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012997 if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012998 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012999
Victor Stinner14f8f022011-10-05 20:58:25 +020013000 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013001 kind2 = PyUnicode_KIND(sep_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013002 len1 = PyUnicode_GET_LENGTH(str_obj);
13003 len2 = PyUnicode_GET_LENGTH(sep_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013004 if (kind1 < kind2 || len1 < len2) {
13005 _Py_INCREF_UNICODE_EMPTY();
13006 if (!unicode_empty)
13007 out = NULL;
13008 else {
13009 out = PyTuple_Pack(3, str_obj, unicode_empty, unicode_empty);
13010 Py_DECREF(unicode_empty);
13011 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013012 return out;
13013 }
13014 buf1 = PyUnicode_DATA(str_obj);
13015 buf2 = PyUnicode_DATA(sep_obj);
13016 if (kind2 != kind1) {
13017 buf2 = _PyUnicode_AsKind(sep_obj, kind1);
13018 if (!buf2)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013019 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013020 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013021
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013022 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013023 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020013024 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
13025 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
13026 else
13027 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013028 break;
13029 case PyUnicode_2BYTE_KIND:
13030 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
13031 break;
13032 case PyUnicode_4BYTE_KIND:
13033 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
13034 break;
13035 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070013036 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013037 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000013038
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013039 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013040 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013041
13042 return out;
13043}
13044
13045
13046PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013047PyUnicode_RPartition(PyObject *str_obj, PyObject *sep_obj)
Thomas Wouters477c8d52006-05-27 19:21:47 +000013048{
Thomas Wouters477c8d52006-05-27 19:21:47 +000013049 PyObject* out;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013050 int kind1, kind2;
13051 void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013052 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000013053
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013054 if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013055 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000013056
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013057 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013058 kind2 = PyUnicode_KIND(sep_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013059 len1 = PyUnicode_GET_LENGTH(str_obj);
13060 len2 = PyUnicode_GET_LENGTH(sep_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013061 if (kind1 < kind2 || len1 < len2) {
13062 _Py_INCREF_UNICODE_EMPTY();
13063 if (!unicode_empty)
13064 out = NULL;
13065 else {
13066 out = PyTuple_Pack(3, unicode_empty, unicode_empty, str_obj);
13067 Py_DECREF(unicode_empty);
13068 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013069 return out;
13070 }
13071 buf1 = PyUnicode_DATA(str_obj);
13072 buf2 = PyUnicode_DATA(sep_obj);
13073 if (kind2 != kind1) {
13074 buf2 = _PyUnicode_AsKind(sep_obj, kind1);
13075 if (!buf2)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013076 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013077 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013078
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013079 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013080 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020013081 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
13082 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13083 else
13084 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013085 break;
13086 case PyUnicode_2BYTE_KIND:
13087 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13088 break;
13089 case PyUnicode_4BYTE_KIND:
13090 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13091 break;
13092 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070013093 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013094 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000013095
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013096 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013097 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013098
13099 return out;
13100}
13101
INADA Naoki3ae20562017-01-16 20:41:20 +090013102/*[clinic input]
13103str.partition as unicode_partition
Thomas Wouters477c8d52006-05-27 19:21:47 +000013104
INADA Naoki3ae20562017-01-16 20:41:20 +090013105 sep: object
13106 /
13107
13108Partition the string into three parts using the given separator.
13109
13110This will search for the separator in the string. If the separator is found,
13111returns a 3-tuple containing the part before the separator, the separator
13112itself, and the part after it.
13113
13114If the separator is not found, returns a 3-tuple containing the original string
13115and two empty strings.
13116[clinic start generated code]*/
13117
13118static PyObject *
13119unicode_partition(PyObject *self, PyObject *sep)
13120/*[clinic end generated code: output=e4ced7bd253ca3c4 input=f29b8d06c63e50be]*/
Thomas Wouters477c8d52006-05-27 19:21:47 +000013121{
INADA Naoki3ae20562017-01-16 20:41:20 +090013122 return PyUnicode_Partition(self, sep);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013123}
13124
INADA Naoki3ae20562017-01-16 20:41:20 +090013125/*[clinic input]
13126str.rpartition as unicode_rpartition = str.partition
Thomas Wouters477c8d52006-05-27 19:21:47 +000013127
INADA Naoki3ae20562017-01-16 20:41:20 +090013128Partition the string into three parts using the given separator.
13129
Serhiy Storchakaa2314282017-10-29 02:11:54 +030013130This will search for the separator in the string, starting at the end. If
INADA Naoki3ae20562017-01-16 20:41:20 +090013131the separator is found, returns a 3-tuple containing the part before the
13132separator, the separator itself, and the part after it.
13133
13134If the separator is not found, returns a 3-tuple containing two empty strings
13135and the original string.
13136[clinic start generated code]*/
13137
13138static PyObject *
13139unicode_rpartition(PyObject *self, PyObject *sep)
Serhiy Storchakaa2314282017-10-29 02:11:54 +030013140/*[clinic end generated code: output=1aa13cf1156572aa input=c4b7db3ef5cf336a]*/
Thomas Wouters477c8d52006-05-27 19:21:47 +000013141{
INADA Naoki3ae20562017-01-16 20:41:20 +090013142 return PyUnicode_RPartition(self, sep);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013143}
13144
Alexander Belopolsky40018472011-02-26 01:02:56 +000013145PyObject *
13146PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013147{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013148 if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
Benjamin Peterson14339b62009-01-31 16:36:08 +000013149 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013150
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013151 return rsplit(s, sep, maxsplit);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013152}
13153
INADA Naoki3ae20562017-01-16 20:41:20 +090013154/*[clinic input]
13155str.rsplit as unicode_rsplit = str.split
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013156
INADA Naoki3ae20562017-01-16 20:41:20 +090013157Return a list of the words in the string, using sep as the delimiter string.
13158
13159Splits are done starting at the end of the string and working to the front.
13160[clinic start generated code]*/
13161
13162static PyObject *
13163unicode_rsplit_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
13164/*[clinic end generated code: output=c2b815c63bcabffc input=12ad4bf57dd35f15]*/
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013165{
INADA Naoki3ae20562017-01-16 20:41:20 +090013166 if (sep == Py_None)
13167 return rsplit(self, NULL, maxsplit);
13168 if (PyUnicode_Check(sep))
13169 return rsplit(self, sep, maxsplit);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013170
Victor Stinner998b8062018-09-12 00:23:25 +020013171 PyErr_Format(PyExc_TypeError,
13172 "must be str or None, not %.100s",
13173 Py_TYPE(sep)->tp_name);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013174 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013175}
13176
INADA Naoki3ae20562017-01-16 20:41:20 +090013177/*[clinic input]
13178str.splitlines as unicode_splitlines
Guido van Rossumd57fd912000-03-10 22:53:23 +000013179
Serhiy Storchaka202fda52017-03-12 10:10:47 +020013180 keepends: bool(accept={int}) = False
INADA Naoki3ae20562017-01-16 20:41:20 +090013181
13182Return a list of the lines in the string, breaking at line boundaries.
13183
13184Line breaks are not included in the resulting list unless keepends is given and
13185true.
13186[clinic start generated code]*/
13187
13188static PyObject *
13189unicode_splitlines_impl(PyObject *self, int keepends)
Serhiy Storchaka202fda52017-03-12 10:10:47 +020013190/*[clinic end generated code: output=f664dcdad153ec40 input=b508e180459bdd8b]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013191{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013192 return PyUnicode_Splitlines(self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013193}
13194
13195static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000013196PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013197{
Victor Stinnerc4b49542011-12-11 22:44:26 +010013198 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013199}
13200
INADA Naoki3ae20562017-01-16 20:41:20 +090013201/*[clinic input]
13202str.swapcase as unicode_swapcase
Guido van Rossumd57fd912000-03-10 22:53:23 +000013203
INADA Naoki3ae20562017-01-16 20:41:20 +090013204Convert uppercase characters to lowercase and lowercase characters to uppercase.
13205[clinic start generated code]*/
13206
13207static PyObject *
13208unicode_swapcase_impl(PyObject *self)
13209/*[clinic end generated code: output=5d28966bf6d7b2af input=3f3ef96d5798a7bb]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013210{
Benjamin Petersoneea48462012-01-16 14:28:50 -050013211 if (PyUnicode_READY(self) == -1)
13212 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010013213 return case_operation(self, do_swapcase);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013214}
13215
Larry Hastings61272b72014-01-07 12:41:53 -080013216/*[clinic input]
Georg Brandlceee0772007-11-27 23:48:05 +000013217
Larry Hastings31826802013-10-19 00:09:25 -070013218@staticmethod
13219str.maketrans as unicode_maketrans
13220
13221 x: object
13222
13223 y: unicode=NULL
13224
13225 z: unicode=NULL
13226
13227 /
13228
13229Return a translation table usable for str.translate().
13230
13231If there is only one argument, it must be a dictionary mapping Unicode
13232ordinals (integers) or characters to Unicode ordinals, strings or None.
13233Character keys will be then converted to ordinals.
13234If there are two arguments, they must be strings of equal length, and
13235in the resulting dictionary, each character in x will be mapped to the
13236character at the same position in y. If there is a third argument, it
13237must be a string, whose characters will be mapped to None in the result.
Larry Hastings61272b72014-01-07 12:41:53 -080013238[clinic start generated code]*/
Larry Hastings31826802013-10-19 00:09:25 -070013239
Larry Hastings31826802013-10-19 00:09:25 -070013240static PyObject *
Larry Hastings5c661892014-01-24 06:17:25 -080013241unicode_maketrans_impl(PyObject *x, PyObject *y, PyObject *z)
Serhiy Storchaka1009bf12015-04-03 23:53:51 +030013242/*[clinic end generated code: output=a925c89452bd5881 input=7bfbf529a293c6c5]*/
Larry Hastings31826802013-10-19 00:09:25 -070013243{
Georg Brandlceee0772007-11-27 23:48:05 +000013244 PyObject *new = NULL, *key, *value;
13245 Py_ssize_t i = 0;
13246 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013247
Georg Brandlceee0772007-11-27 23:48:05 +000013248 new = PyDict_New();
13249 if (!new)
13250 return NULL;
13251 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013252 int x_kind, y_kind, z_kind;
13253 void *x_data, *y_data, *z_data;
13254
Georg Brandlceee0772007-11-27 23:48:05 +000013255 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000013256 if (!PyUnicode_Check(x)) {
13257 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
13258 "be a string if there is a second argument");
13259 goto err;
13260 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013261 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013262 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
13263 "arguments must have equal length");
13264 goto err;
13265 }
13266 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013267 x_kind = PyUnicode_KIND(x);
13268 y_kind = PyUnicode_KIND(y);
13269 x_data = PyUnicode_DATA(x);
13270 y_data = PyUnicode_DATA(y);
13271 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
13272 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060013273 if (!key)
Georg Brandlceee0772007-11-27 23:48:05 +000013274 goto err;
Benjamin Peterson822c7902011-12-20 13:32:50 -060013275 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060013276 if (!value) {
13277 Py_DECREF(key);
13278 goto err;
13279 }
Georg Brandlceee0772007-11-27 23:48:05 +000013280 res = PyDict_SetItem(new, key, value);
13281 Py_DECREF(key);
13282 Py_DECREF(value);
13283 if (res < 0)
13284 goto err;
13285 }
13286 /* create entries for deleting chars in z */
13287 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013288 z_kind = PyUnicode_KIND(z);
13289 z_data = PyUnicode_DATA(z);
Victor Stinnerc4f281e2011-10-11 22:11:42 +020013290 for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013291 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000013292 if (!key)
13293 goto err;
13294 res = PyDict_SetItem(new, key, Py_None);
13295 Py_DECREF(key);
13296 if (res < 0)
13297 goto err;
13298 }
13299 }
13300 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013301 int kind;
13302 void *data;
13303
Georg Brandlceee0772007-11-27 23:48:05 +000013304 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000013305 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013306 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
13307 "to maketrans it must be a dict");
13308 goto err;
13309 }
13310 /* copy entries into the new dict, converting string keys to int keys */
13311 while (PyDict_Next(x, &i, &key, &value)) {
13312 if (PyUnicode_Check(key)) {
13313 /* convert string keys to integer keys */
13314 PyObject *newkey;
Victor Stinnerc4f281e2011-10-11 22:11:42 +020013315 if (PyUnicode_GET_LENGTH(key) != 1) {
Georg Brandlceee0772007-11-27 23:48:05 +000013316 PyErr_SetString(PyExc_ValueError, "string keys in translate "
13317 "table must be of length 1");
13318 goto err;
13319 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013320 kind = PyUnicode_KIND(key);
13321 data = PyUnicode_DATA(key);
13322 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000013323 if (!newkey)
13324 goto err;
13325 res = PyDict_SetItem(new, newkey, value);
13326 Py_DECREF(newkey);
13327 if (res < 0)
13328 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000013329 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013330 /* just keep integer keys */
13331 if (PyDict_SetItem(new, key, value) < 0)
13332 goto err;
13333 } else {
13334 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
13335 "be strings or integers");
13336 goto err;
13337 }
13338 }
13339 }
13340 return new;
13341 err:
13342 Py_DECREF(new);
13343 return NULL;
13344}
13345
INADA Naoki3ae20562017-01-16 20:41:20 +090013346/*[clinic input]
13347str.translate as unicode_translate
Guido van Rossumd57fd912000-03-10 22:53:23 +000013348
INADA Naoki3ae20562017-01-16 20:41:20 +090013349 table: object
13350 Translation table, which must be a mapping of Unicode ordinals to
13351 Unicode ordinals, strings, or None.
13352 /
13353
13354Replace each character in the string using the given translation table.
13355
13356The table must implement lookup/indexing via __getitem__, for instance a
13357dictionary or list. If this operation raises LookupError, the character is
13358left untouched. Characters mapped to None are deleted.
13359[clinic start generated code]*/
13360
13361static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013362unicode_translate(PyObject *self, PyObject *table)
INADA Naoki3ae20562017-01-16 20:41:20 +090013363/*[clinic end generated code: output=3cb448ff2fd96bf3 input=6d38343db63d8eb0]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013364{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013365 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013366}
13367
INADA Naoki3ae20562017-01-16 20:41:20 +090013368/*[clinic input]
13369str.upper as unicode_upper
Guido van Rossumd57fd912000-03-10 22:53:23 +000013370
INADA Naoki3ae20562017-01-16 20:41:20 +090013371Return a copy of the string converted to uppercase.
13372[clinic start generated code]*/
13373
13374static PyObject *
13375unicode_upper_impl(PyObject *self)
13376/*[clinic end generated code: output=1b7ddd16bbcdc092 input=db3d55682dfe2e6c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013377{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050013378 if (PyUnicode_READY(self) == -1)
13379 return NULL;
13380 if (PyUnicode_IS_ASCII(self))
13381 return ascii_upper_or_lower(self, 0);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010013382 return case_operation(self, do_upper);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013383}
13384
INADA Naoki3ae20562017-01-16 20:41:20 +090013385/*[clinic input]
13386str.zfill as unicode_zfill
13387
13388 width: Py_ssize_t
13389 /
13390
13391Pad a numeric string with zeros on the left, to fill a field of the given width.
13392
13393The string is never truncated.
13394[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013395
13396static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090013397unicode_zfill_impl(PyObject *self, Py_ssize_t width)
INADA Naoki15f94592017-01-16 21:49:13 +090013398/*[clinic end generated code: output=e13fb6bdf8e3b9df input=c6b2f772c6f27799]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013399{
Martin v. Löwis18e16552006-02-15 17:27:45 +000013400 Py_ssize_t fill;
Victor Stinner9310abb2011-10-05 00:59:23 +020013401 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013402 int kind;
13403 void *data;
13404 Py_UCS4 chr;
13405
Benjamin Petersonbac79492012-01-14 13:34:47 -050013406 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010013407 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013408
Victor Stinnerc4b49542011-12-11 22:44:26 +010013409 if (PyUnicode_GET_LENGTH(self) >= width)
13410 return unicode_result_unchanged(self);
13411
13412 fill = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013413
13414 u = pad(self, fill, 0, '0');
13415
Walter Dörwald068325e2002-04-15 13:36:47 +000013416 if (u == NULL)
13417 return NULL;
13418
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013419 kind = PyUnicode_KIND(u);
13420 data = PyUnicode_DATA(u);
13421 chr = PyUnicode_READ(kind, data, fill);
13422
13423 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000013424 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013425 PyUnicode_WRITE(kind, data, 0, chr);
13426 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000013427 }
13428
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013429 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +010013430 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013431}
Guido van Rossumd57fd912000-03-10 22:53:23 +000013432
#if 0
/* Compiled out.  Thin wrapper that would forward self to
   PyUnicode_TransformDecimalAndSpaceToASCII() and return its result. */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *converted = PyUnicode_TransformDecimalAndSpaceToASCII(self);
    return converted;
}
#endif
13440
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013441PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013442 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013443\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013444Return True if S starts with the specified prefix, False otherwise.\n\
13445With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013446With optional end, stop comparing S at that position.\n\
13447prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013448
13449static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013450unicode_startswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013451 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013452{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013453 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013454 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013455 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013456 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013457 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013458
Jesus Ceaac451502011-04-20 17:09:23 +020013459 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013460 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013461 if (PyTuple_Check(subobj)) {
13462 Py_ssize_t i;
13463 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013464 substring = PyTuple_GET_ITEM(subobj, i);
13465 if (!PyUnicode_Check(substring)) {
13466 PyErr_Format(PyExc_TypeError,
13467 "tuple for startswith must only contain str, "
Victor Stinner998b8062018-09-12 00:23:25 +020013468 "not %.100s",
13469 Py_TYPE(substring)->tp_name);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013470 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013471 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013472 result = tailmatch(self, substring, start, end, -1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013473 if (result == -1)
13474 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013475 if (result) {
13476 Py_RETURN_TRUE;
13477 }
13478 }
13479 /* nothing matched */
13480 Py_RETURN_FALSE;
13481 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013482 if (!PyUnicode_Check(subobj)) {
13483 PyErr_Format(PyExc_TypeError,
13484 "startswith first arg must be str or "
Victor Stinner998b8062018-09-12 00:23:25 +020013485 "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013486 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013487 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013488 result = tailmatch(self, subobj, start, end, -1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013489 if (result == -1)
13490 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013491 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013492}
13493
13494
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013495PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013496 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013497\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013498Return True if S ends with the specified suffix, False otherwise.\n\
13499With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013500With optional end, stop comparing S at that position.\n\
13501suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013502
13503static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013504unicode_endswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013505 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013506{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013507 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013508 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013509 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013510 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013511 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013512
Jesus Ceaac451502011-04-20 17:09:23 +020013513 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013514 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013515 if (PyTuple_Check(subobj)) {
13516 Py_ssize_t i;
13517 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013518 substring = PyTuple_GET_ITEM(subobj, i);
13519 if (!PyUnicode_Check(substring)) {
13520 PyErr_Format(PyExc_TypeError,
13521 "tuple for endswith must only contain str, "
Victor Stinner998b8062018-09-12 00:23:25 +020013522 "not %.100s",
13523 Py_TYPE(substring)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013524 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013525 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013526 result = tailmatch(self, substring, start, end, +1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013527 if (result == -1)
13528 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013529 if (result) {
13530 Py_RETURN_TRUE;
13531 }
13532 }
13533 Py_RETURN_FALSE;
13534 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013535 if (!PyUnicode_Check(subobj)) {
13536 PyErr_Format(PyExc_TypeError,
13537 "endswith first arg must be str or "
Victor Stinner998b8062018-09-12 00:23:25 +020013538 "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013539 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013540 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013541 result = tailmatch(self, subobj, start, end, +1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013542 if (result == -1)
13543 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013544 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013545}
13546
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070013547static inline void
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013548_PyUnicodeWriter_Update(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013549{
Victor Stinnereb36fda2015-10-03 01:55:51 +020013550 writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer);
13551 writer->data = PyUnicode_DATA(writer->buffer);
13552
13553 if (!writer->readonly) {
13554 writer->kind = PyUnicode_KIND(writer->buffer);
Victor Stinner8f674cc2013-04-17 23:02:17 +020013555 writer->size = PyUnicode_GET_LENGTH(writer->buffer);
Victor Stinnereb36fda2015-10-03 01:55:51 +020013556 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013557 else {
Victor Stinnereb36fda2015-10-03 01:55:51 +020013558 /* use a value smaller than PyUnicode_1BYTE_KIND() so
13559 _PyUnicodeWriter_PrepareKind() will copy the buffer. */
13560 writer->kind = PyUnicode_WCHAR_KIND;
13561 assert(writer->kind <= PyUnicode_1BYTE_KIND);
13562
Victor Stinner8f674cc2013-04-17 23:02:17 +020013563 /* Copy-on-write mode: set buffer size to 0 so
13564 * _PyUnicodeWriter_Prepare() will copy (and enlarge) the buffer on
13565 * next write. */
13566 writer->size = 0;
13567 }
Victor Stinner202fdca2012-05-07 12:47:02 +020013568}
13569
Victor Stinnerd3f08822012-05-29 12:57:52 +020013570void
Victor Stinner8f674cc2013-04-17 23:02:17 +020013571_PyUnicodeWriter_Init(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013572{
Victor Stinnerd3f08822012-05-29 12:57:52 +020013573 memset(writer, 0, sizeof(*writer));
Victor Stinnereb36fda2015-10-03 01:55:51 +020013574
13575 /* ASCII is the bare minimum */
Victor Stinner8f674cc2013-04-17 23:02:17 +020013576 writer->min_char = 127;
Victor Stinnereb36fda2015-10-03 01:55:51 +020013577
13578 /* use a value smaller than PyUnicode_1BYTE_KIND() so
13579 _PyUnicodeWriter_PrepareKind() will copy the buffer. */
13580 writer->kind = PyUnicode_WCHAR_KIND;
13581 assert(writer->kind <= PyUnicode_1BYTE_KIND);
Victor Stinner202fdca2012-05-07 12:47:02 +020013582}
13583
Inada Naoki770847a2019-06-24 12:30:24 +090013584// Initialize _PyUnicodeWriter with initial buffer
13585static inline void
13586_PyUnicodeWriter_InitWithBuffer(_PyUnicodeWriter *writer, PyObject *buffer)
13587{
13588 memset(writer, 0, sizeof(*writer));
13589 writer->buffer = buffer;
13590 _PyUnicodeWriter_Update(writer);
13591 writer->min_length = writer->size;
13592}
13593
/* Slow path behind the _PyUnicodeWriter_Prepare macro: make sure the
   writer's buffer can take `length` more characters starting at
   writer->pos, with code points up to `maxchar`.

   Three situations are handled:
   - no buffer yet: allocate one;
   - buffer too short: resize it in place when it is already wide enough
     and not read-only, otherwise allocate a new buffer and copy the
     characters written so far;
   - buffer long enough but too narrow: allocate a wider buffer of the
     same size and copy.

   When a new length is computed and writer->overallocate is set, the
   length is enlarged by 1/OVERALLOCATE_FACTOR (unless that would exceed
   PY_SSIZE_T_MAX); it is never smaller than writer->min_length.

   Return 0 on success.  Return -1 on failure: PyErr_NoMemory() is called
   when pos + length would exceed PY_SSIZE_T_MAX; a NULL result from
   PyUnicode_New() or resize_compact() is reported as -1 as well. */
int
_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
                                 Py_ssize_t length, Py_UCS4 maxchar)
{
    Py_ssize_t newlen;
    PyObject *newbuffer;

    assert(maxchar <= MAX_UNICODE);

    /* ensure that the _PyUnicodeWriter_Prepare macro was used */
    assert((maxchar > writer->maxchar && length >= 0)
           || length > 0);

    /* Guard the addition below against signed overflow. */
    if (length > PY_SSIZE_T_MAX - writer->pos) {
        PyErr_NoMemory();
        return -1;
    }
    newlen = writer->pos + length;

    /* Never allocate narrower than the writer's configured minimum. */
    maxchar = Py_MAX(maxchar, writer->min_char);

    if (writer->buffer == NULL) {
        /* First allocation. */
        assert(!writer->readonly);
        if (writer->overallocate
            && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
            /* overallocate to limit the number of realloc() */
            newlen += newlen / OVERALLOCATE_FACTOR;
        }
        if (newlen < writer->min_length)
            newlen = writer->min_length;

        writer->buffer = PyUnicode_New(newlen, maxchar);
        if (writer->buffer == NULL)
            return -1;
    }
    else if (newlen > writer->size) {
        /* The existing buffer is too short. */
        if (writer->overallocate
            && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
            /* overallocate to limit the number of realloc() */
            newlen += newlen / OVERALLOCATE_FACTOR;
        }
        if (newlen < writer->min_length)
            newlen = writer->min_length;

        if (maxchar > writer->maxchar || writer->readonly) {
            /* resize + widen */
            /* A read-only buffer is never resized in place: it is replaced
               by a fresh string and the old reference is dropped. */
            maxchar = Py_MAX(maxchar, writer->maxchar);
            newbuffer = PyUnicode_New(newlen, maxchar);
            if (newbuffer == NULL)
                return -1;
            _PyUnicode_FastCopyCharacters(newbuffer, 0,
                                          writer->buffer, 0, writer->pos);
            Py_DECREF(writer->buffer);
            writer->readonly = 0;
        }
        else {
            /* Same width, writable buffer: grow it in place. */
            newbuffer = resize_compact(writer->buffer, newlen);
            if (newbuffer == NULL)
                return -1;
        }
        writer->buffer = newbuffer;
    }
    else if (maxchar > writer->maxchar) {
        /* Long enough but too narrow: widen at the current size. */
        assert(!writer->readonly);
        newbuffer = PyUnicode_New(writer->size, maxchar);
        if (newbuffer == NULL)
            return -1;
        _PyUnicode_FastCopyCharacters(newbuffer, 0,
                                      writer->buffer, 0, writer->pos);
        Py_SETREF(writer->buffer, newbuffer);
    }
    /* Re-synchronise the writer's cached fields with the (new) buffer. */
    _PyUnicodeWriter_Update(writer);
    return 0;

#undef OVERALLOCATE_FACTOR
}
13670
Victor Stinnerca9381e2015-09-22 00:58:32 +020013671int
13672_PyUnicodeWriter_PrepareKindInternal(_PyUnicodeWriter *writer,
13673 enum PyUnicode_Kind kind)
13674{
13675 Py_UCS4 maxchar;
13676
13677 /* ensure that the _PyUnicodeWriter_PrepareKind macro was used */
13678 assert(writer->kind < kind);
13679
13680 switch (kind)
13681 {
13682 case PyUnicode_1BYTE_KIND: maxchar = 0xff; break;
13683 case PyUnicode_2BYTE_KIND: maxchar = 0xffff; break;
13684 case PyUnicode_4BYTE_KIND: maxchar = 0x10ffff; break;
13685 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070013686 Py_UNREACHABLE();
Victor Stinnerca9381e2015-09-22 00:58:32 +020013687 }
13688
13689 return _PyUnicodeWriter_PrepareInternal(writer, 0, maxchar);
13690}
13691
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070013692static inline int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020013693_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch)
Victor Stinnera0dd0212013-04-11 22:09:04 +020013694{
Victor Stinner2740e462016-09-06 16:58:36 -070013695 assert(ch <= MAX_UNICODE);
Victor Stinnera0dd0212013-04-11 22:09:04 +020013696 if (_PyUnicodeWriter_Prepare(writer, 1, ch) < 0)
13697 return -1;
13698 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, ch);
13699 writer->pos++;
13700 return 0;
13701}
13702
/* Append the character `ch` to the writer.  Out-of-line entry point that
   simply forwards to _PyUnicodeWriter_WriteCharInline() and returns its
   result (0 on success, -1 on error). */
int
_PyUnicodeWriter_WriteChar(_PyUnicodeWriter *writer, Py_UCS4 ch)
{
    return _PyUnicodeWriter_WriteCharInline(writer, ch);
}
13708
13709int
Victor Stinnerd3f08822012-05-29 12:57:52 +020013710_PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str)
13711{
13712 Py_UCS4 maxchar;
13713 Py_ssize_t len;
13714
13715 if (PyUnicode_READY(str) == -1)
13716 return -1;
13717 len = PyUnicode_GET_LENGTH(str);
13718 if (len == 0)
13719 return 0;
13720 maxchar = PyUnicode_MAX_CHAR_VALUE(str);
13721 if (maxchar > writer->maxchar || len > writer->size - writer->pos) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013722 if (writer->buffer == NULL && !writer->overallocate) {
Victor Stinner1912b392015-03-26 09:37:23 +010013723 assert(_PyUnicode_CheckConsistency(str, 1));
Victor Stinner8f674cc2013-04-17 23:02:17 +020013724 writer->readonly = 1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013725 Py_INCREF(str);
13726 writer->buffer = str;
13727 _PyUnicodeWriter_Update(writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013728 writer->pos += len;
13729 return 0;
13730 }
13731 if (_PyUnicodeWriter_PrepareInternal(writer, len, maxchar) == -1)
13732 return -1;
13733 }
13734 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13735 str, 0, len);
13736 writer->pos += len;
13737 return 0;
13738}
13739
Victor Stinnere215d962012-10-06 23:03:36 +020013740int
Victor Stinnercfc4c132013-04-03 01:48:39 +020013741_PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter *writer, PyObject *str,
13742 Py_ssize_t start, Py_ssize_t end)
13743{
13744 Py_UCS4 maxchar;
13745 Py_ssize_t len;
13746
13747 if (PyUnicode_READY(str) == -1)
13748 return -1;
13749
13750 assert(0 <= start);
13751 assert(end <= PyUnicode_GET_LENGTH(str));
13752 assert(start <= end);
13753
13754 if (end == 0)
13755 return 0;
13756
13757 if (start == 0 && end == PyUnicode_GET_LENGTH(str))
13758 return _PyUnicodeWriter_WriteStr(writer, str);
13759
13760 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
13761 maxchar = _PyUnicode_FindMaxChar(str, start, end);
13762 else
13763 maxchar = writer->maxchar;
13764 len = end - start;
13765
13766 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) < 0)
13767 return -1;
13768
13769 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13770 str, start, len);
13771 writer->pos += len;
13772 return 0;
13773}
13774
13775int
Victor Stinner4a587072013-11-19 12:54:53 +010013776_PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter *writer,
13777 const char *ascii, Py_ssize_t len)
13778{
13779 if (len == -1)
13780 len = strlen(ascii);
13781
13782 assert(ucs1lib_find_max_char((Py_UCS1*)ascii, (Py_UCS1*)ascii + len) < 128);
13783
13784 if (writer->buffer == NULL && !writer->overallocate) {
13785 PyObject *str;
13786
13787 str = _PyUnicode_FromASCII(ascii, len);
13788 if (str == NULL)
13789 return -1;
13790
13791 writer->readonly = 1;
13792 writer->buffer = str;
13793 _PyUnicodeWriter_Update(writer);
13794 writer->pos += len;
13795 return 0;
13796 }
13797
13798 if (_PyUnicodeWriter_Prepare(writer, len, 127) == -1)
13799 return -1;
13800
13801 switch (writer->kind)
13802 {
13803 case PyUnicode_1BYTE_KIND:
13804 {
13805 const Py_UCS1 *str = (const Py_UCS1 *)ascii;
13806 Py_UCS1 *data = writer->data;
13807
Christian Heimesf051e432016-09-13 20:22:02 +020013808 memcpy(data + writer->pos, str, len);
Victor Stinner4a587072013-11-19 12:54:53 +010013809 break;
13810 }
13811 case PyUnicode_2BYTE_KIND:
13812 {
13813 _PyUnicode_CONVERT_BYTES(
13814 Py_UCS1, Py_UCS2,
13815 ascii, ascii + len,
13816 (Py_UCS2 *)writer->data + writer->pos);
13817 break;
13818 }
13819 case PyUnicode_4BYTE_KIND:
13820 {
13821 _PyUnicode_CONVERT_BYTES(
13822 Py_UCS1, Py_UCS4,
13823 ascii, ascii + len,
13824 (Py_UCS4 *)writer->data + writer->pos);
13825 break;
13826 }
13827 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070013828 Py_UNREACHABLE();
Victor Stinner4a587072013-11-19 12:54:53 +010013829 }
13830
13831 writer->pos += len;
13832 return 0;
13833}
13834
13835int
13836_PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter *writer,
13837 const char *str, Py_ssize_t len)
Victor Stinnere215d962012-10-06 23:03:36 +020013838{
13839 Py_UCS4 maxchar;
13840
13841 maxchar = ucs1lib_find_max_char((Py_UCS1*)str, (Py_UCS1*)str + len);
13842 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) == -1)
13843 return -1;
13844 unicode_write_cstr(writer->buffer, writer->pos, str, len);
13845 writer->pos += len;
13846 return 0;
13847}
13848
Victor Stinnerd3f08822012-05-29 12:57:52 +020013849PyObject *
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013850_PyUnicodeWriter_Finish(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013851{
Victor Stinner15a0bd32013-07-08 22:29:55 +020013852 PyObject *str;
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030013853
Victor Stinnerd3f08822012-05-29 12:57:52 +020013854 if (writer->pos == 0) {
Victor Stinner9e6b4d72013-07-09 00:37:24 +020013855 Py_CLEAR(writer->buffer);
Serhiy Storchaka678db842013-01-26 12:16:36 +020013856 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3f08822012-05-29 12:57:52 +020013857 }
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030013858
13859 str = writer->buffer;
13860 writer->buffer = NULL;
13861
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013862 if (writer->readonly) {
Victor Stinner9e6b4d72013-07-09 00:37:24 +020013863 assert(PyUnicode_GET_LENGTH(str) == writer->pos);
13864 return str;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013865 }
Victor Stinner6c2cdae2015-10-12 13:29:43 +020013866
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030013867 if (PyUnicode_GET_LENGTH(str) != writer->pos) {
13868 PyObject *str2;
13869 str2 = resize_compact(str, writer->pos);
13870 if (str2 == NULL) {
13871 Py_DECREF(str);
13872 return NULL;
Victor Stinner6c2cdae2015-10-12 13:29:43 +020013873 }
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030013874 str = str2;
Victor Stinner6c2cdae2015-10-12 13:29:43 +020013875 }
13876
Victor Stinner15a0bd32013-07-08 22:29:55 +020013877 assert(_PyUnicode_CheckConsistency(str, 1));
13878 return unicode_result_ready(str);
Victor Stinner202fdca2012-05-07 12:47:02 +020013879}
13880
/* Release the writer's buffer, if any, and reset writer->buffer to NULL.
   Used on error paths instead of _PyUnicodeWriter_Finish(). */
void
_PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer)
{
    Py_CLEAR(writer->buffer);
}
13886
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013887#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000013888
/* Docstrings for str.format() and str.format_map(); both are referenced
   from the unicode_methods table. */
PyDoc_STRVAR(format__doc__,
             "S.format(*args, **kwargs) -> str\n\
\n\
Return a formatted version of S, using substitutions from args and kwargs.\n\
The substitutions are identified by braces ('{' and '}').");

PyDoc_STRVAR(format_map__doc__,
             "S.format_map(mapping) -> str\n\
\n\
Return a formatted version of S, using substitutions from mapping.\n\
The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000013900
INADA Naoki3ae20562017-01-16 20:41:20 +090013901/*[clinic input]
13902str.__format__ as unicode___format__
13903
13904 format_spec: unicode
13905 /
13906
13907Return a formatted version of the string as described by format_spec.
13908[clinic start generated code]*/
13909
Eric Smith4a7d76d2008-05-30 18:10:19 +000013910static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090013911unicode___format___impl(PyObject *self, PyObject *format_spec)
INADA Naoki15f94592017-01-16 21:49:13 +090013912/*[clinic end generated code: output=45fceaca6d2ba4c8 input=5e135645d167a214]*/
Eric Smith4a7d76d2008-05-30 18:10:19 +000013913{
Victor Stinnerd3f08822012-05-29 12:57:52 +020013914 _PyUnicodeWriter writer;
13915 int ret;
Eric Smith4a7d76d2008-05-30 18:10:19 +000013916
Victor Stinnerd3f08822012-05-29 12:57:52 +020013917 if (PyUnicode_READY(self) == -1)
13918 return NULL;
Victor Stinner8f674cc2013-04-17 23:02:17 +020013919 _PyUnicodeWriter_Init(&writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013920 ret = _PyUnicode_FormatAdvancedWriter(&writer,
13921 self, format_spec, 0,
13922 PyUnicode_GET_LENGTH(format_spec));
13923 if (ret == -1) {
13924 _PyUnicodeWriter_Dealloc(&writer);
13925 return NULL;
13926 }
13927 return _PyUnicodeWriter_Finish(&writer);
Eric Smith4a7d76d2008-05-30 18:10:19 +000013928}
13929
INADA Naoki3ae20562017-01-16 20:41:20 +090013930/*[clinic input]
13931str.__sizeof__ as unicode_sizeof
13932
13933Return the size of the string in memory, in bytes.
13934[clinic start generated code]*/
Eric Smith8c663262007-08-25 02:26:07 +000013935
13936static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090013937unicode_sizeof_impl(PyObject *self)
13938/*[clinic end generated code: output=6dbc2f5a408b6d4f input=6dd011c108e33fb0]*/
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013939{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013940 Py_ssize_t size;
13941
13942 /* If it's a compact object, account for base structure +
13943 character data. */
INADA Naoki3ae20562017-01-16 20:41:20 +090013944 if (PyUnicode_IS_COMPACT_ASCII(self))
13945 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(self) + 1;
13946 else if (PyUnicode_IS_COMPACT(self))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013947 size = sizeof(PyCompactUnicodeObject) +
INADA Naoki3ae20562017-01-16 20:41:20 +090013948 (PyUnicode_GET_LENGTH(self) + 1) * PyUnicode_KIND(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013949 else {
13950 /* If it is a two-block object, account for base object, and
13951 for character block if present. */
13952 size = sizeof(PyUnicodeObject);
INADA Naoki3ae20562017-01-16 20:41:20 +090013953 if (_PyUnicode_DATA_ANY(self))
13954 size += (PyUnicode_GET_LENGTH(self) + 1) *
13955 PyUnicode_KIND(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013956 }
13957 /* If the wstr pointer is present, account for it unless it is shared
Victor Stinnera3be6132011-10-03 02:16:37 +020013958 with the data pointer. Check if the data is not shared. */
INADA Naoki3ae20562017-01-16 20:41:20 +090013959 if (_PyUnicode_HAS_WSTR_MEMORY(self))
13960 size += (PyUnicode_WSTR_LENGTH(self) + 1) * sizeof(wchar_t);
13961 if (_PyUnicode_HAS_UTF8_MEMORY(self))
13962 size += PyUnicode_UTF8_LENGTH(self) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013963
13964 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013965}
13966
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013967static PyObject *
Siddhesh Poyarekar55edd0c2018-04-30 00:29:33 +053013968unicode_getnewargs(PyObject *v, PyObject *Py_UNUSED(ignored))
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013969{
Victor Stinnerbf6e5602011-12-12 01:53:47 +010013970 PyObject *copy = _PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013971 if (!copy)
13972 return NULL;
13973 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013974}
13975
/* Method table of the str type.  The UNICODE_*_METHODDEF entries are
   macros defined elsewhere (each presumably expands to one complete
   PyMethodDef entry including its trailing comma); the remaining methods
   are listed by hand.  The table ends with a {NULL, NULL} sentinel. */
static PyMethodDef unicode_methods[] = {
    UNICODE_ENCODE_METHODDEF
    UNICODE_REPLACE_METHODDEF
    UNICODE_SPLIT_METHODDEF
    UNICODE_RSPLIT_METHODDEF
    UNICODE_JOIN_METHODDEF
    UNICODE_CAPITALIZE_METHODDEF
    UNICODE_CASEFOLD_METHODDEF
    UNICODE_TITLE_METHODDEF
    UNICODE_CENTER_METHODDEF
    {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
    UNICODE_EXPANDTABS_METHODDEF
    {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
    UNICODE_PARTITION_METHODDEF
    {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
    UNICODE_LJUST_METHODDEF
    UNICODE_LOWER_METHODDEF
    UNICODE_LSTRIP_METHODDEF
    {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
    {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
    UNICODE_RJUST_METHODDEF
    UNICODE_RSTRIP_METHODDEF
    UNICODE_RPARTITION_METHODDEF
    UNICODE_SPLITLINES_METHODDEF
    UNICODE_STRIP_METHODDEF
    UNICODE_SWAPCASE_METHODDEF
    UNICODE_TRANSLATE_METHODDEF
    UNICODE_UPPER_METHODDEF
    {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
    {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
    UNICODE_ISASCII_METHODDEF
    UNICODE_ISLOWER_METHODDEF
    UNICODE_ISUPPER_METHODDEF
    UNICODE_ISTITLE_METHODDEF
    UNICODE_ISSPACE_METHODDEF
    UNICODE_ISDECIMAL_METHODDEF
    UNICODE_ISDIGIT_METHODDEF
    UNICODE_ISNUMERIC_METHODDEF
    UNICODE_ISALPHA_METHODDEF
    UNICODE_ISALNUM_METHODDEF
    UNICODE_ISIDENTIFIER_METHODDEF
    UNICODE_ISPRINTABLE_METHODDEF
    UNICODE_ZFILL_METHODDEF
    /* do_string_format takes keyword arguments, hence the cast through
       void(*)(void) before the cast to PyCFunction. */
    {"format", (PyCFunction)(void(*)(void)) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
    {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
    UNICODE___FORMAT___METHODDEF
    UNICODE_MAKETRANS_METHODDEF
    UNICODE_SIZEOF_METHODDEF
#if 0
    /* These methods are just used for debugging the implementation. */
    {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
#endif

    /* No docstring is given for __getnewargs__. */
    {"__getnewargs__",  unicode_getnewargs, METH_NOARGS},
    {NULL, NULL}
};
14032
Neil Schemenauerce30bc92002-11-18 16:10:18 +000014033static PyObject *
14034unicode_mod(PyObject *v, PyObject *w)
14035{
Brian Curtindfc80e32011-08-10 20:28:54 -050014036 if (!PyUnicode_Check(v))
14037 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000014038 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000014039}
14040
/* Number protocol of the str type.  Only nb_remainder is filled in (the
   `%` operator, see unicode_mod); the slots after it are not listed and
   are therefore zero-initialized. */
static PyNumberMethods unicode_as_number = {
    0,              /*nb_add*/
    0,              /*nb_subtract*/
    0,              /*nb_multiply*/
    unicode_mod,            /*nb_remainder*/
};
14047
/* Sequence protocol of the str type: length, concatenation, repetition,
   item access and containment.  The slice and assignment slots are 0;
   slots after sq_contains are not listed and are therefore
   zero-initialized. */
static PySequenceMethods unicode_as_sequence = {
    (lenfunc) unicode_length,       /* sq_length */
    PyUnicode_Concat,           /* sq_concat */
    (ssizeargfunc) unicode_repeat,  /* sq_repeat */
    (ssizeargfunc) unicode_getitem,     /* sq_item */
    0,                  /* sq_slice */
    0,                  /* sq_ass_item */
    0,                  /* sq_ass_slice */
    PyUnicode_Contains,         /* sq_contains */
};
14058
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014059static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014060unicode_subscript(PyObject* self, PyObject* item)
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014061{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014062 if (PyUnicode_READY(self) == -1)
14063 return NULL;
14064
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000014065 if (PyIndex_Check(item)) {
14066 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014067 if (i == -1 && PyErr_Occurred())
14068 return NULL;
14069 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014070 i += PyUnicode_GET_LENGTH(self);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014071 return unicode_getitem(self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014072 } else if (PySlice_Check(item)) {
Zackery Spytz14514d92019-05-17 01:13:03 -060014073 Py_ssize_t start, stop, step, slicelength, i;
14074 size_t cur;
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014075 PyObject *result;
14076 void *src_data, *dest_data;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014077 int src_kind, dest_kind;
Victor Stinnerc80d6d22011-10-05 14:13:28 +020014078 Py_UCS4 ch, max_char, kind_limit;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014079
Serhiy Storchakab879fe82017-04-08 09:53:51 +030014080 if (PySlice_Unpack(item, &start, &stop, &step) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014081 return NULL;
14082 }
Serhiy Storchakab879fe82017-04-08 09:53:51 +030014083 slicelength = PySlice_AdjustIndices(PyUnicode_GET_LENGTH(self),
14084 &start, &stop, step);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014085
14086 if (slicelength <= 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020014087 _Py_RETURN_UNICODE_EMPTY();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014088 } else if (start == 0 && step == 1 &&
Victor Stinnerc4b49542011-12-11 22:44:26 +010014089 slicelength == PyUnicode_GET_LENGTH(self)) {
14090 return unicode_result_unchanged(self);
Thomas Woutersed03b412007-08-28 21:37:11 +000014091 } else if (step == 1) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014092 return PyUnicode_Substring(self,
Victor Stinner12bab6d2011-10-01 01:53:49 +020014093 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014094 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014095 /* General case */
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014096 src_kind = PyUnicode_KIND(self);
14097 src_data = PyUnicode_DATA(self);
Victor Stinner55c99112011-10-13 01:17:06 +020014098 if (!PyUnicode_IS_ASCII(self)) {
14099 kind_limit = kind_maxchar_limit(src_kind);
14100 max_char = 0;
14101 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
14102 ch = PyUnicode_READ(src_kind, src_data, cur);
14103 if (ch > max_char) {
14104 max_char = ch;
14105 if (max_char >= kind_limit)
14106 break;
14107 }
Victor Stinnerc80d6d22011-10-05 14:13:28 +020014108 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014109 }
Victor Stinner55c99112011-10-13 01:17:06 +020014110 else
14111 max_char = 127;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014112 result = PyUnicode_New(slicelength, max_char);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014113 if (result == NULL)
14114 return NULL;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014115 dest_kind = PyUnicode_KIND(result);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014116 dest_data = PyUnicode_DATA(result);
14117
14118 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014119 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
14120 PyUnicode_WRITE(dest_kind, dest_data, i, ch);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014121 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020014122 assert(_PyUnicode_CheckConsistency(result, 1));
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014123 return result;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014124 } else {
14125 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
14126 return NULL;
14127 }
14128}
14129
/* Mapping protocol of the str type: length and subscription (indexing and
   slicing, see unicode_subscript).  Item assignment is not provided. */
static PyMappingMethods unicode_as_mapping = {
    (lenfunc)unicode_length,        /* mp_length */
    (binaryfunc)unicode_subscript,  /* mp_subscript */
    (objobjargproc)0,           /* mp_ass_subscript */
};
14135
Guido van Rossumd57fd912000-03-10 22:53:23 +000014136
Guido van Rossumd57fd912000-03-10 22:53:23 +000014137/* Helpers for PyUnicode_Format() */
14138
/* State shared by the helpers of PyUnicode_Format(). */
struct unicode_formatter_t {
    /* The arguments to format: indexed as a tuple when arglen >= 0,
       returned as a single object when arglen < 0
       (see unicode_format_getnextarg). */
    PyObject *args;
    /* NOTE(review): presumably non-zero when `args` is owned by this
       structure and must be released -- confirm against the users. */
    int args_owned;
    /* arglen: number of items in `args`, negative for a non-tuple `args`.
       argidx: index of the next argument to hand out. */
    Py_ssize_t arglen, argidx;
    /* NOTE(review): presumably the mapping used for %(name)s lookups --
       confirm. */
    PyObject *dict;

    /* NOTE(review): the fmt* fields presumably describe the format string
       (its kind, raw data and object) and the scanning state (characters
       left, current position) -- confirm against PyUnicode_Format(). */
    enum PyUnicode_Kind fmtkind;
    Py_ssize_t fmtcnt, fmtpos;
    void *fmtdata;
    PyObject *fmtstr;

    /* Writer that accumulates the formatted output. */
    _PyUnicodeWriter writer;
};
14152
/* Description of one conversion specifier, as used by the formatting
   helpers (see formatfloat). */
struct unicode_format_arg_t {
    /* Conversion character; formatfloat() passes it to
       PyOS_double_to_string() as the format code. */
    Py_UCS4 ch;
    /* Bitmask of F_* flags; formatfloat() tests F_ALT. */
    int flags;
    /* NOTE(review): presumably the minimum field width -- confirm. */
    Py_ssize_t width;
    /* Precision; a negative value means "not given" (formatfloat() then
       uses 6). */
    int prec;
    /* NOTE(review): presumably tells whether the converted value may carry
       a sign -- confirm against the code that reads it. */
    int sign;
};
14160
Guido van Rossumd57fd912000-03-10 22:53:23 +000014161static PyObject *
Victor Stinnera47082312012-10-04 02:19:54 +020014162unicode_format_getnextarg(struct unicode_formatter_t *ctx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014163{
Victor Stinnera47082312012-10-04 02:19:54 +020014164 Py_ssize_t argidx = ctx->argidx;
14165
14166 if (argidx < ctx->arglen) {
14167 ctx->argidx++;
14168 if (ctx->arglen < 0)
14169 return ctx->args;
Benjamin Peterson29060642009-01-31 22:14:21 +000014170 else
Victor Stinnera47082312012-10-04 02:19:54 +020014171 return PyTuple_GetItem(ctx->args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014172 }
14173 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000014174 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000014175 return NULL;
14176}
14177
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014178/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000014179
Victor Stinnera47082312012-10-04 02:19:54 +020014180/* Format a float into the writer if the writer is not NULL, or into *p_output
14181 otherwise.
14182
14183 Return 0 on success, raise an exception and return -1 on error. */
Victor Stinnerd3f08822012-05-29 12:57:52 +020014184static int
Victor Stinnera47082312012-10-04 02:19:54 +020014185formatfloat(PyObject *v, struct unicode_format_arg_t *arg,
14186 PyObject **p_output,
14187 _PyUnicodeWriter *writer)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014188{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014189 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014190 double x;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014191 Py_ssize_t len;
Victor Stinnera47082312012-10-04 02:19:54 +020014192 int prec;
14193 int dtoa_flags;
Tim Petersced69f82003-09-16 20:30:58 +000014194
Guido van Rossumd57fd912000-03-10 22:53:23 +000014195 x = PyFloat_AsDouble(v);
14196 if (x == -1.0 && PyErr_Occurred())
Victor Stinnerd3f08822012-05-29 12:57:52 +020014197 return -1;
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014198
Victor Stinnera47082312012-10-04 02:19:54 +020014199 prec = arg->prec;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014200 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000014201 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000014202
Victor Stinnera47082312012-10-04 02:19:54 +020014203 if (arg->flags & F_ALT)
14204 dtoa_flags = Py_DTSF_ALT;
14205 else
14206 dtoa_flags = 0;
14207 p = PyOS_double_to_string(x, arg->ch, prec, dtoa_flags, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014208 if (p == NULL)
Victor Stinnerd3f08822012-05-29 12:57:52 +020014209 return -1;
14210 len = strlen(p);
14211 if (writer) {
Victor Stinner4a587072013-11-19 12:54:53 +010014212 if (_PyUnicodeWriter_WriteASCIIString(writer, p, len) < 0) {
Christian Heimesf4f99392012-09-10 11:48:41 +020014213 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020014214 return -1;
Christian Heimesf4f99392012-09-10 11:48:41 +020014215 }
Victor Stinnerd3f08822012-05-29 12:57:52 +020014216 }
14217 else
14218 *p_output = _PyUnicode_FromASCII(p, len);
Eric Smith0923d1d2009-04-16 20:16:10 +000014219 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020014220 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014221}
14222
/* _PyUnicode_FormatLong() emulates the format codes d, u, o, x and X, and
14224 * the F_ALT flag, for Python's long (unbounded) ints. It's not used for
14225 * Python's regular ints.
14226 * Return value: a new PyUnicodeObject*, or NULL if error.
14227 * The output string is of the form
14228 * "-"? ("0x" | "0X")? digit+
14229 * "0x"/"0X" are present only for x and X conversions, with F_ALT
14230 * set in flags. The case of hex digits will be correct,
14231 * There will be at least prec digits, zero-filled on the left if
14232 * necessary to get that many.
14233 * val object to be converted
 *     alt         nonzero if the F_ALT flag is set (the base marker is kept)
14235 * prec minimum number of digits; 0-fill on left if needed
14236 * type a character in [duoxX]; u acts the same as d
14237 *
14238 * CAUTION: o, x and X conversions on regular ints can never
14239 * produce a '-' sign, but can for Python's unbounded ints.
14240 */
Ethan Furmanb95b5612015-01-23 20:05:18 -080014241PyObject *
14242_PyUnicode_FormatLong(PyObject *val, int alt, int prec, int type)
Tim Peters38fd5b62000-09-21 05:43:11 +000014243{
Victor Stinnerd0880d52012-04-27 23:40:13 +020014244 PyObject *result = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014245 char *buf;
Victor Stinnerd0880d52012-04-27 23:40:13 +020014246 Py_ssize_t i;
14247 int sign; /* 1 if '-', else 0 */
14248 int len; /* number of characters */
14249 Py_ssize_t llen;
14250 int numdigits; /* len == numnondigits + numdigits */
14251 int numnondigits = 0;
Tim Peters38fd5b62000-09-21 05:43:11 +000014252
Victor Stinnerd0880d52012-04-27 23:40:13 +020014253 /* Avoid exceeding SSIZE_T_MAX */
14254 if (prec > INT_MAX-3) {
14255 PyErr_SetString(PyExc_OverflowError,
14256 "precision too large");
Benjamin Peterson14339b62009-01-31 16:36:08 +000014257 return NULL;
Victor Stinnerd0880d52012-04-27 23:40:13 +020014258 }
14259
14260 assert(PyLong_Check(val));
14261
14262 switch (type) {
Victor Stinner621ef3d2012-10-02 00:33:47 +020014263 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070014264 Py_UNREACHABLE();
Victor Stinnerd0880d52012-04-27 23:40:13 +020014265 case 'd':
Victor Stinner621ef3d2012-10-02 00:33:47 +020014266 case 'i':
Victor Stinnerd0880d52012-04-27 23:40:13 +020014267 case 'u':
Ethan Furmanfb137212013-08-31 10:18:55 -070014268 /* int and int subclasses should print numerically when a numeric */
14269 /* format code is used (see issue18780) */
14270 result = PyNumber_ToBase(val, 10);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014271 break;
14272 case 'o':
14273 numnondigits = 2;
14274 result = PyNumber_ToBase(val, 8);
14275 break;
14276 case 'x':
14277 case 'X':
14278 numnondigits = 2;
14279 result = PyNumber_ToBase(val, 16);
14280 break;
Victor Stinnerd0880d52012-04-27 23:40:13 +020014281 }
14282 if (!result)
14283 return NULL;
14284
14285 assert(unicode_modifiable(result));
14286 assert(PyUnicode_IS_READY(result));
14287 assert(PyUnicode_IS_ASCII(result));
14288
14289 /* To modify the string in-place, there can only be one reference. */
14290 if (Py_REFCNT(result) != 1) {
Christian Heimesd47802e2013-06-29 21:33:36 +020014291 Py_DECREF(result);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014292 PyErr_BadInternalCall();
14293 return NULL;
14294 }
14295 buf = PyUnicode_DATA(result);
14296 llen = PyUnicode_GET_LENGTH(result);
14297 if (llen > INT_MAX) {
Christian Heimesd47802e2013-06-29 21:33:36 +020014298 Py_DECREF(result);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014299 PyErr_SetString(PyExc_ValueError,
Ethan Furmanb95b5612015-01-23 20:05:18 -080014300 "string too large in _PyUnicode_FormatLong");
Victor Stinnerd0880d52012-04-27 23:40:13 +020014301 return NULL;
14302 }
14303 len = (int)llen;
14304 sign = buf[0] == '-';
14305 numnondigits += sign;
14306 numdigits = len - numnondigits;
14307 assert(numdigits > 0);
14308
14309 /* Get rid of base marker unless F_ALT */
Ethan Furmanb95b5612015-01-23 20:05:18 -080014310 if (((alt) == 0 &&
Victor Stinnerd0880d52012-04-27 23:40:13 +020014311 (type == 'o' || type == 'x' || type == 'X'))) {
14312 assert(buf[sign] == '0');
14313 assert(buf[sign+1] == 'x' || buf[sign+1] == 'X' ||
14314 buf[sign+1] == 'o');
14315 numnondigits -= 2;
14316 buf += 2;
14317 len -= 2;
14318 if (sign)
14319 buf[0] = '-';
14320 assert(len == numnondigits + numdigits);
14321 assert(numdigits > 0);
14322 }
14323
14324 /* Fill with leading zeroes to meet minimum width. */
14325 if (prec > numdigits) {
14326 PyObject *r1 = PyBytes_FromStringAndSize(NULL,
14327 numnondigits + prec);
14328 char *b1;
14329 if (!r1) {
14330 Py_DECREF(result);
14331 return NULL;
14332 }
14333 b1 = PyBytes_AS_STRING(r1);
14334 for (i = 0; i < numnondigits; ++i)
14335 *b1++ = *buf++;
14336 for (i = 0; i < prec - numdigits; i++)
14337 *b1++ = '0';
14338 for (i = 0; i < numdigits; i++)
14339 *b1++ = *buf++;
14340 *b1 = '\0';
14341 Py_DECREF(result);
14342 result = r1;
14343 buf = PyBytes_AS_STRING(result);
14344 len = numnondigits + prec;
14345 }
14346
14347 /* Fix up case for hex conversions. */
14348 if (type == 'X') {
14349 /* Need to convert all lower case letters to upper case.
14350 and need to convert 0x to 0X (and -0x to -0X). */
14351 for (i = 0; i < len; i++)
14352 if (buf[i] >= 'a' && buf[i] <= 'x')
14353 buf[i] -= 'a'-'A';
14354 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014355 if (!PyUnicode_Check(result)
14356 || buf != PyUnicode_DATA(result)) {
Victor Stinnerd0880d52012-04-27 23:40:13 +020014357 PyObject *unicode;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014358 unicode = _PyUnicode_FromASCII(buf, len);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014359 Py_DECREF(result);
14360 result = unicode;
14361 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014362 else if (len != PyUnicode_GET_LENGTH(result)) {
14363 if (PyUnicode_Resize(&result, len) < 0)
14364 Py_CLEAR(result);
14365 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000014366 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000014367}
14368
Ethan Furmandf3ed242014-01-05 06:50:30 -080014369/* Format an integer or a float as an integer.
Victor Stinner621ef3d2012-10-02 00:33:47 +020014370 * Return 1 if the number has been formatted into the writer,
Victor Stinnera47082312012-10-04 02:19:54 +020014371 * 0 if the number has been formatted into *p_output
Victor Stinner621ef3d2012-10-02 00:33:47 +020014372 * -1 and raise an exception on error */
14373static int
Victor Stinnera47082312012-10-04 02:19:54 +020014374mainformatlong(PyObject *v,
14375 struct unicode_format_arg_t *arg,
14376 PyObject **p_output,
14377 _PyUnicodeWriter *writer)
Victor Stinner621ef3d2012-10-02 00:33:47 +020014378{
14379 PyObject *iobj, *res;
Victor Stinnera47082312012-10-04 02:19:54 +020014380 char type = (char)arg->ch;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014381
14382 if (!PyNumber_Check(v))
14383 goto wrongtype;
14384
Ethan Furman9ab74802014-03-21 06:38:46 -070014385 /* make sure number is a type of integer for o, x, and X */
Victor Stinner621ef3d2012-10-02 00:33:47 +020014386 if (!PyLong_Check(v)) {
Ethan Furmandf3ed242014-01-05 06:50:30 -080014387 if (type == 'o' || type == 'x' || type == 'X') {
14388 iobj = PyNumber_Index(v);
14389 if (iobj == NULL) {
Ethan Furman9ab74802014-03-21 06:38:46 -070014390 if (PyErr_ExceptionMatches(PyExc_TypeError))
14391 goto wrongtype;
Ethan Furman38d872e2014-03-19 08:38:52 -070014392 return -1;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014393 }
14394 }
14395 else {
14396 iobj = PyNumber_Long(v);
14397 if (iobj == NULL ) {
14398 if (PyErr_ExceptionMatches(PyExc_TypeError))
14399 goto wrongtype;
14400 return -1;
14401 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014402 }
14403 assert(PyLong_Check(iobj));
14404 }
14405 else {
14406 iobj = v;
14407 Py_INCREF(iobj);
14408 }
14409
14410 if (PyLong_CheckExact(v)
Victor Stinnera47082312012-10-04 02:19:54 +020014411 && arg->width == -1 && arg->prec == -1
14412 && !(arg->flags & (F_SIGN | F_BLANK))
14413 && type != 'X')
Victor Stinner621ef3d2012-10-02 00:33:47 +020014414 {
14415 /* Fast path */
Victor Stinnera47082312012-10-04 02:19:54 +020014416 int alternate = arg->flags & F_ALT;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014417 int base;
14418
Victor Stinnera47082312012-10-04 02:19:54 +020014419 switch(type)
Victor Stinner621ef3d2012-10-02 00:33:47 +020014420 {
14421 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070014422 Py_UNREACHABLE();
Victor Stinner621ef3d2012-10-02 00:33:47 +020014423 case 'd':
14424 case 'i':
14425 case 'u':
14426 base = 10;
14427 break;
14428 case 'o':
14429 base = 8;
14430 break;
14431 case 'x':
14432 case 'X':
14433 base = 16;
14434 break;
14435 }
14436
Victor Stinnerc89d28f2012-10-02 12:54:07 +020014437 if (_PyLong_FormatWriter(writer, v, base, alternate) == -1) {
14438 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014439 return -1;
Victor Stinnerc89d28f2012-10-02 12:54:07 +020014440 }
14441 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014442 return 1;
14443 }
14444
Ethan Furmanb95b5612015-01-23 20:05:18 -080014445 res = _PyUnicode_FormatLong(iobj, arg->flags & F_ALT, arg->prec, type);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014446 Py_DECREF(iobj);
14447 if (res == NULL)
14448 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014449 *p_output = res;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014450 return 0;
14451
14452wrongtype:
Ethan Furman9ab74802014-03-21 06:38:46 -070014453 switch(type)
14454 {
14455 case 'o':
14456 case 'x':
14457 case 'X':
14458 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020014459 "%%%c format: an integer is required, "
14460 "not %.200s",
14461 type, Py_TYPE(v)->tp_name);
Ethan Furman9ab74802014-03-21 06:38:46 -070014462 break;
14463 default:
14464 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020014465 "%%%c format: a number is required, "
14466 "not %.200s",
14467 type, Py_TYPE(v)->tp_name);
Ethan Furman9ab74802014-03-21 06:38:46 -070014468 break;
14469 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014470 return -1;
14471}
14472
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014473static Py_UCS4
14474formatchar(PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014475{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000014476 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000014477 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014478 if (PyUnicode_GET_LENGTH(v) == 1) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014479 return PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000014480 }
Benjamin Peterson29060642009-01-31 22:14:21 +000014481 goto onError;
14482 }
14483 else {
Ethan Furmandf3ed242014-01-05 06:50:30 -080014484 PyObject *iobj;
Benjamin Peterson29060642009-01-31 22:14:21 +000014485 long x;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014486 /* make sure number is a type of integer */
14487 if (!PyLong_Check(v)) {
14488 iobj = PyNumber_Index(v);
14489 if (iobj == NULL) {
Ethan Furman38d872e2014-03-19 08:38:52 -070014490 goto onError;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014491 }
Xiang Zhangea1cf872016-12-22 15:30:47 +080014492 x = PyLong_AsLong(iobj);
Ethan Furmandf3ed242014-01-05 06:50:30 -080014493 Py_DECREF(iobj);
14494 }
Xiang Zhangea1cf872016-12-22 15:30:47 +080014495 else {
14496 x = PyLong_AsLong(v);
14497 }
Benjamin Peterson29060642009-01-31 22:14:21 +000014498 if (x == -1 && PyErr_Occurred())
14499 goto onError;
14500
Victor Stinner8faf8212011-12-08 22:14:11 +010014501 if (x < 0 || x > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014502 PyErr_SetString(PyExc_OverflowError,
14503 "%c arg not in range(0x110000)");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014504 return (Py_UCS4) -1;
Benjamin Peterson29060642009-01-31 22:14:21 +000014505 }
14506
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014507 return (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014508 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000014509
Benjamin Peterson29060642009-01-31 22:14:21 +000014510 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000014511 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000014512 "%c requires int or char");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014513 return (Py_UCS4) -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014514}
14515
Victor Stinnera47082312012-10-04 02:19:54 +020014516/* Parse options of an argument: flags, width, precision.
14517 Handle also "%(name)" syntax.
14518
14519 Return 0 if the argument has been formatted into arg->str.
14520 Return 1 if the argument has been written into ctx->writer,
14521 Raise an exception and return -1 on error. */
14522static int
14523unicode_format_arg_parse(struct unicode_formatter_t *ctx,
14524 struct unicode_format_arg_t *arg)
14525{
14526#define FORMAT_READ(ctx) \
14527 PyUnicode_READ((ctx)->fmtkind, (ctx)->fmtdata, (ctx)->fmtpos)
14528
14529 PyObject *v;
14530
Victor Stinnera47082312012-10-04 02:19:54 +020014531 if (arg->ch == '(') {
14532 /* Get argument value from a dictionary. Example: "%(name)s". */
14533 Py_ssize_t keystart;
14534 Py_ssize_t keylen;
14535 PyObject *key;
14536 int pcount = 1;
14537
14538 if (ctx->dict == NULL) {
14539 PyErr_SetString(PyExc_TypeError,
14540 "format requires a mapping");
14541 return -1;
14542 }
14543 ++ctx->fmtpos;
14544 --ctx->fmtcnt;
14545 keystart = ctx->fmtpos;
14546 /* Skip over balanced parentheses */
14547 while (pcount > 0 && --ctx->fmtcnt >= 0) {
14548 arg->ch = FORMAT_READ(ctx);
14549 if (arg->ch == ')')
14550 --pcount;
14551 else if (arg->ch == '(')
14552 ++pcount;
14553 ctx->fmtpos++;
14554 }
14555 keylen = ctx->fmtpos - keystart - 1;
14556 if (ctx->fmtcnt < 0 || pcount > 0) {
14557 PyErr_SetString(PyExc_ValueError,
14558 "incomplete format key");
14559 return -1;
14560 }
14561 key = PyUnicode_Substring(ctx->fmtstr,
14562 keystart, keystart + keylen);
14563 if (key == NULL)
14564 return -1;
14565 if (ctx->args_owned) {
Victor Stinnera47082312012-10-04 02:19:54 +020014566 ctx->args_owned = 0;
Serhiy Storchaka191321d2015-12-27 15:41:34 +020014567 Py_DECREF(ctx->args);
Victor Stinnera47082312012-10-04 02:19:54 +020014568 }
14569 ctx->args = PyObject_GetItem(ctx->dict, key);
14570 Py_DECREF(key);
14571 if (ctx->args == NULL)
14572 return -1;
14573 ctx->args_owned = 1;
14574 ctx->arglen = -1;
14575 ctx->argidx = -2;
14576 }
14577
14578 /* Parse flags. Example: "%+i" => flags=F_SIGN. */
Victor Stinnera47082312012-10-04 02:19:54 +020014579 while (--ctx->fmtcnt >= 0) {
14580 arg->ch = FORMAT_READ(ctx);
14581 ctx->fmtpos++;
14582 switch (arg->ch) {
14583 case '-': arg->flags |= F_LJUST; continue;
14584 case '+': arg->flags |= F_SIGN; continue;
14585 case ' ': arg->flags |= F_BLANK; continue;
14586 case '#': arg->flags |= F_ALT; continue;
14587 case '0': arg->flags |= F_ZERO; continue;
14588 }
14589 break;
14590 }
14591
14592 /* Parse width. Example: "%10s" => width=10 */
Victor Stinnera47082312012-10-04 02:19:54 +020014593 if (arg->ch == '*') {
14594 v = unicode_format_getnextarg(ctx);
14595 if (v == NULL)
14596 return -1;
14597 if (!PyLong_Check(v)) {
14598 PyErr_SetString(PyExc_TypeError,
14599 "* wants int");
14600 return -1;
14601 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020014602 arg->width = PyLong_AsSsize_t(v);
Victor Stinnera47082312012-10-04 02:19:54 +020014603 if (arg->width == -1 && PyErr_Occurred())
14604 return -1;
14605 if (arg->width < 0) {
14606 arg->flags |= F_LJUST;
14607 arg->width = -arg->width;
14608 }
14609 if (--ctx->fmtcnt >= 0) {
14610 arg->ch = FORMAT_READ(ctx);
14611 ctx->fmtpos++;
14612 }
14613 }
14614 else if (arg->ch >= '0' && arg->ch <= '9') {
14615 arg->width = arg->ch - '0';
14616 while (--ctx->fmtcnt >= 0) {
14617 arg->ch = FORMAT_READ(ctx);
14618 ctx->fmtpos++;
14619 if (arg->ch < '0' || arg->ch > '9')
14620 break;
14621 /* Since arg->ch is unsigned, the RHS would end up as unsigned,
14622 mixing signed and unsigned comparison. Since arg->ch is between
14623 '0' and '9', casting to int is safe. */
14624 if (arg->width > (PY_SSIZE_T_MAX - ((int)arg->ch - '0')) / 10) {
14625 PyErr_SetString(PyExc_ValueError,
14626 "width too big");
14627 return -1;
14628 }
14629 arg->width = arg->width*10 + (arg->ch - '0');
14630 }
14631 }
14632
14633 /* Parse precision. Example: "%.3f" => prec=3 */
Victor Stinnera47082312012-10-04 02:19:54 +020014634 if (arg->ch == '.') {
14635 arg->prec = 0;
14636 if (--ctx->fmtcnt >= 0) {
14637 arg->ch = FORMAT_READ(ctx);
14638 ctx->fmtpos++;
14639 }
14640 if (arg->ch == '*') {
14641 v = unicode_format_getnextarg(ctx);
14642 if (v == NULL)
14643 return -1;
14644 if (!PyLong_Check(v)) {
14645 PyErr_SetString(PyExc_TypeError,
14646 "* wants int");
14647 return -1;
14648 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020014649 arg->prec = _PyLong_AsInt(v);
Victor Stinnera47082312012-10-04 02:19:54 +020014650 if (arg->prec == -1 && PyErr_Occurred())
14651 return -1;
14652 if (arg->prec < 0)
14653 arg->prec = 0;
14654 if (--ctx->fmtcnt >= 0) {
14655 arg->ch = FORMAT_READ(ctx);
14656 ctx->fmtpos++;
14657 }
14658 }
14659 else if (arg->ch >= '0' && arg->ch <= '9') {
14660 arg->prec = arg->ch - '0';
14661 while (--ctx->fmtcnt >= 0) {
14662 arg->ch = FORMAT_READ(ctx);
14663 ctx->fmtpos++;
14664 if (arg->ch < '0' || arg->ch > '9')
14665 break;
14666 if (arg->prec > (INT_MAX - ((int)arg->ch - '0')) / 10) {
14667 PyErr_SetString(PyExc_ValueError,
Victor Stinner3921e902012-10-06 23:05:00 +020014668 "precision too big");
Victor Stinnera47082312012-10-04 02:19:54 +020014669 return -1;
14670 }
14671 arg->prec = arg->prec*10 + (arg->ch - '0');
14672 }
14673 }
14674 }
14675
14676 /* Ignore "h", "l" and "L" format prefix (ex: "%hi" or "%ls") */
14677 if (ctx->fmtcnt >= 0) {
14678 if (arg->ch == 'h' || arg->ch == 'l' || arg->ch == 'L') {
14679 if (--ctx->fmtcnt >= 0) {
14680 arg->ch = FORMAT_READ(ctx);
14681 ctx->fmtpos++;
14682 }
14683 }
14684 }
14685 if (ctx->fmtcnt < 0) {
14686 PyErr_SetString(PyExc_ValueError,
14687 "incomplete format");
14688 return -1;
14689 }
14690 return 0;
14691
14692#undef FORMAT_READ
14693}
14694
14695/* Format one argument. Supported conversion specifiers:
14696
14697 - "s", "r", "a": any type
Ethan Furmandf3ed242014-01-05 06:50:30 -080014698 - "i", "d", "u": int or float
14699 - "o", "x", "X": int
Victor Stinnera47082312012-10-04 02:19:54 +020014700 - "e", "E", "f", "F", "g", "G": float
14701 - "c": int or str (1 character)
14702
Victor Stinner8dbd4212012-12-04 09:30:24 +010014703 When possible, the output is written directly into the Unicode writer
14704 (ctx->writer). A string is created when padding is required.
14705
Victor Stinnera47082312012-10-04 02:19:54 +020014706 Return 0 if the argument has been formatted into *p_str,
14707 1 if the argument has been written into ctx->writer,
Victor Stinner8dbd4212012-12-04 09:30:24 +010014708 -1 on error. */
Victor Stinnera47082312012-10-04 02:19:54 +020014709static int
14710unicode_format_arg_format(struct unicode_formatter_t *ctx,
14711 struct unicode_format_arg_t *arg,
14712 PyObject **p_str)
14713{
14714 PyObject *v;
14715 _PyUnicodeWriter *writer = &ctx->writer;
14716
14717 if (ctx->fmtcnt == 0)
14718 ctx->writer.overallocate = 0;
14719
Victor Stinnera47082312012-10-04 02:19:54 +020014720 v = unicode_format_getnextarg(ctx);
14721 if (v == NULL)
14722 return -1;
14723
Victor Stinnera47082312012-10-04 02:19:54 +020014724
14725 switch (arg->ch) {
Victor Stinnera47082312012-10-04 02:19:54 +020014726 case 's':
14727 case 'r':
14728 case 'a':
14729 if (PyLong_CheckExact(v) && arg->width == -1 && arg->prec == -1) {
14730 /* Fast path */
14731 if (_PyLong_FormatWriter(writer, v, 10, arg->flags & F_ALT) == -1)
14732 return -1;
14733 return 1;
14734 }
14735
14736 if (PyUnicode_CheckExact(v) && arg->ch == 's') {
14737 *p_str = v;
14738 Py_INCREF(*p_str);
14739 }
14740 else {
14741 if (arg->ch == 's')
14742 *p_str = PyObject_Str(v);
14743 else if (arg->ch == 'r')
14744 *p_str = PyObject_Repr(v);
14745 else
14746 *p_str = PyObject_ASCII(v);
14747 }
14748 break;
14749
14750 case 'i':
14751 case 'd':
14752 case 'u':
14753 case 'o':
14754 case 'x':
14755 case 'X':
14756 {
14757 int ret = mainformatlong(v, arg, p_str, writer);
14758 if (ret != 0)
14759 return ret;
14760 arg->sign = 1;
14761 break;
14762 }
14763
14764 case 'e':
14765 case 'E':
14766 case 'f':
14767 case 'F':
14768 case 'g':
14769 case 'G':
14770 if (arg->width == -1 && arg->prec == -1
14771 && !(arg->flags & (F_SIGN | F_BLANK)))
14772 {
14773 /* Fast path */
14774 if (formatfloat(v, arg, NULL, writer) == -1)
14775 return -1;
14776 return 1;
14777 }
14778
14779 arg->sign = 1;
14780 if (formatfloat(v, arg, p_str, NULL) == -1)
14781 return -1;
14782 break;
14783
14784 case 'c':
14785 {
14786 Py_UCS4 ch = formatchar(v);
14787 if (ch == (Py_UCS4) -1)
14788 return -1;
14789 if (arg->width == -1 && arg->prec == -1) {
14790 /* Fast path */
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020014791 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0)
Victor Stinnera47082312012-10-04 02:19:54 +020014792 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014793 return 1;
14794 }
14795 *p_str = PyUnicode_FromOrdinal(ch);
14796 break;
14797 }
14798
14799 default:
14800 PyErr_Format(PyExc_ValueError,
14801 "unsupported format character '%c' (0x%x) "
Victor Stinnera33bce02014-07-04 22:47:46 +020014802 "at index %zd",
Victor Stinnera47082312012-10-04 02:19:54 +020014803 (31<=arg->ch && arg->ch<=126) ? (char)arg->ch : '?',
14804 (int)arg->ch,
14805 ctx->fmtpos - 1);
14806 return -1;
14807 }
14808 if (*p_str == NULL)
14809 return -1;
14810 assert (PyUnicode_Check(*p_str));
14811 return 0;
14812}
14813
14814static int
14815unicode_format_arg_output(struct unicode_formatter_t *ctx,
14816 struct unicode_format_arg_t *arg,
14817 PyObject *str)
14818{
14819 Py_ssize_t len;
14820 enum PyUnicode_Kind kind;
14821 void *pbuf;
14822 Py_ssize_t pindex;
14823 Py_UCS4 signchar;
14824 Py_ssize_t buflen;
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014825 Py_UCS4 maxchar;
Victor Stinnera47082312012-10-04 02:19:54 +020014826 Py_ssize_t sublen;
14827 _PyUnicodeWriter *writer = &ctx->writer;
14828 Py_UCS4 fill;
14829
14830 fill = ' ';
14831 if (arg->sign && arg->flags & F_ZERO)
14832 fill = '0';
14833
14834 if (PyUnicode_READY(str) == -1)
14835 return -1;
14836
14837 len = PyUnicode_GET_LENGTH(str);
14838 if ((arg->width == -1 || arg->width <= len)
14839 && (arg->prec == -1 || arg->prec >= len)
14840 && !(arg->flags & (F_SIGN | F_BLANK)))
14841 {
14842 /* Fast path */
14843 if (_PyUnicodeWriter_WriteStr(writer, str) == -1)
14844 return -1;
14845 return 0;
14846 }
14847
14848 /* Truncate the string for "s", "r" and "a" formats
14849 if the precision is set */
14850 if (arg->ch == 's' || arg->ch == 'r' || arg->ch == 'a') {
14851 if (arg->prec >= 0 && len > arg->prec)
14852 len = arg->prec;
14853 }
14854
14855 /* Adjust sign and width */
14856 kind = PyUnicode_KIND(str);
14857 pbuf = PyUnicode_DATA(str);
14858 pindex = 0;
14859 signchar = '\0';
14860 if (arg->sign) {
14861 Py_UCS4 ch = PyUnicode_READ(kind, pbuf, pindex);
14862 if (ch == '-' || ch == '+') {
14863 signchar = ch;
14864 len--;
14865 pindex++;
14866 }
14867 else if (arg->flags & F_SIGN)
14868 signchar = '+';
14869 else if (arg->flags & F_BLANK)
14870 signchar = ' ';
14871 else
14872 arg->sign = 0;
14873 }
14874 if (arg->width < len)
14875 arg->width = len;
14876
14877 /* Prepare the writer */
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014878 maxchar = writer->maxchar;
Victor Stinnera47082312012-10-04 02:19:54 +020014879 if (!(arg->flags & F_LJUST)) {
14880 if (arg->sign) {
14881 if ((arg->width-1) > len)
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014882 maxchar = Py_MAX(maxchar, fill);
Victor Stinnera47082312012-10-04 02:19:54 +020014883 }
14884 else {
14885 if (arg->width > len)
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014886 maxchar = Py_MAX(maxchar, fill);
Victor Stinnera47082312012-10-04 02:19:54 +020014887 }
14888 }
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014889 if (PyUnicode_MAX_CHAR_VALUE(str) > maxchar) {
14890 Py_UCS4 strmaxchar = _PyUnicode_FindMaxChar(str, 0, pindex+len);
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014891 maxchar = Py_MAX(maxchar, strmaxchar);
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014892 }
14893
Victor Stinnera47082312012-10-04 02:19:54 +020014894 buflen = arg->width;
14895 if (arg->sign && len == arg->width)
14896 buflen++;
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014897 if (_PyUnicodeWriter_Prepare(writer, buflen, maxchar) == -1)
Victor Stinnera47082312012-10-04 02:19:54 +020014898 return -1;
14899
14900 /* Write the sign if needed */
14901 if (arg->sign) {
14902 if (fill != ' ') {
14903 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
14904 writer->pos += 1;
14905 }
14906 if (arg->width > len)
14907 arg->width--;
14908 }
14909
14910 /* Write the numeric prefix for "x", "X" and "o" formats
14911 if the alternate form is used.
14912 For example, write "0x" for the "%#x" format. */
14913 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
14914 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
14915 assert(PyUnicode_READ(kind, pbuf, pindex + 1) == arg->ch);
14916 if (fill != ' ') {
14917 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
14918 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
14919 writer->pos += 2;
14920 pindex += 2;
14921 }
14922 arg->width -= 2;
14923 if (arg->width < 0)
14924 arg->width = 0;
14925 len -= 2;
14926 }
14927
14928 /* Pad left with the fill character if needed */
14929 if (arg->width > len && !(arg->flags & F_LJUST)) {
14930 sublen = arg->width - len;
Victor Stinner59423e32018-11-26 13:40:01 +010014931 unicode_fill(writer->kind, writer->data, fill, writer->pos, sublen);
Victor Stinnera47082312012-10-04 02:19:54 +020014932 writer->pos += sublen;
14933 arg->width = len;
14934 }
14935
14936 /* If padding with spaces: write sign if needed and/or numeric prefix if
14937 the alternate form is used */
14938 if (fill == ' ') {
14939 if (arg->sign) {
14940 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
14941 writer->pos += 1;
14942 }
14943 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
14944 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
14945 assert(PyUnicode_READ(kind, pbuf, pindex+1) == arg->ch);
14946 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
14947 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
14948 writer->pos += 2;
14949 pindex += 2;
14950 }
14951 }
14952
14953 /* Write characters */
14954 if (len) {
14955 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
14956 str, pindex, len);
14957 writer->pos += len;
14958 }
14959
14960 /* Pad right with the fill character if needed */
14961 if (arg->width > len) {
14962 sublen = arg->width - len;
Victor Stinner59423e32018-11-26 13:40:01 +010014963 unicode_fill(writer->kind, writer->data, ' ', writer->pos, sublen);
Victor Stinnera47082312012-10-04 02:19:54 +020014964 writer->pos += sublen;
14965 }
14966 return 0;
14967}
14968
14969/* Helper of PyUnicode_Format(): format one arg.
14970 Return 0 on success, raise an exception and return -1 on error. */
14971static int
14972unicode_format_arg(struct unicode_formatter_t *ctx)
14973{
14974 struct unicode_format_arg_t arg;
14975 PyObject *str;
14976 int ret;
14977
Victor Stinner8dbd4212012-12-04 09:30:24 +010014978 arg.ch = PyUnicode_READ(ctx->fmtkind, ctx->fmtdata, ctx->fmtpos);
Serhiy Storchaka9f8ad3f2017-03-08 05:51:19 +020014979 if (arg.ch == '%') {
14980 ctx->fmtpos++;
14981 ctx->fmtcnt--;
14982 if (_PyUnicodeWriter_WriteCharInline(&ctx->writer, '%') < 0)
14983 return -1;
14984 return 0;
14985 }
Victor Stinner8dbd4212012-12-04 09:30:24 +010014986 arg.flags = 0;
14987 arg.width = -1;
14988 arg.prec = -1;
14989 arg.sign = 0;
14990 str = NULL;
14991
Victor Stinnera47082312012-10-04 02:19:54 +020014992 ret = unicode_format_arg_parse(ctx, &arg);
14993 if (ret == -1)
14994 return -1;
14995
14996 ret = unicode_format_arg_format(ctx, &arg, &str);
14997 if (ret == -1)
14998 return -1;
14999
15000 if (ret != 1) {
15001 ret = unicode_format_arg_output(ctx, &arg, str);
15002 Py_DECREF(str);
15003 if (ret == -1)
15004 return -1;
15005 }
15006
Serhiy Storchaka9f8ad3f2017-03-08 05:51:19 +020015007 if (ctx->dict && (ctx->argidx < ctx->arglen)) {
Victor Stinnera47082312012-10-04 02:19:54 +020015008 PyErr_SetString(PyExc_TypeError,
15009 "not all arguments converted during string formatting");
15010 return -1;
15011 }
15012 return 0;
15013}
15014
Alexander Belopolsky40018472011-02-26 01:02:56 +000015015PyObject *
15016PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000015017{
Victor Stinnera47082312012-10-04 02:19:54 +020015018 struct unicode_formatter_t ctx;
Tim Petersced69f82003-09-16 20:30:58 +000015019
Guido van Rossumd57fd912000-03-10 22:53:23 +000015020 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000015021 PyErr_BadInternalCall();
15022 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015023 }
Victor Stinnera47082312012-10-04 02:19:54 +020015024
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030015025 if (ensure_unicode(format) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000015026 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030015027
15028 ctx.fmtstr = format;
Victor Stinnera47082312012-10-04 02:19:54 +020015029 ctx.fmtdata = PyUnicode_DATA(ctx.fmtstr);
15030 ctx.fmtkind = PyUnicode_KIND(ctx.fmtstr);
15031 ctx.fmtcnt = PyUnicode_GET_LENGTH(ctx.fmtstr);
15032 ctx.fmtpos = 0;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020015033
Victor Stinner8f674cc2013-04-17 23:02:17 +020015034 _PyUnicodeWriter_Init(&ctx.writer);
15035 ctx.writer.min_length = ctx.fmtcnt + 100;
15036 ctx.writer.overallocate = 1;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020015037
Guido van Rossumd57fd912000-03-10 22:53:23 +000015038 if (PyTuple_Check(args)) {
Victor Stinnera47082312012-10-04 02:19:54 +020015039 ctx.arglen = PyTuple_Size(args);
15040 ctx.argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015041 }
15042 else {
Victor Stinnera47082312012-10-04 02:19:54 +020015043 ctx.arglen = -1;
15044 ctx.argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015045 }
Victor Stinnera47082312012-10-04 02:19:54 +020015046 ctx.args_owned = 0;
Benjamin Peterson28a6cfa2012-08-28 17:55:35 -040015047 if (PyMapping_Check(args) && !PyTuple_Check(args) && !PyUnicode_Check(args))
Victor Stinnera47082312012-10-04 02:19:54 +020015048 ctx.dict = args;
15049 else
15050 ctx.dict = NULL;
15051 ctx.args = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015052
Victor Stinnera47082312012-10-04 02:19:54 +020015053 while (--ctx.fmtcnt >= 0) {
15054 if (PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
Victor Stinnercfc4c132013-04-03 01:48:39 +020015055 Py_ssize_t nonfmtpos;
Victor Stinnera47082312012-10-04 02:19:54 +020015056
15057 nonfmtpos = ctx.fmtpos++;
15058 while (ctx.fmtcnt >= 0 &&
15059 PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
15060 ctx.fmtpos++;
15061 ctx.fmtcnt--;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015062 }
Victor Stinnera47082312012-10-04 02:19:54 +020015063 if (ctx.fmtcnt < 0) {
15064 ctx.fmtpos--;
15065 ctx.writer.overallocate = 0;
Victor Stinnera0494432012-10-03 23:03:46 +020015066 }
Victor Stinneree4544c2012-05-09 22:24:08 +020015067
Victor Stinnercfc4c132013-04-03 01:48:39 +020015068 if (_PyUnicodeWriter_WriteSubstring(&ctx.writer, ctx.fmtstr,
15069 nonfmtpos, ctx.fmtpos) < 0)
15070 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015071 }
15072 else {
Victor Stinnera47082312012-10-04 02:19:54 +020015073 ctx.fmtpos++;
15074 if (unicode_format_arg(&ctx) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000015075 goto onError;
Victor Stinnera47082312012-10-04 02:19:54 +020015076 }
15077 }
Victor Stinneraff3cc62012-04-30 05:19:21 +020015078
Victor Stinnera47082312012-10-04 02:19:54 +020015079 if (ctx.argidx < ctx.arglen && !ctx.dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000015080 PyErr_SetString(PyExc_TypeError,
15081 "not all arguments converted during string formatting");
15082 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015083 }
15084
Victor Stinnera47082312012-10-04 02:19:54 +020015085 if (ctx.args_owned) {
15086 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000015087 }
Victor Stinnera47082312012-10-04 02:19:54 +020015088 return _PyUnicodeWriter_Finish(&ctx.writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +000015089
Benjamin Peterson29060642009-01-31 22:14:21 +000015090 onError:
Victor Stinnera47082312012-10-04 02:19:54 +020015091 _PyUnicodeWriter_Dealloc(&ctx.writer);
15092 if (ctx.args_owned) {
15093 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000015094 }
15095 return NULL;
15096}
15097
Jeremy Hylton938ace62002-07-17 16:30:39 +000015098static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000015099unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
15100
Tim Peters6d6c1a32001-08-02 04:15:00 +000015101static PyObject *
15102unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
15103{
Benjamin Peterson29060642009-01-31 22:14:21 +000015104 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015105 static char *kwlist[] = {"object", "encoding", "errors", 0};
15106 char *encoding = NULL;
15107 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000015108
Benjamin Peterson14339b62009-01-31 16:36:08 +000015109 if (type != &PyUnicode_Type)
15110 return unicode_subtype_new(type, args, kwds);
15111 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000015112 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000015113 return NULL;
15114 if (x == NULL)
Serhiy Storchaka678db842013-01-26 12:16:36 +020015115 _Py_RETURN_UNICODE_EMPTY();
Benjamin Peterson14339b62009-01-31 16:36:08 +000015116 if (encoding == NULL && errors == NULL)
15117 return PyObject_Str(x);
15118 else
Benjamin Peterson29060642009-01-31 22:14:21 +000015119 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000015120}
15121
Guido van Rossume023fe02001-08-30 03:12:59 +000015122static PyObject *
15123unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
15124{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015125 PyObject *unicode, *self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015126 Py_ssize_t length, char_size;
15127 int share_wstr, share_utf8;
15128 unsigned int kind;
15129 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000015130
Benjamin Peterson14339b62009-01-31 16:36:08 +000015131 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015132
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015133 unicode = unicode_new(&PyUnicode_Type, args, kwds);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015134 if (unicode == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000015135 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020015136 assert(_PyUnicode_CHECK(unicode));
Benjamin Petersonbac79492012-01-14 13:34:47 -050015137 if (PyUnicode_READY(unicode) == -1) {
Benjamin Peterson22a29702012-01-02 09:00:30 -060015138 Py_DECREF(unicode);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015139 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060015140 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015141
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015142 self = type->tp_alloc(type, 0);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015143 if (self == NULL) {
15144 Py_DECREF(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015145 return NULL;
15146 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015147 kind = PyUnicode_KIND(unicode);
15148 length = PyUnicode_GET_LENGTH(unicode);
15149
15150 _PyUnicode_LENGTH(self) = length;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020015151#ifdef Py_DEBUG
15152 _PyUnicode_HASH(self) = -1;
15153#else
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015154 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020015155#endif
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015156 _PyUnicode_STATE(self).interned = 0;
15157 _PyUnicode_STATE(self).kind = kind;
15158 _PyUnicode_STATE(self).compact = 0;
Victor Stinner3cf46372011-10-03 14:42:15 +020015159 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015160 _PyUnicode_STATE(self).ready = 1;
15161 _PyUnicode_WSTR(self) = NULL;
15162 _PyUnicode_UTF8_LENGTH(self) = 0;
15163 _PyUnicode_UTF8(self) = NULL;
15164 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020015165 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015166
15167 share_utf8 = 0;
15168 share_wstr = 0;
15169 if (kind == PyUnicode_1BYTE_KIND) {
15170 char_size = 1;
15171 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
15172 share_utf8 = 1;
15173 }
15174 else if (kind == PyUnicode_2BYTE_KIND) {
15175 char_size = 2;
15176 if (sizeof(wchar_t) == 2)
15177 share_wstr = 1;
15178 }
15179 else {
15180 assert(kind == PyUnicode_4BYTE_KIND);
15181 char_size = 4;
15182 if (sizeof(wchar_t) == 4)
15183 share_wstr = 1;
15184 }
15185
15186 /* Ensure we won't overflow the length. */
15187 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
15188 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015189 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015190 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015191 data = PyObject_MALLOC((length + 1) * char_size);
15192 if (data == NULL) {
15193 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015194 goto onError;
15195 }
15196
Victor Stinnerc3c74152011-10-02 20:39:55 +020015197 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015198 if (share_utf8) {
15199 _PyUnicode_UTF8_LENGTH(self) = length;
15200 _PyUnicode_UTF8(self) = data;
15201 }
15202 if (share_wstr) {
15203 _PyUnicode_WSTR_LENGTH(self) = length;
15204 _PyUnicode_WSTR(self) = (wchar_t *)data;
15205 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015206
Christian Heimesf051e432016-09-13 20:22:02 +020015207 memcpy(data, PyUnicode_DATA(unicode),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020015208 kind * (length + 1));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020015209 assert(_PyUnicode_CheckConsistency(self, 1));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020015210#ifdef Py_DEBUG
15211 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
15212#endif
Victor Stinnerdd18d3a2011-10-22 11:08:10 +020015213 Py_DECREF(unicode);
Victor Stinner7931d9a2011-11-04 00:22:48 +010015214 return self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015215
15216onError:
15217 Py_DECREF(unicode);
15218 Py_DECREF(self);
15219 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000015220}
15221
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000015222PyDoc_STRVAR(unicode_doc,
Chris Jerdonek83fe2e12012-10-07 14:48:36 -070015223"str(object='') -> str\n\
15224str(bytes_or_buffer[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +000015225\n\
Nick Coghlan573b1fd2012-08-16 14:13:07 +100015226Create a new string object from the given object. If encoding or\n\
15227errors is specified, then the object must expose a data buffer\n\
15228that will be decoded using the given encoding and error handler.\n\
15229Otherwise, returns the result of object.__str__() (if defined)\n\
15230or repr(object).\n\
15231encoding defaults to sys.getdefaultencoding().\n\
15232errors defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000015233
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015234static PyObject *unicode_iter(PyObject *seq);
15235
Guido van Rossumd57fd912000-03-10 22:53:23 +000015236PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +000015237 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Bupfc93bd42018-06-19 03:59:55 -050015238 "str", /* tp_name */
15239 sizeof(PyUnicodeObject), /* tp_basicsize */
15240 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +000015241 /* Slots */
Bupfc93bd42018-06-19 03:59:55 -050015242 (destructor)unicode_dealloc, /* tp_dealloc */
Jeroen Demeyer530f5062019-05-31 04:13:39 +020015243 0, /* tp_vectorcall_offset */
Bupfc93bd42018-06-19 03:59:55 -050015244 0, /* tp_getattr */
15245 0, /* tp_setattr */
Jeroen Demeyer530f5062019-05-31 04:13:39 +020015246 0, /* tp_as_async */
Bupfc93bd42018-06-19 03:59:55 -050015247 unicode_repr, /* tp_repr */
15248 &unicode_as_number, /* tp_as_number */
15249 &unicode_as_sequence, /* tp_as_sequence */
15250 &unicode_as_mapping, /* tp_as_mapping */
15251 (hashfunc) unicode_hash, /* tp_hash*/
15252 0, /* tp_call*/
15253 (reprfunc) unicode_str, /* tp_str */
15254 PyObject_GenericGetAttr, /* tp_getattro */
15255 0, /* tp_setattro */
15256 0, /* tp_as_buffer */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015257 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Bupfc93bd42018-06-19 03:59:55 -050015258 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
15259 unicode_doc, /* tp_doc */
15260 0, /* tp_traverse */
15261 0, /* tp_clear */
15262 PyUnicode_RichCompare, /* tp_richcompare */
15263 0, /* tp_weaklistoffset */
15264 unicode_iter, /* tp_iter */
15265 0, /* tp_iternext */
15266 unicode_methods, /* tp_methods */
15267 0, /* tp_members */
15268 0, /* tp_getset */
15269 &PyBaseObject_Type, /* tp_base */
15270 0, /* tp_dict */
15271 0, /* tp_descr_get */
15272 0, /* tp_descr_set */
15273 0, /* tp_dictoffset */
15274 0, /* tp_init */
15275 0, /* tp_alloc */
15276 unicode_new, /* tp_new */
15277 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +000015278};
15279
15280/* Initialize the Unicode implementation */
15281
Victor Stinner331a6a52019-05-27 16:39:22 +020015282PyStatus
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015283_PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000015284{
Thomas Wouters477c8d52006-05-27 19:21:47 +000015285 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015286 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000015287 0x000A, /* LINE FEED */
15288 0x000D, /* CARRIAGE RETURN */
15289 0x001C, /* FILE SEPARATOR */
15290 0x001D, /* GROUP SEPARATOR */
15291 0x001E, /* RECORD SEPARATOR */
15292 0x0085, /* NEXT LINE */
15293 0x2028, /* LINE SEPARATOR */
15294 0x2029, /* PARAGRAPH SEPARATOR */
15295 };
15296
Fred Drakee4315f52000-05-09 19:53:39 +000015297 /* Init the implementation */
Serhiy Storchaka678db842013-01-26 12:16:36 +020015298 _Py_INCREF_UNICODE_EMPTY();
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015299 if (!unicode_empty) {
Victor Stinner331a6a52019-05-27 16:39:22 +020015300 return _PyStatus_ERR("Can't create empty string");
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015301 }
Serhiy Storchaka678db842013-01-26 12:16:36 +020015302 Py_DECREF(unicode_empty);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000015303
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015304 if (PyType_Ready(&PyUnicode_Type) < 0) {
Victor Stinner331a6a52019-05-27 16:39:22 +020015305 return _PyStatus_ERR("Can't initialize unicode type");
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015306 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000015307
15308 /* initialize the linebreak bloom filter */
15309 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015310 PyUnicode_2BYTE_KIND, linebreak,
Victor Stinner63941882011-09-29 00:42:28 +020015311 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000015312
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015313 if (PyType_Ready(&EncodingMapType) < 0) {
Victor Stinner331a6a52019-05-27 16:39:22 +020015314 return _PyStatus_ERR("Can't initialize encoding map type");
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015315 }
15316 if (PyType_Ready(&PyFieldNameIter_Type) < 0) {
Victor Stinner331a6a52019-05-27 16:39:22 +020015317 return _PyStatus_ERR("Can't initialize field name iterator type");
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015318 }
15319 if (PyType_Ready(&PyFormatterIter_Type) < 0) {
Victor Stinner331a6a52019-05-27 16:39:22 +020015320 return _PyStatus_ERR("Can't initialize formatter iter type");
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015321 }
Victor Stinner331a6a52019-05-27 16:39:22 +020015322 return _PyStatus_OK();
Guido van Rossumd57fd912000-03-10 22:53:23 +000015323}
15324
15325/* Finalize the Unicode implementation */
15326
/* Does nothing in this implementation and always returns 0. */
int
PyUnicode_ClearFreeList(void)
{
    return 0;
}
15332
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000015333
Walter Dörwald16807132007-05-25 13:52:07 +000015334void
15335PyUnicode_InternInPlace(PyObject **p)
15336{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020015337 PyObject *s = *p;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015338 PyObject *t;
Victor Stinner4fae54c2011-10-03 02:01:52 +020015339#ifdef Py_DEBUG
15340 assert(s != NULL);
15341 assert(_PyUnicode_CHECK(s));
15342#else
Benjamin Peterson14339b62009-01-31 16:36:08 +000015343 if (s == NULL || !PyUnicode_Check(s))
Victor Stinner4fae54c2011-10-03 02:01:52 +020015344 return;
15345#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +000015346 /* If it's a subclass, we don't really know what putting
15347 it in the interned dict might do. */
15348 if (!PyUnicode_CheckExact(s))
15349 return;
15350 if (PyUnicode_CHECK_INTERNED(s))
15351 return;
15352 if (interned == NULL) {
15353 interned = PyDict_New();
15354 if (interned == NULL) {
15355 PyErr_Clear(); /* Don't leave an exception */
15356 return;
15357 }
15358 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000015359 Py_ALLOW_RECURSION
Berker Peksagced8d4c2016-07-25 04:40:39 +030015360 t = PyDict_SetDefault(interned, s, s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015361 Py_END_ALLOW_RECURSION
Berker Peksagced8d4c2016-07-25 04:40:39 +030015362 if (t == NULL) {
15363 PyErr_Clear();
15364 return;
15365 }
15366 if (t != s) {
Victor Stinnerf0335102013-04-14 19:13:03 +020015367 Py_INCREF(t);
Serhiy Storchaka57a01d32016-04-10 18:05:40 +030015368 Py_SETREF(*p, t);
Victor Stinnerf0335102013-04-14 19:13:03 +020015369 return;
15370 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000015371 /* The two references in interned are not counted by refcnt.
15372 The deallocator will take care of this */
15373 Py_REFCNT(s) -= 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015374 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000015375}
15376
15377void
15378PyUnicode_InternImmortal(PyObject **p)
15379{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015380 PyUnicode_InternInPlace(p);
15381 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Victor Stinneraf9e4b82011-10-23 20:07:00 +020015382 _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015383 Py_INCREF(*p);
15384 }
Walter Dörwald16807132007-05-25 13:52:07 +000015385}
15386
15387PyObject *
15388PyUnicode_InternFromString(const char *cp)
15389{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015390 PyObject *s = PyUnicode_FromString(cp);
15391 if (s == NULL)
15392 return NULL;
15393 PyUnicode_InternInPlace(&s);
15394 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000015395}
15396
Victor Stinnerfecc4f22019-03-19 14:20:29 +010015397
#if defined(WITH_VALGRIND) || defined(__INSURE__)
/* Undo string interning; only compiled for Valgrind or Insure++ builds.

   Every key of the interned dict gets its uncounted references added back
   (one for immortal strings, two for mortal ones) and is flagged as not
   interned; the dict is then cleared and released.  With INTERNED_STATS
   defined, counts and total lengths are written to stderr. */
static void
unicode_release_interned(void)
{
    Py_ssize_t immortal_size = 0, mortal_size = 0;

    if (interned == NULL || !PyDict_Check(interned)) {
        return;
    }
    PyObject *keys = PyDict_Keys(interned);
    if (keys == NULL || !PyList_Check(keys)) {
        PyErr_Clear();
        return;
    }

    /* Since unicode_release_interned() is intended to help a leak
       detector, interned unicode strings are not forcibly deallocated;
       rather, we give them their stolen references back, and then clear
       and DECREF the interned dict. */

    Py_ssize_t count = PyList_GET_SIZE(keys);
#ifdef INTERNED_STATS
    fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
            count);
#endif
    for (Py_ssize_t pos = 0; pos < count; pos++) {
        PyObject *str = PyList_GET_ITEM(keys, pos);
        if (PyUnicode_READY(str) == -1) {
            Py_UNREACHABLE();
        }
        switch (PyUnicode_CHECK_INTERNED(str)) {
        case SSTATE_INTERNED_MORTAL:
            Py_REFCNT(str) += 2;
            mortal_size += PyUnicode_GET_LENGTH(str);
            break;
        case SSTATE_INTERNED_IMMORTAL:
            Py_REFCNT(str) += 1;
            immortal_size += PyUnicode_GET_LENGTH(str);
            break;
        case SSTATE_NOT_INTERNED:
            /* XXX Shouldn't happen */
            break;
        default:
            Py_FatalError("Inconsistent interned string state.");
        }
        _PyUnicode_STATE(str).interned = SSTATE_NOT_INTERNED;
    }
#ifdef INTERNED_STATS
    fprintf(stderr, "total size of all interned strings: "
            "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
            "mortal/immortal\n", mortal_size, immortal_size);
#endif
    Py_DECREF(keys);
    PyDict_Clear(interned);
    Py_CLEAR(interned);
}
#endif
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015457
15458
15459/********************* Unicode Iterator **************************/
15460
15461typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015462 PyObject_HEAD
15463 Py_ssize_t it_index;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015464 PyObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015465} unicodeiterobject;
15466
15467static void
15468unicodeiter_dealloc(unicodeiterobject *it)
15469{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015470 _PyObject_GC_UNTRACK(it);
15471 Py_XDECREF(it->it_seq);
15472 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015473}
15474
15475static int
15476unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
15477{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015478 Py_VISIT(it->it_seq);
15479 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015480}
15481
15482static PyObject *
15483unicodeiter_next(unicodeiterobject *it)
15484{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015485 PyObject *seq, *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015486
Benjamin Peterson14339b62009-01-31 16:36:08 +000015487 assert(it != NULL);
15488 seq = it->it_seq;
15489 if (seq == NULL)
15490 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020015491 assert(_PyUnicode_CHECK(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015492
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015493 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
15494 int kind = PyUnicode_KIND(seq);
15495 void *data = PyUnicode_DATA(seq);
15496 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
15497 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015498 if (item != NULL)
15499 ++it->it_index;
15500 return item;
15501 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015502
Benjamin Peterson14339b62009-01-31 16:36:08 +000015503 it->it_seq = NULL;
Serhiy Storchakafbb1c5e2016-03-30 20:40:02 +030015504 Py_DECREF(seq);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015505 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015506}
15507
15508static PyObject *
Siddhesh Poyarekar55edd0c2018-04-30 00:29:33 +053015509unicodeiter_len(unicodeiterobject *it, PyObject *Py_UNUSED(ignored))
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015510{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015511 Py_ssize_t len = 0;
15512 if (it->it_seq)
Victor Stinnerc4f281e2011-10-11 22:11:42 +020015513 len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015514 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015515}
15516
15517PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
15518
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015519static PyObject *
Siddhesh Poyarekar55edd0c2018-04-30 00:29:33 +053015520unicodeiter_reduce(unicodeiterobject *it, PyObject *Py_UNUSED(ignored))
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015521{
Serhiy Storchakabb86bf42018-12-11 08:28:18 +020015522 _Py_IDENTIFIER(iter);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015523 if (it->it_seq != NULL) {
Serhiy Storchakabb86bf42018-12-11 08:28:18 +020015524 return Py_BuildValue("N(O)n", _PyEval_GetBuiltinId(&PyId_iter),
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015525 it->it_seq, it->it_index);
15526 } else {
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +020015527 PyObject *u = (PyObject *)_PyUnicode_New(0);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015528 if (u == NULL)
15529 return NULL;
Serhiy Storchakabb86bf42018-12-11 08:28:18 +020015530 return Py_BuildValue("N(N)", _PyEval_GetBuiltinId(&PyId_iter), u);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015531 }
15532}
15533
15534PyDoc_STRVAR(reduce_doc, "Return state information for pickling.");
15535
15536static PyObject *
15537unicodeiter_setstate(unicodeiterobject *it, PyObject *state)
15538{
15539 Py_ssize_t index = PyLong_AsSsize_t(state);
15540 if (index == -1 && PyErr_Occurred())
15541 return NULL;
Kristján Valur Jónsson25dded02014-03-05 13:47:57 +000015542 if (it->it_seq != NULL) {
15543 if (index < 0)
15544 index = 0;
15545 else if (index > PyUnicode_GET_LENGTH(it->it_seq))
15546 index = PyUnicode_GET_LENGTH(it->it_seq); /* iterator truncated */
15547 it->it_index = index;
15548 }
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015549 Py_RETURN_NONE;
15550}
15551
15552PyDoc_STRVAR(setstate_doc, "Set state information for unpickling.");
15553
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015554static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015555 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000015556 length_hint_doc},
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015557 {"__reduce__", (PyCFunction)unicodeiter_reduce, METH_NOARGS,
15558 reduce_doc},
15559 {"__setstate__", (PyCFunction)unicodeiter_setstate, METH_O,
15560 setstate_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000015561 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015562};
15563
15564PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015565 PyVarObject_HEAD_INIT(&PyType_Type, 0)
15566 "str_iterator", /* tp_name */
15567 sizeof(unicodeiterobject), /* tp_basicsize */
15568 0, /* tp_itemsize */
15569 /* methods */
15570 (destructor)unicodeiter_dealloc, /* tp_dealloc */
Jeroen Demeyer530f5062019-05-31 04:13:39 +020015571 0, /* tp_vectorcall_offset */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015572 0, /* tp_getattr */
15573 0, /* tp_setattr */
Jeroen Demeyer530f5062019-05-31 04:13:39 +020015574 0, /* tp_as_async */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015575 0, /* tp_repr */
15576 0, /* tp_as_number */
15577 0, /* tp_as_sequence */
15578 0, /* tp_as_mapping */
15579 0, /* tp_hash */
15580 0, /* tp_call */
15581 0, /* tp_str */
15582 PyObject_GenericGetAttr, /* tp_getattro */
15583 0, /* tp_setattro */
15584 0, /* tp_as_buffer */
15585 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
15586 0, /* tp_doc */
15587 (traverseproc)unicodeiter_traverse, /* tp_traverse */
15588 0, /* tp_clear */
15589 0, /* tp_richcompare */
15590 0, /* tp_weaklistoffset */
15591 PyObject_SelfIter, /* tp_iter */
15592 (iternextfunc)unicodeiter_next, /* tp_iternext */
15593 unicodeiter_methods, /* tp_methods */
15594 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015595};
15596
15597static PyObject *
15598unicode_iter(PyObject *seq)
15599{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015600 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015601
Benjamin Peterson14339b62009-01-31 16:36:08 +000015602 if (!PyUnicode_Check(seq)) {
15603 PyErr_BadInternalCall();
15604 return NULL;
15605 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015606 if (PyUnicode_READY(seq) == -1)
15607 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015608 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
15609 if (it == NULL)
15610 return NULL;
15611 it->it_index = 0;
15612 Py_INCREF(seq);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015613 it->it_seq = seq;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015614 _PyObject_GC_TRACK(it);
15615 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015616}
15617
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015618
15619size_t
15620Py_UNICODE_strlen(const Py_UNICODE *u)
15621{
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +020015622 return wcslen(u);
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015623}
15624
15625Py_UNICODE*
15626Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
15627{
15628 Py_UNICODE *u = s1;
15629 while ((*u++ = *s2++));
15630 return s1;
15631}
15632
15633Py_UNICODE*
15634Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
15635{
15636 Py_UNICODE *u = s1;
15637 while ((*u++ = *s2++))
15638 if (n-- == 0)
15639 break;
15640 return s1;
15641}
15642
15643Py_UNICODE*
15644Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
15645{
15646 Py_UNICODE *u1 = s1;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +020015647 u1 += wcslen(u1);
15648 while ((*u1++ = *s2++));
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015649 return s1;
15650}
15651
15652int
15653Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
15654{
15655 while (*s1 && *s2 && *s1 == *s2)
15656 s1++, s2++;
15657 if (*s1 && *s2)
15658 return (*s1 < *s2) ? -1 : +1;
15659 if (*s1)
15660 return 1;
15661 if (*s2)
15662 return -1;
15663 return 0;
15664}
15665
15666int
15667Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
15668{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020015669 Py_UNICODE u1, u2;
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015670 for (; n != 0; n--) {
15671 u1 = *s1;
15672 u2 = *s2;
15673 if (u1 != u2)
15674 return (u1 < u2) ? -1 : +1;
15675 if (u1 == '\0')
15676 return 0;
15677 s1++;
15678 s2++;
15679 }
15680 return 0;
15681}
15682
15683Py_UNICODE*
15684Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
15685{
15686 const Py_UNICODE *p;
15687 for (p = s; *p; p++)
15688 if (*p == c)
15689 return (Py_UNICODE*)p;
15690 return NULL;
15691}
15692
15693Py_UNICODE*
15694Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
15695{
15696 const Py_UNICODE *p;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +020015697 p = s + wcslen(s);
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015698 while (p != s) {
15699 p--;
15700 if (*p == c)
15701 return (Py_UNICODE*)p;
15702 }
15703 return NULL;
15704}
Victor Stinner331ea922010-08-10 16:37:20 +000015705
Victor Stinner71133ff2010-09-01 23:43:53 +000015706Py_UNICODE*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015707PyUnicode_AsUnicodeCopy(PyObject *unicode)
Victor Stinner71133ff2010-09-01 23:43:53 +000015708{
Victor Stinner577db2c2011-10-11 22:12:48 +020015709 Py_UNICODE *u, *copy;
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015710 Py_ssize_t len, size;
Victor Stinner71133ff2010-09-01 23:43:53 +000015711
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015712 if (!PyUnicode_Check(unicode)) {
15713 PyErr_BadArgument();
15714 return NULL;
15715 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015716 u = PyUnicode_AsUnicodeAndSize(unicode, &len);
Victor Stinner577db2c2011-10-11 22:12:48 +020015717 if (u == NULL)
15718 return NULL;
Victor Stinner71133ff2010-09-01 23:43:53 +000015719 /* Ensure we won't overflow the size. */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -070015720 if (len > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
Victor Stinner71133ff2010-09-01 23:43:53 +000015721 PyErr_NoMemory();
15722 return NULL;
15723 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015724 size = len + 1; /* copy the null character */
Victor Stinner71133ff2010-09-01 23:43:53 +000015725 size *= sizeof(Py_UNICODE);
15726 copy = PyMem_Malloc(size);
15727 if (copy == NULL) {
15728 PyErr_NoMemory();
15729 return NULL;
15730 }
Victor Stinner577db2c2011-10-11 22:12:48 +020015731 memcpy(copy, u, size);
Victor Stinner71133ff2010-09-01 23:43:53 +000015732 return copy;
15733}
Martin v. Löwis5b222132007-06-10 09:51:05 +000015734
Victor Stinnerfecc4f22019-03-19 14:20:29 +010015735
Victor Stinner709d23d2019-05-02 14:56:30 -040015736static int
15737encode_wstr_utf8(wchar_t *wstr, char **str, const char *name)
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015738{
Victor Stinner709d23d2019-05-02 14:56:30 -040015739 int res;
15740 res = _Py_EncodeUTF8Ex(wstr, str, NULL, NULL, 1, _Py_ERROR_STRICT);
15741 if (res == -2) {
15742 PyErr_Format(PyExc_RuntimeWarning, "cannot decode %s", name);
15743 return -1;
15744 }
15745 if (res < 0) {
15746 PyErr_NoMemory();
15747 return -1;
15748 }
15749 return 0;
15750}
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015751
Victor Stinner709d23d2019-05-02 14:56:30 -040015752
15753static int
15754config_get_codec_name(wchar_t **config_encoding)
15755{
15756 char *encoding;
15757 if (encode_wstr_utf8(*config_encoding, &encoding, "stdio_encoding") < 0) {
15758 return -1;
15759 }
15760
15761 PyObject *name_obj = NULL;
15762 PyObject *codec = _PyCodec_Lookup(encoding);
15763 PyMem_RawFree(encoding);
15764
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015765 if (!codec)
15766 goto error;
15767
15768 name_obj = PyObject_GetAttrString(codec, "name");
15769 Py_CLEAR(codec);
15770 if (!name_obj) {
15771 goto error;
15772 }
15773
Victor Stinner709d23d2019-05-02 14:56:30 -040015774 wchar_t *wname = PyUnicode_AsWideCharString(name_obj, NULL);
15775 Py_DECREF(name_obj);
15776 if (wname == NULL) {
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015777 goto error;
15778 }
15779
Victor Stinner709d23d2019-05-02 14:56:30 -040015780 wchar_t *raw_wname = _PyMem_RawWcsdup(wname);
15781 if (raw_wname == NULL) {
15782 PyMem_Free(wname);
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015783 PyErr_NoMemory();
Victor Stinner709d23d2019-05-02 14:56:30 -040015784 goto error;
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015785 }
Victor Stinner709d23d2019-05-02 14:56:30 -040015786
15787 PyMem_RawFree(*config_encoding);
15788 *config_encoding = raw_wname;
15789
15790 PyMem_Free(wname);
15791 return 0;
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015792
15793error:
15794 Py_XDECREF(codec);
15795 Py_XDECREF(name_obj);
Victor Stinner709d23d2019-05-02 14:56:30 -040015796 return -1;
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015797}
15798
15799
Victor Stinner331a6a52019-05-27 16:39:22 +020015800static PyStatus
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015801init_stdio_encoding(PyInterpreterState *interp)
15802{
Victor Stinner709d23d2019-05-02 14:56:30 -040015803 /* Update the stdio encoding to the normalized Python codec name. */
Victor Stinner331a6a52019-05-27 16:39:22 +020015804 PyConfig *config = &interp->config;
Victor Stinner709d23d2019-05-02 14:56:30 -040015805 if (config_get_codec_name(&config->stdio_encoding) < 0) {
Victor Stinner331a6a52019-05-27 16:39:22 +020015806 return _PyStatus_ERR("failed to get the Python codec name "
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015807 "of the stdio encoding");
15808 }
Victor Stinner331a6a52019-05-27 16:39:22 +020015809 return _PyStatus_OK();
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015810}
15811
15812
Victor Stinner709d23d2019-05-02 14:56:30 -040015813static int
15814init_fs_codec(PyInterpreterState *interp)
15815{
Victor Stinner331a6a52019-05-27 16:39:22 +020015816 PyConfig *config = &interp->config;
Victor Stinner709d23d2019-05-02 14:56:30 -040015817
15818 _Py_error_handler error_handler;
15819 error_handler = get_error_handler_wide(config->filesystem_errors);
15820 if (error_handler == _Py_ERROR_UNKNOWN) {
15821 PyErr_SetString(PyExc_RuntimeError, "unknow filesystem error handler");
15822 return -1;
15823 }
15824
15825 char *encoding, *errors;
15826 if (encode_wstr_utf8(config->filesystem_encoding,
15827 &encoding,
15828 "filesystem_encoding") < 0) {
15829 return -1;
15830 }
15831
15832 if (encode_wstr_utf8(config->filesystem_errors,
15833 &errors,
15834 "filesystem_errors") < 0) {
15835 PyMem_RawFree(encoding);
15836 return -1;
15837 }
15838
15839 PyMem_RawFree(interp->fs_codec.encoding);
15840 interp->fs_codec.encoding = encoding;
15841 PyMem_RawFree(interp->fs_codec.errors);
15842 interp->fs_codec.errors = errors;
15843 interp->fs_codec.error_handler = error_handler;
15844
15845 /* At this point, PyUnicode_EncodeFSDefault() and
15846 PyUnicode_DecodeFSDefault() can now use the Python codec rather than
15847 the C implementation of the filesystem encoding. */
15848
15849 /* Set Py_FileSystemDefaultEncoding and Py_FileSystemDefaultEncodeErrors
15850 global configuration variables. */
15851 if (_Py_SetFileSystemEncoding(interp->fs_codec.encoding,
15852 interp->fs_codec.errors) < 0) {
15853 PyErr_NoMemory();
15854 return -1;
15855 }
15856 return 0;
15857}
15858
15859
Victor Stinner331a6a52019-05-27 16:39:22 +020015860static PyStatus
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015861init_fs_encoding(PyInterpreterState *interp)
15862{
Victor Stinner709d23d2019-05-02 14:56:30 -040015863 /* Update the filesystem encoding to the normalized Python codec name.
15864 For example, replace "ANSI_X3.4-1968" (locale encoding) with "ascii"
15865 (Python codec name). */
Victor Stinner331a6a52019-05-27 16:39:22 +020015866 PyConfig *config = &interp->config;
Victor Stinner709d23d2019-05-02 14:56:30 -040015867 if (config_get_codec_name(&config->filesystem_encoding) < 0) {
Victor Stinner331a6a52019-05-27 16:39:22 +020015868 return _PyStatus_ERR("failed to get the Python codec "
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015869 "of the filesystem encoding");
15870 }
15871
Victor Stinner709d23d2019-05-02 14:56:30 -040015872 if (init_fs_codec(interp) < 0) {
Victor Stinner331a6a52019-05-27 16:39:22 +020015873 return _PyStatus_ERR("cannot initialize filesystem codec");
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015874 }
Victor Stinner331a6a52019-05-27 16:39:22 +020015875 return _PyStatus_OK();
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015876}
15877
15878
Victor Stinner331a6a52019-05-27 16:39:22 +020015879PyStatus
Victor Stinnerb45d2592019-06-20 00:05:23 +020015880_PyUnicode_InitEncodings(PyThreadState *tstate)
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015881{
Victor Stinnerb45d2592019-06-20 00:05:23 +020015882 PyInterpreterState *interp = tstate->interp;
15883
Victor Stinner331a6a52019-05-27 16:39:22 +020015884 PyStatus status = init_fs_encoding(interp);
15885 if (_PyStatus_EXCEPTION(status)) {
15886 return status;
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015887 }
15888
15889 return init_stdio_encoding(interp);
15890}
15891
15892
Victor Stinner709d23d2019-05-02 14:56:30 -040015893#ifdef MS_WINDOWS
15894int
15895_PyUnicode_EnableLegacyWindowsFSEncoding(void)
15896{
15897 PyInterpreterState *interp = _PyInterpreterState_GET_UNSAFE();
Victor Stinner331a6a52019-05-27 16:39:22 +020015898 PyConfig *config = &interp->config;
Victor Stinner709d23d2019-05-02 14:56:30 -040015899
15900 /* Set the filesystem encoding to mbcs/replace (PEP 529) */
15901 wchar_t *encoding = _PyMem_RawWcsdup(L"mbcs");
15902 wchar_t *errors = _PyMem_RawWcsdup(L"replace");
15903 if (encoding == NULL || errors == NULL) {
15904 PyMem_RawFree(encoding);
15905 PyMem_RawFree(errors);
15906 PyErr_NoMemory();
15907 return -1;
15908 }
15909
15910 PyMem_RawFree(config->filesystem_encoding);
15911 config->filesystem_encoding = encoding;
15912 PyMem_RawFree(config->filesystem_errors);
15913 config->filesystem_errors = errors;
15914
15915 return init_fs_codec(interp);
15916}
15917#endif
15918
15919
Victor Stinnerfecc4f22019-03-19 14:20:29 +010015920void
15921_PyUnicode_Fini(void)
15922{
15923#if defined(WITH_VALGRIND) || defined(__INSURE__)
15924 /* Insure++ is a memory analysis tool that aids in discovering
15925 * memory leaks and other memory problems. On Python exit, the
15926 * interned string dictionaries are flagged as being in use at exit
15927 * (which it is). Under normal circumstances, this is fine because
15928 * the memory will be automatically reclaimed by the system. Under
15929 * memory debugging, it's a huge source of useless noise, so we
15930 * trade off slower shutdown for less distraction in the memory
15931 * reports. -baw
15932 */
15933 unicode_release_interned();
15934#endif /* __INSURE__ */
15935
15936 Py_CLEAR(unicode_empty);
15937
15938 for (Py_ssize_t i = 0; i < 256; i++) {
15939 Py_CLEAR(unicode_latin1[i]);
15940 }
15941 _PyUnicode_ClearStaticStrings();
15942 (void)PyUnicode_ClearFreeList();
Victor Stinner709d23d2019-05-02 14:56:30 -040015943
15944 PyInterpreterState *interp = _PyInterpreterState_GET_UNSAFE();
15945 PyMem_RawFree(interp->fs_codec.encoding);
15946 interp->fs_codec.encoding = NULL;
15947 PyMem_RawFree(interp->fs_codec.errors);
15948 interp->fs_codec.errors = NULL;
Victor Stinnerfecc4f22019-03-19 14:20:29 +010015949}
15950
15951
Georg Brandl66c221e2010-10-14 07:04:07 +000015952/* A _string module, to export formatter_parser and formatter_field_name_split
15953 to the string.Formatter class implemented in Python. */
15954
15955static PyMethodDef _string_methods[] = {
15956 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
15957 METH_O, PyDoc_STR("split the argument as a field name")},
15958 {"formatter_parser", (PyCFunction) formatter_parser,
15959 METH_O, PyDoc_STR("parse the argument as a format string")},
15960 {NULL, NULL}
15961};
15962
15963static struct PyModuleDef _string_module = {
15964 PyModuleDef_HEAD_INIT,
15965 "_string",
15966 PyDoc_STR("string helper module"),
15967 0,
15968 _string_methods,
15969 NULL,
15970 NULL,
15971 NULL,
15972 NULL
15973};
15974
15975PyMODINIT_FUNC
15976PyInit__string(void)
15977{
15978 return PyModule_Create(&_string_module);
15979}
15980
15981
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000015982#ifdef __cplusplus
15983}
15984#endif