blob: 51d314b61a52c4d299b502500ba97be8e5d669e3 [file] [log] [blame]
/*

Unicode implementation based on original code by Fredrik Lundh,
modified by Marc-Andre Lemburg <mal@lemburg.com>.

Major speed upgrades to the method implementations at the Reykjavik
NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.

Copyright (c) Corporation for National Research Initiatives.

--------------------------------------------------------------------
The original string type implementation is:

  Copyright (c) 1999 by Secret Labs AB
  Copyright (c) 1999 by Fredrik Lundh

By obtaining, using, and/or copying this software and/or its
associated documentation, you agree that you have read, understood,
and will comply with the following terms and conditions:

Permission to use, copy, modify, and distribute this software and its
associated documentation for any purpose and without fee is hereby
granted, provided that the above copyright notice appears in all
copies, and that both that copyright notice and this permission notice
appear in supporting documentation, and that the name of Secret Labs
AB or the author not be used in advertising or publicity pertaining to
distribution of the software without specific, written prior
permission.

SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
--------------------------------------------------------------------

*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000040
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Victor Stinner331a6a52019-05-27 16:39:22 +020043#include "pycore_initconfig.h"
Victor Stinner9fc57a32018-11-07 00:44:03 +010044#include "pycore_fileutils.h"
Victor Stinnerbcda8f12018-11-21 22:27:47 +010045#include "pycore_object.h"
Victor Stinner43fc3bb2019-05-02 11:54:20 -040046#include "pycore_pylifecycle.h"
Victor Stinner621cebe2018-11-12 16:53:38 +010047#include "pycore_pystate.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000048#include "ucnhash.h"
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050049#include "bytes_methods.h"
Raymond Hettingerac2ef652015-07-04 16:04:44 -070050#include "stringlib/eq.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000051
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000052#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000053#include <windows.h>
54#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000055
/* Uncomment to display statistics on interned strings at exit when
   using Valgrind or Insure++. */
/* #define INTERNED_STATS 1 */
59
60
Larry Hastings61272b72014-01-07 12:41:53 -080061/*[clinic input]
INADA Naoki15f94592017-01-16 21:49:13 +090062class str "PyObject *" "&PyUnicode_Type"
Larry Hastings61272b72014-01-07 12:41:53 -080063[clinic start generated code]*/
INADA Naoki3ae20562017-01-16 20:41:20 +090064/*[clinic end generated code: output=da39a3ee5e6b4b0d input=4884c934de622cf6]*/
65
66/*[python input]
67class Py_UCS4_converter(CConverter):
68 type = 'Py_UCS4'
69 converter = 'convert_uc'
70
71 def converter_init(self):
72 if self.default is not unspecified:
73 self.c_default = ascii(self.default)
74 if len(self.c_default) > 4 or self.c_default[0] != "'":
75 self.c_default = hex(ord(self.default))
76
77[python start generated code]*/
78/*[python end generated code: output=da39a3ee5e6b4b0d input=88f5dd06cd8e7a61]*/
Larry Hastings44e2eaa2013-11-23 15:37:55 -080079
/* --- Globals ------------------------------------------------------------

NOTE: In the interpreter's initialization phase, some globals are currently
      initialized dynamically as needed. In the process Unicode objects may
      be created before the Unicode type is ready.

*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000087
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000088
89#ifdef __cplusplus
90extern "C" {
91#endif
92
/* Highest code point defined by Unicode (as of Unicode 6.0):
   U+10FFFF, i.e. 1,114,111 in decimal. */
#define MAX_UNICODE 0x10FFFF
95
/* Type check used by the macros in this file.  Release builds only test the
   type; Py_DEBUG builds run _PyUnicode_CheckConsistency() (without the
   content check) instead. */
#ifndef Py_DEBUG
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#else
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
#endif
Victor Stinnerfb5f5f22011-09-28 21:39:49 +0200101
/* Direct access to the `utf8` field of the compact object header. */
#define _PyUnicode_UTF8(op)                             \
    (((PyCompactUnicodeObject*)(op))->utf8)

/* UTF-8 buffer of a ready string.  For a compact ASCII string this is the
   memory located right after the PyASCIIObject header; for every other
   string it is the `utf8` field. */
#define PyUnicode_UTF8(op)                              \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((char*)((PyASCIIObject*)(op) + 1)) :          \
         _PyUnicode_UTF8(op))

/* Direct access to the `utf8_length` field of the compact object header. */
#define _PyUnicode_UTF8_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->utf8_length)

/* Length of the UTF-8 buffer of a ready string.  For a compact ASCII string
   the `length` field is used; otherwise the `utf8_length` field. */
#define PyUnicode_UTF8_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((PyASCIIObject*)(op))->length :               \
         _PyUnicode_UTF8_LENGTH(op))

/* Direct (unchecked) access to individual header fields. */
#define _PyUnicode_WSTR(op)                             \
    (((PyASCIIObject*)(op))->wstr)
#define _PyUnicode_WSTR_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->wstr_length)
#define _PyUnicode_LENGTH(op)                           \
    (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op)                            \
    (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op)                             \
    (((PyASCIIObject *)(op))->hash)

/* Same as reading state.kind / length directly, but assert first that `op`
   passes _PyUnicode_CHECK(). */
#define _PyUnicode_KIND(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->state.kind)
#define _PyUnicode_GET_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->length)

/* Direct access to the `data.any` pointer of a PyUnicodeObject. */
#define _PyUnicode_DATA_ANY(op)                         \
    (((PyUnicodeObject*)(op))->data.any)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200136
/* File-local replacement for the public PyUnicode_READY() macro: evaluates
   to 0 when the string is already ready, otherwise to the result of
   _PyUnicode_Ready(op). */
#undef PyUnicode_READY
#define PyUnicode_READY(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     (PyUnicode_IS_READY(op) ? 0 : _PyUnicode_Ready(op)))
Victor Stinner910337b2011-10-03 03:20:16 +0200143
/* True if the UTF-8 pointer of `op` is the same address as its character
   data, i.e. no separate UTF-8 buffer exists.  Must not be used on compact
   ASCII strings (asserted). */
#define _PyUnicode_SHARE_UTF8(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)),           \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))

/* True if the wstr pointer of `op` is the same address as its character
   data.

   The macro must only refer to its own parameter: an earlier version read
   `_PyUnicode_WSTR(unicode)`, which silently depended on the caller having a
   variable called `unicode` that happened to be the same object. */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
151
/* True when the object has its own UTF-8 memory block: the string is not
   compact ASCII, the utf8 pointer is set, and it does not point at the
   character data (i.e. it is not shared). */
#define _PyUnicode_HAS_UTF8_MEMORY(op)                  \
    ((!PyUnicode_IS_COMPACT_ASCII(op)                   \
      && _PyUnicode_UTF8(op)                            \
      && _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))
158
/* True when the object has its own wstr memory block: the wstr pointer is
   set and either the string is not ready yet, or wstr does not point at the
   character data (i.e. it is not shared). */
#define _PyUnicode_HAS_WSTR_MEMORY(op)                  \
    ((_PyUnicode_WSTR(op) &&                            \
      (!PyUnicode_IS_READY(op) ||                       \
       _PyUnicode_WSTR(op) != PyUnicode_DATA(op))))
165
/* Copy a run of characters while converting each one from `from_type` to
   `to_type` with a plain cast.

   from_type, to_type: valid type names.
   begin, end:         start and one-past-the-end of the source characters
                       (converted to "from_type *").
   to:                 destination buffer (converted to "to_type *"); it
                       receives one element per source character.

   The bulk of the input is handled four characters per iteration; the
   remaining 0-3 characters are copied one at a time. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                        \
        to_type *_dst = (to_type *)(to);                        \
        const from_type *_src = (from_type *)(begin);           \
        const from_type *_stop = (from_type *)(end);            \
        Py_ssize_t _count = (_stop) - (_src);                   \
        const from_type *_blocks_stop =                         \
            _src + _Py_SIZE_ROUND_DOWN(_count, 4);              \
        for (; _src < (_blocks_stop); _src += 4, _dst += 4) {   \
            _dst[0] = (to_type) _src[0];                        \
            _dst[1] = (to_type) _src[1];                        \
            _dst[2] = (to_type) _src[2];                        \
            _dst[3] = (to_type) _src[3];                        \
        }                                                       \
        for (; _src < (_stop); ++_src, ++_dst) {                \
            *_dst = (to_type) *_src;                            \
        }                                                       \
    } while (0)
Victor Stinner829c0ad2011-10-03 01:08:02 +0200189
/* Over-allocation factor.  Per the notes below, 4 corresponds to growing by
   25% and 2 to growing by 50%. */
#ifndef MS_WINDOWS
   /* On Linux, overallocate by 25% is the best factor */
#  define OVERALLOCATE_FACTOR 4
#else
   /* On Windows, overallocate by 50% is the best factor */
#  define OVERALLOCATE_FACTOR 2
#endif
197
Walter Dörwald16807132007-05-25 13:52:07 +0000198/* This dictionary holds all interned unicode strings. Note that references
199 to strings in this dictionary are *not* counted in the string's ob_refcnt.
200 When the interned string reaches a refcnt of 0 the string deallocation
201 function will delete the reference from this dictionary.
202
203 Another way to look at this is that to say that the actual reference
Guido van Rossum98297ee2007-11-06 21:34:58 +0000204 count of a string is: s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000205*/
Serhiy Storchaka05997252013-01-26 12:14:02 +0200206static PyObject *interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +0000207
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000208/* The empty Unicode object is shared to improve performance. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200209static PyObject *unicode_empty = NULL;
Serhiy Storchaka05997252013-01-26 12:14:02 +0200210
/* Take a new reference to the shared empty string, creating it with
   PyUnicode_New(0, 0) on first use.  If the creation fails, unicode_empty
   stays NULL and no reference is taken. */
#define _Py_INCREF_UNICODE_EMPTY()                                  \
    do {                                                            \
        if (unicode_empty == NULL) {                                \
            unicode_empty = PyUnicode_New(0, 0);                    \
            if (unicode_empty != NULL) {                            \
                Py_INCREF(unicode_empty);                           \
                assert(_PyUnicode_CheckConsistency(unicode_empty, 1)); \
            }                                                       \
        }                                                           \
        else {                                                      \
            Py_INCREF(unicode_empty);                               \
        }                                                           \
    } while (0)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000223
/* Return the shared empty string from the enclosing function, after taking a
   reference via _Py_INCREF_UNICODE_EMPTY().  The returned value is NULL if
   the empty string could not be created. */
#define _Py_RETURN_UNICODE_EMPTY()                      \
    do {                                                \
        _Py_INCREF_UNICODE_EMPTY();                     \
        return unicode_empty;                           \
    } while (0)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000229
Victor Stinner59423e32018-11-26 13:40:01 +0100230static inline void
231unicode_fill(enum PyUnicode_Kind kind, void *data, Py_UCS4 value,
232 Py_ssize_t start, Py_ssize_t length)
233{
234 assert(0 <= start);
235 assert(kind != PyUnicode_WCHAR_KIND);
236 switch (kind) {
237 case PyUnicode_1BYTE_KIND: {
Victor Stinner163403a2018-11-27 12:41:17 +0100238 assert(value <= 0xff);
Victor Stinner59423e32018-11-26 13:40:01 +0100239 Py_UCS1 ch = (unsigned char)value;
240 Py_UCS1 *to = (Py_UCS1 *)data + start;
241 memset(to, ch, length);
242 break;
243 }
244 case PyUnicode_2BYTE_KIND: {
Victor Stinner163403a2018-11-27 12:41:17 +0100245 assert(value <= 0xffff);
Victor Stinner59423e32018-11-26 13:40:01 +0100246 Py_UCS2 ch = (Py_UCS2)value;
247 Py_UCS2 *to = (Py_UCS2 *)data + start;
248 const Py_UCS2 *end = to + length;
249 for (; to < end; ++to) *to = ch;
250 break;
251 }
252 case PyUnicode_4BYTE_KIND: {
Victor Stinner163403a2018-11-27 12:41:17 +0100253 assert(value <= MAX_UNICODE);
Victor Stinner59423e32018-11-26 13:40:01 +0100254 Py_UCS4 ch = value;
255 Py_UCS4 * to = (Py_UCS4 *)data + start;
256 const Py_UCS4 *end = to + length;
257 for (; to < end; ++to) *to = ch;
258 break;
259 }
260 default: Py_UNREACHABLE();
261 }
262}
263
264
Victor Stinner8a1a6cf2013-04-14 02:35:33 +0200265/* Forward declaration */
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700266static inline int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +0200267_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch);
Inada Naoki770847a2019-06-24 12:30:24 +0900268static inline void
269_PyUnicodeWriter_InitWithBuffer(_PyUnicodeWriter *writer, PyObject *buffer);
Victor Stinner709d23d2019-05-02 14:56:30 -0400270static PyObject *
271unicode_encode_utf8(PyObject *unicode, _Py_error_handler error_handler,
272 const char *errors);
273static PyObject *
274unicode_decode_utf8(const char *s, Py_ssize_t size,
275 _Py_error_handler error_handler, const char *errors,
276 Py_ssize_t *consumed);
Victor Stinner8a1a6cf2013-04-14 02:35:33 +0200277
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200278/* List of static strings. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200279static _Py_Identifier *static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200280
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000281/* Single character Unicode strings in the Latin-1 range are being
282 shared as well. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200283static PyObject *unicode_latin1[256] = {NULL};
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000284
/* Fast detection of the most frequent whitespace characters: a 128-entry
   table indexed by ASCII code, 1 for whitespace and 0 otherwise. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00-0x0F: 0x09 CHARACTER TABULATION, 0x0A LINE FEED,
                  0x0B LINE TABULATION, 0x0C FORM FEED,
                  0x0D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10-0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
                  0x1E RECORD SEPARATOR, 0x1F UNIT SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20-0x2F: 0x20 SPACE */
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30-0x7F: no whitespace */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
315
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200316/* forward */
Victor Stinnerfe226c02011-10-03 03:52:20 +0200317static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200318static PyObject* get_latin1_char(unsigned char ch);
Victor Stinner488fa492011-12-12 00:01:39 +0100319static int unicode_modifiable(PyObject *unicode);
320
Victor Stinnerfe226c02011-10-03 03:52:20 +0200321
Alexander Belopolsky40018472011-02-26 01:02:56 +0000322static PyObject *
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100323_PyUnicode_FromUCS1(const Py_UCS1 *s, Py_ssize_t size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200324static PyObject *
325_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
326static PyObject *
327_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
328
329static PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +0000330unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000331 PyObject **errorHandler,const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +0100332 PyObject *unicode, PyObject **exceptionObject,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000333 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
334
Alexander Belopolsky40018472011-02-26 01:02:56 +0000335static void
336raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300337 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +0100338 PyObject *unicode,
339 Py_ssize_t startpos, Py_ssize_t endpos,
340 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000341
/* Fast detection of line break characters: a 128-entry table indexed by
   ASCII code, 1 for a line break and 0 otherwise. */
static const unsigned char ascii_linebreak[] = {
    /* 0x00-0x0F: 0x0A LINE FEED, 0x0B LINE TABULATION,
                  0x0C FORM FEED, 0x0D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0,
    /* 0x10-0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
                  0x1E RECORD SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20-0x7F: no line breaks */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
369
INADA Naoki3ae20562017-01-16 20:41:20 +0900370static int convert_uc(PyObject *obj, void *addr);
371
Serhiy Storchaka1009bf12015-04-03 23:53:51 +0300372#include "clinic/unicodeobject.c.h"
373
Victor Stinner3d4226a2018-08-29 22:21:32 +0200374_Py_error_handler
375_Py_GetErrorHandler(const char *errors)
Victor Stinner50149202015-09-22 00:26:54 +0200376{
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200377 if (errors == NULL || strcmp(errors, "strict") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200378 return _Py_ERROR_STRICT;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200379 }
380 if (strcmp(errors, "surrogateescape") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200381 return _Py_ERROR_SURROGATEESCAPE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200382 }
383 if (strcmp(errors, "replace") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200384 return _Py_ERROR_REPLACE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200385 }
386 if (strcmp(errors, "ignore") == 0) {
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200387 return _Py_ERROR_IGNORE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200388 }
389 if (strcmp(errors, "backslashreplace") == 0) {
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200390 return _Py_ERROR_BACKSLASHREPLACE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200391 }
392 if (strcmp(errors, "surrogatepass") == 0) {
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200393 return _Py_ERROR_SURROGATEPASS;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200394 }
395 if (strcmp(errors, "xmlcharrefreplace") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200396 return _Py_ERROR_XMLCHARREFREPLACE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200397 }
Victor Stinner50149202015-09-22 00:26:54 +0200398 return _Py_ERROR_OTHER;
399}
400
Victor Stinner709d23d2019-05-02 14:56:30 -0400401
402static _Py_error_handler
403get_error_handler_wide(const wchar_t *errors)
404{
405 if (errors == NULL || wcscmp(errors, L"strict") == 0) {
406 return _Py_ERROR_STRICT;
407 }
408 if (wcscmp(errors, L"surrogateescape") == 0) {
409 return _Py_ERROR_SURROGATEESCAPE;
410 }
411 if (wcscmp(errors, L"replace") == 0) {
412 return _Py_ERROR_REPLACE;
413 }
414 if (wcscmp(errors, L"ignore") == 0) {
415 return _Py_ERROR_IGNORE;
416 }
417 if (wcscmp(errors, L"backslashreplace") == 0) {
418 return _Py_ERROR_BACKSLASHREPLACE;
419 }
420 if (wcscmp(errors, L"surrogatepass") == 0) {
421 return _Py_ERROR_SURROGATEPASS;
422 }
423 if (wcscmp(errors, L"xmlcharrefreplace") == 0) {
424 return _Py_ERROR_XMLCHARREFREPLACE;
425 }
426 return _Py_ERROR_OTHER;
427}
428
429
Victor Stinner22eb6892019-06-26 00:51:05 +0200430static inline int
431unicode_check_encoding_errors(const char *encoding, const char *errors)
432{
433 if (encoding == NULL && errors == NULL) {
434 return 0;
435 }
436
437 PyInterpreterState *interp = _PyInterpreterState_GET_UNSAFE();
438#ifndef Py_DEBUG
439 /* In release mode, only check in development mode (-X dev) */
440 if (!interp->config.dev_mode) {
441 return 0;
442 }
443#else
444 /* Always check in debug mode */
445#endif
446
447 /* Avoid calling _PyCodec_Lookup() and PyCodec_LookupError() before the
448 codec registry is ready: before_PyUnicode_InitEncodings() is called. */
449 if (!interp->fs_codec.encoding) {
450 return 0;
451 }
452
453 if (encoding != NULL) {
454 PyObject *handler = _PyCodec_Lookup(encoding);
455 if (handler == NULL) {
456 return -1;
457 }
458 Py_DECREF(handler);
459 }
460
461 if (errors != NULL) {
462 PyObject *handler = PyCodec_LookupError(errors);
463 if (handler == NULL) {
464 return -1;
465 }
466 Py_DECREF(handler);
467 }
468 return 0;
469}
470
471
Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300472/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
473 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000474Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000475PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000476{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000477#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000478 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000479#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000480 /* This is actually an illegal character, so it should
481 not be passed to unichr. */
482 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000483#endif
484}
485
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +0200486int
Victor Stinner7931d9a2011-11-04 00:22:48 +0100487_PyUnicode_CheckConsistency(PyObject *op, int check_content)
Victor Stinner910337b2011-10-03 03:20:16 +0200488{
489 PyASCIIObject *ascii;
490 unsigned int kind;
491
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200492 _PyObject_ASSERT(op, PyUnicode_Check(op));
Victor Stinner910337b2011-10-03 03:20:16 +0200493
494 ascii = (PyASCIIObject *)op;
495 kind = ascii->state.kind;
496
Victor Stinnera3b334d2011-10-03 13:53:37 +0200497 if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200498 _PyObject_ASSERT(op, kind == PyUnicode_1BYTE_KIND);
499 _PyObject_ASSERT(op, ascii->state.ready == 1);
Victor Stinner910337b2011-10-03 03:20:16 +0200500 }
Victor Stinnera41463c2011-10-04 01:05:08 +0200501 else {
Victor Stinner85041a52011-10-03 14:42:39 +0200502 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
Victor Stinner7f11ad42011-10-04 00:00:20 +0200503 void *data;
Victor Stinner910337b2011-10-03 03:20:16 +0200504
Victor Stinnera41463c2011-10-04 01:05:08 +0200505 if (ascii->state.compact == 1) {
506 data = compact + 1;
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200507 _PyObject_ASSERT(op, kind == PyUnicode_1BYTE_KIND
508 || kind == PyUnicode_2BYTE_KIND
509 || kind == PyUnicode_4BYTE_KIND);
510 _PyObject_ASSERT(op, ascii->state.ascii == 0);
511 _PyObject_ASSERT(op, ascii->state.ready == 1);
512 _PyObject_ASSERT(op, compact->utf8 != data);
Victor Stinnere30c0a12011-11-04 20:54:05 +0100513 }
514 else {
Victor Stinnera41463c2011-10-04 01:05:08 +0200515 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
516
517 data = unicode->data.any;
518 if (kind == PyUnicode_WCHAR_KIND) {
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200519 _PyObject_ASSERT(op, ascii->length == 0);
520 _PyObject_ASSERT(op, ascii->hash == -1);
521 _PyObject_ASSERT(op, ascii->state.compact == 0);
522 _PyObject_ASSERT(op, ascii->state.ascii == 0);
523 _PyObject_ASSERT(op, ascii->state.ready == 0);
524 _PyObject_ASSERT(op, ascii->state.interned == SSTATE_NOT_INTERNED);
525 _PyObject_ASSERT(op, ascii->wstr != NULL);
526 _PyObject_ASSERT(op, data == NULL);
527 _PyObject_ASSERT(op, compact->utf8 == NULL);
Victor Stinnera41463c2011-10-04 01:05:08 +0200528 }
529 else {
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200530 _PyObject_ASSERT(op, kind == PyUnicode_1BYTE_KIND
531 || kind == PyUnicode_2BYTE_KIND
532 || kind == PyUnicode_4BYTE_KIND);
533 _PyObject_ASSERT(op, ascii->state.compact == 0);
534 _PyObject_ASSERT(op, ascii->state.ready == 1);
535 _PyObject_ASSERT(op, data != NULL);
Victor Stinnera41463c2011-10-04 01:05:08 +0200536 if (ascii->state.ascii) {
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200537 _PyObject_ASSERT(op, compact->utf8 == data);
538 _PyObject_ASSERT(op, compact->utf8_length == ascii->length);
Victor Stinnera41463c2011-10-04 01:05:08 +0200539 }
540 else
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200541 _PyObject_ASSERT(op, compact->utf8 != data);
Victor Stinnera41463c2011-10-04 01:05:08 +0200542 }
543 }
544 if (kind != PyUnicode_WCHAR_KIND) {
Victor Stinner7f11ad42011-10-04 00:00:20 +0200545 if (
546#if SIZEOF_WCHAR_T == 2
547 kind == PyUnicode_2BYTE_KIND
548#else
549 kind == PyUnicode_4BYTE_KIND
550#endif
551 )
Victor Stinnera41463c2011-10-04 01:05:08 +0200552 {
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200553 _PyObject_ASSERT(op, ascii->wstr == data);
554 _PyObject_ASSERT(op, compact->wstr_length == ascii->length);
Victor Stinnera41463c2011-10-04 01:05:08 +0200555 } else
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200556 _PyObject_ASSERT(op, ascii->wstr != data);
Victor Stinner910337b2011-10-03 03:20:16 +0200557 }
Victor Stinnera41463c2011-10-04 01:05:08 +0200558
559 if (compact->utf8 == NULL)
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200560 _PyObject_ASSERT(op, compact->utf8_length == 0);
Victor Stinnera41463c2011-10-04 01:05:08 +0200561 if (ascii->wstr == NULL)
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200562 _PyObject_ASSERT(op, compact->wstr_length == 0);
Victor Stinner910337b2011-10-03 03:20:16 +0200563 }
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200564
565 /* check that the best kind is used: O(n) operation */
566 if (check_content && kind != PyUnicode_WCHAR_KIND) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200567 Py_ssize_t i;
568 Py_UCS4 maxchar = 0;
Victor Stinner718fbf02012-04-26 00:39:37 +0200569 void *data;
570 Py_UCS4 ch;
571
572 data = PyUnicode_DATA(ascii);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200573 for (i=0; i < ascii->length; i++)
574 {
Victor Stinner718fbf02012-04-26 00:39:37 +0200575 ch = PyUnicode_READ(kind, data, i);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200576 if (ch > maxchar)
577 maxchar = ch;
578 }
579 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinner77faf692011-11-20 18:56:05 +0100580 if (ascii->state.ascii == 0) {
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200581 _PyObject_ASSERT(op, maxchar >= 128);
582 _PyObject_ASSERT(op, maxchar <= 255);
Victor Stinner77faf692011-11-20 18:56:05 +0100583 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200584 else
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200585 _PyObject_ASSERT(op, maxchar < 128);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200586 }
Victor Stinner77faf692011-11-20 18:56:05 +0100587 else if (kind == PyUnicode_2BYTE_KIND) {
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200588 _PyObject_ASSERT(op, maxchar >= 0x100);
589 _PyObject_ASSERT(op, maxchar <= 0xFFFF);
Victor Stinner77faf692011-11-20 18:56:05 +0100590 }
591 else {
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200592 _PyObject_ASSERT(op, maxchar >= 0x10000);
593 _PyObject_ASSERT(op, maxchar <= MAX_UNICODE);
Victor Stinner77faf692011-11-20 18:56:05 +0100594 }
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200595 _PyObject_ASSERT(op, PyUnicode_READ(kind, data, ascii->length) == 0);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200596 }
Benjamin Petersonccc51c12011-10-03 19:34:12 -0400597 return 1;
598}
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200599
Victor Stinner910337b2011-10-03 03:20:16 +0200600
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100601static PyObject*
602unicode_result_wchar(PyObject *unicode)
603{
604#ifndef Py_DEBUG
605 Py_ssize_t len;
606
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100607 len = _PyUnicode_WSTR_LENGTH(unicode);
608 if (len == 0) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100609 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200610 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100611 }
612
613 if (len == 1) {
614 wchar_t ch = _PyUnicode_WSTR(unicode)[0];
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100615 if ((Py_UCS4)ch < 256) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100616 PyObject *latin1_char = get_latin1_char((unsigned char)ch);
617 Py_DECREF(unicode);
618 return latin1_char;
619 }
620 }
621
622 if (_PyUnicode_Ready(unicode) < 0) {
Victor Stinneraa771272012-10-04 02:32:58 +0200623 Py_DECREF(unicode);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100624 return NULL;
625 }
626#else
Victor Stinneraa771272012-10-04 02:32:58 +0200627 assert(Py_REFCNT(unicode) == 1);
628
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100629 /* don't make the result ready in debug mode to ensure that the caller
630 makes the string ready before using it */
631 assert(_PyUnicode_CheckConsistency(unicode, 1));
632#endif
633 return unicode;
634}
635
636static PyObject*
637unicode_result_ready(PyObject *unicode)
638{
639 Py_ssize_t length;
640
641 length = PyUnicode_GET_LENGTH(unicode);
642 if (length == 0) {
643 if (unicode != unicode_empty) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100644 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200645 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100646 }
647 return unicode_empty;
648 }
649
650 if (length == 1) {
Victor Stinner69ed0f42013-04-09 21:48:24 +0200651 void *data = PyUnicode_DATA(unicode);
652 int kind = PyUnicode_KIND(unicode);
653 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100654 if (ch < 256) {
655 PyObject *latin1_char = unicode_latin1[ch];
656 if (latin1_char != NULL) {
657 if (unicode != latin1_char) {
658 Py_INCREF(latin1_char);
659 Py_DECREF(unicode);
660 }
661 return latin1_char;
662 }
663 else {
664 assert(_PyUnicode_CheckConsistency(unicode, 1));
665 Py_INCREF(unicode);
666 unicode_latin1[ch] = unicode;
667 return unicode;
668 }
669 }
670 }
671
672 assert(_PyUnicode_CheckConsistency(unicode, 1));
673 return unicode;
674}
675
676static PyObject*
677unicode_result(PyObject *unicode)
678{
679 assert(_PyUnicode_CHECK(unicode));
680 if (PyUnicode_IS_READY(unicode))
681 return unicode_result_ready(unicode);
682 else
683 return unicode_result_wchar(unicode);
684}
685
Victor Stinnerc4b49542011-12-11 22:44:26 +0100686static PyObject*
687unicode_result_unchanged(PyObject *unicode)
688{
689 if (PyUnicode_CheckExact(unicode)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -0500690 if (PyUnicode_READY(unicode) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +0100691 return NULL;
692 Py_INCREF(unicode);
693 return unicode;
694 }
695 else
696 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +0100697 return _PyUnicode_Copy(unicode);
Victor Stinnerc4b49542011-12-11 22:44:26 +0100698}
699
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200700/* Implementation of the "backslashreplace" error handler for 8-bit encodings:
701 ASCII, Latin1, UTF-8, etc. */
702static char*
Victor Stinnerad771582015-10-09 12:38:53 +0200703backslashreplace(_PyBytesWriter *writer, char *str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200704 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
705{
Victor Stinnerad771582015-10-09 12:38:53 +0200706 Py_ssize_t size, i;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200707 Py_UCS4 ch;
708 enum PyUnicode_Kind kind;
709 void *data;
710
711 assert(PyUnicode_IS_READY(unicode));
712 kind = PyUnicode_KIND(unicode);
713 data = PyUnicode_DATA(unicode);
714
715 size = 0;
716 /* determine replacement size */
717 for (i = collstart; i < collend; ++i) {
718 Py_ssize_t incr;
719
720 ch = PyUnicode_READ(kind, data, i);
721 if (ch < 0x100)
722 incr = 2+2;
723 else if (ch < 0x10000)
724 incr = 2+4;
725 else {
726 assert(ch <= MAX_UNICODE);
Victor Stinner3fa36ff2015-10-09 03:37:11 +0200727 incr = 2+8;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200728 }
729 if (size > PY_SSIZE_T_MAX - incr) {
730 PyErr_SetString(PyExc_OverflowError,
731 "encoded result is too long for a Python string");
732 return NULL;
733 }
734 size += incr;
735 }
736
Victor Stinnerad771582015-10-09 12:38:53 +0200737 str = _PyBytesWriter_Prepare(writer, str, size);
738 if (str == NULL)
739 return NULL;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200740
741 /* generate replacement */
742 for (i = collstart; i < collend; ++i) {
743 ch = PyUnicode_READ(kind, data, i);
Victor Stinner797485e2015-10-09 03:17:30 +0200744 *str++ = '\\';
745 if (ch >= 0x00010000) {
746 *str++ = 'U';
747 *str++ = Py_hexdigits[(ch>>28)&0xf];
748 *str++ = Py_hexdigits[(ch>>24)&0xf];
749 *str++ = Py_hexdigits[(ch>>20)&0xf];
750 *str++ = Py_hexdigits[(ch>>16)&0xf];
751 *str++ = Py_hexdigits[(ch>>12)&0xf];
752 *str++ = Py_hexdigits[(ch>>8)&0xf];
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200753 }
Victor Stinner797485e2015-10-09 03:17:30 +0200754 else if (ch >= 0x100) {
755 *str++ = 'u';
756 *str++ = Py_hexdigits[(ch>>12)&0xf];
757 *str++ = Py_hexdigits[(ch>>8)&0xf];
758 }
759 else
760 *str++ = 'x';
761 *str++ = Py_hexdigits[(ch>>4)&0xf];
762 *str++ = Py_hexdigits[ch&0xf];
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200763 }
764 return str;
765}
766
767/* Implementation of the "xmlcharrefreplace" error handler for 8-bit encodings:
768 ASCII, Latin1, UTF-8, etc. */
769static char*
Victor Stinnerad771582015-10-09 12:38:53 +0200770xmlcharrefreplace(_PyBytesWriter *writer, char *str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200771 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
772{
Victor Stinnerad771582015-10-09 12:38:53 +0200773 Py_ssize_t size, i;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200774 Py_UCS4 ch;
775 enum PyUnicode_Kind kind;
776 void *data;
777
778 assert(PyUnicode_IS_READY(unicode));
779 kind = PyUnicode_KIND(unicode);
780 data = PyUnicode_DATA(unicode);
781
782 size = 0;
783 /* determine replacement size */
784 for (i = collstart; i < collend; ++i) {
785 Py_ssize_t incr;
786
787 ch = PyUnicode_READ(kind, data, i);
788 if (ch < 10)
789 incr = 2+1+1;
790 else if (ch < 100)
791 incr = 2+2+1;
792 else if (ch < 1000)
793 incr = 2+3+1;
794 else if (ch < 10000)
795 incr = 2+4+1;
796 else if (ch < 100000)
797 incr = 2+5+1;
798 else if (ch < 1000000)
799 incr = 2+6+1;
800 else {
801 assert(ch <= MAX_UNICODE);
802 incr = 2+7+1;
803 }
804 if (size > PY_SSIZE_T_MAX - incr) {
805 PyErr_SetString(PyExc_OverflowError,
806 "encoded result is too long for a Python string");
807 return NULL;
808 }
809 size += incr;
810 }
811
Victor Stinnerad771582015-10-09 12:38:53 +0200812 str = _PyBytesWriter_Prepare(writer, str, size);
813 if (str == NULL)
814 return NULL;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200815
816 /* generate replacement */
817 for (i = collstart; i < collend; ++i) {
818 str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
819 }
820 return str;
821}
822
Thomas Wouters477c8d52006-05-27 19:21:47 +0000823/* --- Bloom Filters ----------------------------------------------------- */
824
/* stuff to implement simple "bloom filters" for Unicode characters.
   to keep things simple, we use a single bitmask, using the low bits of
   each Unicode character (masked with BLOOM_WIDTH - 1) as the bit index. */
828
829/* the linebreak mask is set up by Unicode_Init below */
830
/* BLOOM_WIDTH is the number of bits used in a mask: the largest of 128, 64
   or 32 that fits in a long. */
#if LONG_BIT >= 128
#define BLOOM_WIDTH 128
#elif LONG_BIT >= 64
#define BLOOM_WIDTH 64
#elif LONG_BIT >= 32
#define BLOOM_WIDTH 32
#else
#error "LONG_BIT is smaller than 32"
#endif

#define BLOOM_MASK unsigned long

static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0;

/* Non-zero if the bit selected by the low bits of `ch` is set in `mask`.
   Both arguments are parenthesized so that expressions such as `a | b` can
   be passed safely. */
#define BLOOM(mask, ch)     ((mask) & (1UL << ((ch) & (BLOOM_WIDTH - 1))))

/* Line break test: table lookup for ch < 128, otherwise the bloom mask is
   used as a cheap pre-filter before Py_UNICODE_ISLINEBREAK(). */
#define BLOOM_LINEBREAK(ch)                                             \
    ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000850
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700851static inline BLOOM_MASK
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200852make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000853{
Victor Stinnera85af502013-04-09 21:53:54 +0200854#define BLOOM_UPDATE(TYPE, MASK, PTR, LEN) \
855 do { \
856 TYPE *data = (TYPE *)PTR; \
857 TYPE *end = data + LEN; \
858 Py_UCS4 ch; \
859 for (; data != end; data++) { \
860 ch = *data; \
861 MASK |= (1UL << (ch & (BLOOM_WIDTH - 1))); \
862 } \
863 break; \
864 } while (0)
865
Thomas Wouters477c8d52006-05-27 19:21:47 +0000866 /* calculate simple bloom-style bitmask for a given unicode string */
867
Antoine Pitrouf068f942010-01-13 14:19:12 +0000868 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000869
870 mask = 0;
Victor Stinnera85af502013-04-09 21:53:54 +0200871 switch (kind) {
872 case PyUnicode_1BYTE_KIND:
873 BLOOM_UPDATE(Py_UCS1, mask, ptr, len);
874 break;
875 case PyUnicode_2BYTE_KIND:
876 BLOOM_UPDATE(Py_UCS2, mask, ptr, len);
877 break;
878 case PyUnicode_4BYTE_KIND:
879 BLOOM_UPDATE(Py_UCS4, mask, ptr, len);
880 break;
881 default:
Barry Warsawb2e57942017-09-14 18:13:16 -0700882 Py_UNREACHABLE();
Victor Stinnera85af502013-04-09 21:53:54 +0200883 }
Thomas Wouters477c8d52006-05-27 19:21:47 +0000884 return mask;
Victor Stinnera85af502013-04-09 21:53:54 +0200885
886#undef BLOOM_UPDATE
Thomas Wouters477c8d52006-05-27 19:21:47 +0000887}
888
Serhiy Storchaka21a663e2016-04-13 15:37:23 +0300889static int
890ensure_unicode(PyObject *obj)
891{
892 if (!PyUnicode_Check(obj)) {
893 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +0200894 "must be str, not %.100s",
895 Py_TYPE(obj)->tp_name);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +0300896 return -1;
897 }
898 return PyUnicode_READY(obj);
899}
900
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200901/* Compilation of templated routines */
902
903#include "stringlib/asciilib.h"
904#include "stringlib/fastsearch.h"
905#include "stringlib/partition.h"
906#include "stringlib/split.h"
907#include "stringlib/count.h"
908#include "stringlib/find.h"
909#include "stringlib/find_max_char.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200910#include "stringlib/undef.h"
911
912#include "stringlib/ucs1lib.h"
913#include "stringlib/fastsearch.h"
914#include "stringlib/partition.h"
915#include "stringlib/split.h"
916#include "stringlib/count.h"
917#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300918#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200919#include "stringlib/find_max_char.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200920#include "stringlib/undef.h"
921
922#include "stringlib/ucs2lib.h"
923#include "stringlib/fastsearch.h"
924#include "stringlib/partition.h"
925#include "stringlib/split.h"
926#include "stringlib/count.h"
927#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300928#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200929#include "stringlib/find_max_char.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200930#include "stringlib/undef.h"
931
932#include "stringlib/ucs4lib.h"
933#include "stringlib/fastsearch.h"
934#include "stringlib/partition.h"
935#include "stringlib/split.h"
936#include "stringlib/count.h"
937#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300938#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200939#include "stringlib/find_max_char.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200940#include "stringlib/undef.h"
941
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200942#include "stringlib/unicodedefs.h"
943#include "stringlib/fastsearch.h"
944#include "stringlib/count.h"
945#include "stringlib/find.h"
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100946#include "stringlib/undef.h"
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200947
Guido van Rossumd57fd912000-03-10 22:53:23 +0000948/* --- Unicode Object ----------------------------------------------------- */
949
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700950static inline Py_ssize_t
951findchar(const void *s, int kind,
952 Py_ssize_t size, Py_UCS4 ch,
953 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200954{
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200955 switch (kind) {
956 case PyUnicode_1BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200957 if ((Py_UCS1) ch != ch)
958 return -1;
959 if (direction > 0)
960 return ucs1lib_find_char((Py_UCS1 *) s, size, (Py_UCS1) ch);
961 else
962 return ucs1lib_rfind_char((Py_UCS1 *) s, size, (Py_UCS1) ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200963 case PyUnicode_2BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200964 if ((Py_UCS2) ch != ch)
965 return -1;
966 if (direction > 0)
967 return ucs2lib_find_char((Py_UCS2 *) s, size, (Py_UCS2) ch);
968 else
969 return ucs2lib_rfind_char((Py_UCS2 *) s, size, (Py_UCS2) ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200970 case PyUnicode_4BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200971 if (direction > 0)
972 return ucs4lib_find_char((Py_UCS4 *) s, size, ch);
973 else
974 return ucs4lib_rfind_char((Py_UCS4 *) s, size, ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200975 default:
Barry Warsawb2e57942017-09-14 18:13:16 -0700976 Py_UNREACHABLE();
Victor Stinner9e7a1bc2011-10-13 00:18:12 +0200977 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200978}
979
Victor Stinnerafffce42012-10-03 23:03:17 +0200980#ifdef Py_DEBUG
Martin Panter6245cb32016-04-15 02:14:19 +0000981/* Fill the data of a Unicode string with invalid characters to detect bugs
Victor Stinnerafffce42012-10-03 23:03:17 +0200982 earlier.
983
984 _PyUnicode_CheckConsistency(str, 1) detects invalid characters, at least for
985 ASCII and UCS-4 strings. U+00FF is invalid in ASCII and U+FFFFFFFF is an
986 invalid character in Unicode 6.0. */
987static void
988unicode_fill_invalid(PyObject *unicode, Py_ssize_t old_length)
989{
990 int kind = PyUnicode_KIND(unicode);
991 Py_UCS1 *data = PyUnicode_1BYTE_DATA(unicode);
992 Py_ssize_t length = _PyUnicode_LENGTH(unicode);
993 if (length <= old_length)
994 return;
995 memset(data + old_length * kind, 0xff, (length - old_length) * kind);
996}
997#endif
998
Victor Stinnerfe226c02011-10-03 03:52:20 +0200999static PyObject*
1000resize_compact(PyObject *unicode, Py_ssize_t length)
1001{
1002 Py_ssize_t char_size;
1003 Py_ssize_t struct_size;
1004 Py_ssize_t new_size;
1005 int share_wstr;
Victor Stinner84def372011-12-11 20:04:56 +01001006 PyObject *new_unicode;
Victor Stinnerafffce42012-10-03 23:03:17 +02001007#ifdef Py_DEBUG
1008 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
1009#endif
1010
Victor Stinner79891572012-05-03 13:43:07 +02001011 assert(unicode_modifiable(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001012 assert(PyUnicode_IS_READY(unicode));
Victor Stinner488fa492011-12-12 00:01:39 +01001013 assert(PyUnicode_IS_COMPACT(unicode));
1014
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001015 char_size = PyUnicode_KIND(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +01001016 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerfe226c02011-10-03 03:52:20 +02001017 struct_size = sizeof(PyASCIIObject);
1018 else
1019 struct_size = sizeof(PyCompactUnicodeObject);
Victor Stinnerc379ead2011-10-03 12:52:27 +02001020 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001021
Victor Stinnerfe226c02011-10-03 03:52:20 +02001022 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
1023 PyErr_NoMemory();
1024 return NULL;
1025 }
1026 new_size = (struct_size + (length + 1) * char_size);
1027
Serhiy Storchaka7aa69082015-12-03 01:02:03 +02001028 if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) {
1029 PyObject_DEL(_PyUnicode_UTF8(unicode));
1030 _PyUnicode_UTF8(unicode) = NULL;
1031 _PyUnicode_UTF8_LENGTH(unicode) = 0;
1032 }
Victor Stinner84def372011-12-11 20:04:56 +01001033 _Py_DEC_REFTOTAL;
1034 _Py_ForgetReference(unicode);
1035
Serhiy Storchaka20b39b22014-09-28 11:27:24 +03001036 new_unicode = (PyObject *)PyObject_REALLOC(unicode, new_size);
Victor Stinner84def372011-12-11 20:04:56 +01001037 if (new_unicode == NULL) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001038 _Py_NewReference(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001039 PyErr_NoMemory();
1040 return NULL;
1041 }
Victor Stinner84def372011-12-11 20:04:56 +01001042 unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001043 _Py_NewReference(unicode);
Victor Stinner84def372011-12-11 20:04:56 +01001044
Victor Stinnerfe226c02011-10-03 03:52:20 +02001045 _PyUnicode_LENGTH(unicode) = length;
Victor Stinnerc379ead2011-10-03 12:52:27 +02001046 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001047 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +01001048 if (!PyUnicode_IS_ASCII(unicode))
Victor Stinnerc379ead2011-10-03 12:52:27 +02001049 _PyUnicode_WSTR_LENGTH(unicode) = length;
1050 }
Victor Stinnerbbbac2e2013-02-07 23:12:46 +01001051 else if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) {
1052 PyObject_DEL(_PyUnicode_WSTR(unicode));
1053 _PyUnicode_WSTR(unicode) = NULL;
Victor Stinner5bc03a62016-01-27 16:56:53 +01001054 if (!PyUnicode_IS_ASCII(unicode))
1055 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Victor Stinnerbbbac2e2013-02-07 23:12:46 +01001056 }
Victor Stinnerafffce42012-10-03 23:03:17 +02001057#ifdef Py_DEBUG
1058 unicode_fill_invalid(unicode, old_length);
1059#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001060 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
1061 length, 0);
Victor Stinner79891572012-05-03 13:43:07 +02001062 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001063 return unicode;
1064}
1065
Alexander Belopolsky40018472011-02-26 01:02:56 +00001066static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001067resize_inplace(PyObject *unicode, Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001068{
Victor Stinner95663112011-10-04 01:03:50 +02001069 wchar_t *wstr;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001070 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001071 assert(!PyUnicode_IS_COMPACT(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001072 assert(Py_REFCNT(unicode) == 1);
Tim Petersced69f82003-09-16 20:30:58 +00001073
Victor Stinnerfe226c02011-10-03 03:52:20 +02001074 if (PyUnicode_IS_READY(unicode)) {
1075 Py_ssize_t char_size;
Victor Stinner1c8d0c72011-10-03 12:11:00 +02001076 int share_wstr, share_utf8;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001077 void *data;
Victor Stinnerafffce42012-10-03 23:03:17 +02001078#ifdef Py_DEBUG
1079 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
1080#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001081
1082 data = _PyUnicode_DATA_ANY(unicode);
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001083 char_size = PyUnicode_KIND(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +02001084 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
1085 share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001086
1087 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
1088 PyErr_NoMemory();
1089 return -1;
1090 }
1091 new_size = (length + 1) * char_size;
1092
Victor Stinner7a9105a2011-12-12 00:13:42 +01001093 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
1094 {
1095 PyObject_DEL(_PyUnicode_UTF8(unicode));
1096 _PyUnicode_UTF8(unicode) = NULL;
1097 _PyUnicode_UTF8_LENGTH(unicode) = 0;
1098 }
1099
Victor Stinnerfe226c02011-10-03 03:52:20 +02001100 data = (PyObject *)PyObject_REALLOC(data, new_size);
1101 if (data == NULL) {
1102 PyErr_NoMemory();
1103 return -1;
1104 }
1105 _PyUnicode_DATA_ANY(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +02001106 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001107 _PyUnicode_WSTR(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +02001108 _PyUnicode_WSTR_LENGTH(unicode) = length;
1109 }
1110 if (share_utf8) {
Victor Stinner1c8d0c72011-10-03 12:11:00 +02001111 _PyUnicode_UTF8(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +02001112 _PyUnicode_UTF8_LENGTH(unicode) = length;
1113 }
Victor Stinnerfe226c02011-10-03 03:52:20 +02001114 _PyUnicode_LENGTH(unicode) = length;
1115 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
Victor Stinnerafffce42012-10-03 23:03:17 +02001116#ifdef Py_DEBUG
1117 unicode_fill_invalid(unicode, old_length);
1118#endif
Victor Stinner95663112011-10-04 01:03:50 +02001119 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001120 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001121 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001122 }
Victor Stinnerfe226c02011-10-03 03:52:20 +02001123 }
Victor Stinner95663112011-10-04 01:03:50 +02001124 assert(_PyUnicode_WSTR(unicode) != NULL);
1125
1126 /* check for integer overflow */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -07001127 if (length > PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) - 1) {
Victor Stinner95663112011-10-04 01:03:50 +02001128 PyErr_NoMemory();
1129 return -1;
1130 }
Victor Stinner7a9105a2011-12-12 00:13:42 +01001131 new_size = sizeof(wchar_t) * (length + 1);
Victor Stinner95663112011-10-04 01:03:50 +02001132 wstr = _PyUnicode_WSTR(unicode);
Victor Stinner7a9105a2011-12-12 00:13:42 +01001133 wstr = PyObject_REALLOC(wstr, new_size);
Victor Stinner95663112011-10-04 01:03:50 +02001134 if (!wstr) {
1135 PyErr_NoMemory();
1136 return -1;
1137 }
1138 _PyUnicode_WSTR(unicode) = wstr;
1139 _PyUnicode_WSTR(unicode)[length] = 0;
1140 _PyUnicode_WSTR_LENGTH(unicode) = length;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001141 assert(_PyUnicode_CheckConsistency(unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001142 return 0;
1143}
1144
Victor Stinnerfe226c02011-10-03 03:52:20 +02001145static PyObject*
1146resize_copy(PyObject *unicode, Py_ssize_t length)
1147{
1148 Py_ssize_t copy_length;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001149 if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001150 PyObject *copy;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001151
Serhiy Storchaka2e58f1a2016-10-09 23:44:48 +03001152 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001153
1154 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
1155 if (copy == NULL)
1156 return NULL;
1157
1158 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
Victor Stinnerd3f08822012-05-29 12:57:52 +02001159 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, copy_length);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001160 return copy;
Victor Stinner8cfcbed2011-10-03 23:19:21 +02001161 }
1162 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001163 PyObject *w;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001164
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001165 w = (PyObject*)_PyUnicode_New(length);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001166 if (w == NULL)
1167 return NULL;
1168 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
1169 copy_length = Py_MIN(copy_length, length);
Christian Heimesf051e432016-09-13 20:22:02 +02001170 memcpy(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
Victor Stinnerc6cf1ba2012-10-23 02:54:47 +02001171 copy_length * sizeof(wchar_t));
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001172 return w;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001173 }
1174}
1175
/* We allocate one more character (not just one byte) to make sure the
   string is U+0000 terminated; some code (e.g. new_identifier)
   relies on that.

   XXX This allocator could further be enhanced by assuring that the
   free list never reduces its size below 1.

*/
1184
Alexander Belopolsky40018472011-02-26 01:02:56 +00001185static PyUnicodeObject *
1186_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001187{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001188 PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001189 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001190
Thomas Wouters477c8d52006-05-27 19:21:47 +00001191 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001192 if (length == 0 && unicode_empty != NULL) {
1193 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001194 return (PyUnicodeObject*)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001195 }
1196
Neal Norwitz3ce5d922008-08-24 07:08:55 +00001197 /* Ensure we won't overflow the size. */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -07001198 if (length > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00001199 return (PyUnicodeObject *)PyErr_NoMemory();
1200 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001201 if (length < 0) {
1202 PyErr_SetString(PyExc_SystemError,
1203 "Negative size passed to _PyUnicode_New");
1204 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001205 }
1206
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001207 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
1208 if (unicode == NULL)
1209 return NULL;
1210 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
Victor Stinner68b674c2013-10-29 19:31:43 +01001211
1212 _PyUnicode_WSTR_LENGTH(unicode) = length;
1213 _PyUnicode_HASH(unicode) = -1;
1214 _PyUnicode_STATE(unicode).interned = 0;
1215 _PyUnicode_STATE(unicode).kind = 0;
1216 _PyUnicode_STATE(unicode).compact = 0;
1217 _PyUnicode_STATE(unicode).ready = 0;
1218 _PyUnicode_STATE(unicode).ascii = 0;
1219 _PyUnicode_DATA_ANY(unicode) = NULL;
1220 _PyUnicode_LENGTH(unicode) = 0;
1221 _PyUnicode_UTF8(unicode) = NULL;
1222 _PyUnicode_UTF8_LENGTH(unicode) = 0;
1223
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001224 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
1225 if (!_PyUnicode_WSTR(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001226 Py_DECREF(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00001227 PyErr_NoMemory();
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001228 return NULL;
Guido van Rossum3c1bb802000-04-27 20:13:50 +00001229 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001230
Jeremy Hyltond8082792003-09-16 19:41:39 +00001231 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +00001232 * the caller fails before initializing str -- unicode_resize()
1233 * reads str[0], and the Keep-Alive optimization can keep memory
1234 * allocated for str alive across a call to unicode_dealloc(unicode).
1235 * We don't want unicode_resize to read uninitialized memory in
1236 * that case.
1237 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001238 _PyUnicode_WSTR(unicode)[0] = 0;
1239 _PyUnicode_WSTR(unicode)[length] = 0;
Victor Stinner68b674c2013-10-29 19:31:43 +01001240
Victor Stinner7931d9a2011-11-04 00:22:48 +01001241 assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001242 return unicode;
1243}
1244
Victor Stinnerf42dc442011-10-02 23:33:16 +02001245static const char*
1246unicode_kind_name(PyObject *unicode)
1247{
Victor Stinner42dfd712011-10-03 14:41:45 +02001248 /* don't check consistency: unicode_kind_name() is called from
1249 _PyUnicode_Dump() */
Victor Stinnerf42dc442011-10-02 23:33:16 +02001250 if (!PyUnicode_IS_COMPACT(unicode))
1251 {
1252 if (!PyUnicode_IS_READY(unicode))
1253 return "wstr";
Benjamin Petersonead6b532011-12-20 17:23:42 -06001254 switch (PyUnicode_KIND(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001255 {
1256 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001257 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001258 return "legacy ascii";
1259 else
1260 return "legacy latin1";
1261 case PyUnicode_2BYTE_KIND:
1262 return "legacy UCS2";
1263 case PyUnicode_4BYTE_KIND:
1264 return "legacy UCS4";
1265 default:
1266 return "<legacy invalid kind>";
1267 }
1268 }
1269 assert(PyUnicode_IS_READY(unicode));
Benjamin Petersonead6b532011-12-20 17:23:42 -06001270 switch (PyUnicode_KIND(unicode)) {
Victor Stinnerf42dc442011-10-02 23:33:16 +02001271 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001272 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001273 return "ascii";
1274 else
Victor Stinnera3b334d2011-10-03 13:53:37 +02001275 return "latin1";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001276 case PyUnicode_2BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001277 return "UCS2";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001278 case PyUnicode_4BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001279 return "UCS4";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001280 default:
1281 return "<invalid compact kind>";
1282 }
1283}
1284
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001285#ifdef Py_DEBUG
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001286/* Functions wrapping macros for use in debugger */
Victor Stinnera42de742018-11-22 10:25:22 +01001287char *_PyUnicode_utf8(void *unicode_raw){
1288 PyObject *unicode = _PyObject_CAST(unicode_raw);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001289 return PyUnicode_UTF8(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001290}
1291
Victor Stinnera42de742018-11-22 10:25:22 +01001292void *_PyUnicode_compact_data(void *unicode_raw) {
1293 PyObject *unicode = _PyObject_CAST(unicode_raw);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001294 return _PyUnicode_COMPACT_DATA(unicode);
1295}
Victor Stinnera42de742018-11-22 10:25:22 +01001296void *_PyUnicode_data(void *unicode_raw) {
1297 PyObject *unicode = _PyObject_CAST(unicode_raw);
Zackery Spytz1a2252e2019-05-06 10:56:51 -06001298 printf("obj %p\n", (void*)unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001299 printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
1300 printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
1301 printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
1302 printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
1303 printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
1304 return PyUnicode_DATA(unicode);
1305}
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001306
1307void
1308_PyUnicode_Dump(PyObject *op)
1309{
1310 PyASCIIObject *ascii = (PyASCIIObject *)op;
Victor Stinnera849a4b2011-10-03 12:12:11 +02001311 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
1312 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
1313 void *data;
Victor Stinner0d60e872011-10-23 19:47:19 +02001314
Victor Stinnera849a4b2011-10-03 12:12:11 +02001315 if (ascii->state.compact)
Victor Stinner0d60e872011-10-23 19:47:19 +02001316 {
1317 if (ascii->state.ascii)
1318 data = (ascii + 1);
1319 else
1320 data = (compact + 1);
1321 }
Victor Stinnera849a4b2011-10-03 12:12:11 +02001322 else
1323 data = unicode->data.any;
Victor Stinner293f3f52014-07-01 08:57:10 +02001324 printf("%s: len=%" PY_FORMAT_SIZE_T "u, ",
1325 unicode_kind_name(op), ascii->length);
Victor Stinner0d60e872011-10-23 19:47:19 +02001326
Victor Stinnera849a4b2011-10-03 12:12:11 +02001327 if (ascii->wstr == data)
1328 printf("shared ");
Zackery Spytz1a2252e2019-05-06 10:56:51 -06001329 printf("wstr=%p", (void *)ascii->wstr);
Victor Stinner0d60e872011-10-23 19:47:19 +02001330
Victor Stinnera3b334d2011-10-03 13:53:37 +02001331 if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
Victor Stinner293f3f52014-07-01 08:57:10 +02001332 printf(" (%" PY_FORMAT_SIZE_T "u), ", compact->wstr_length);
Victor Stinnera849a4b2011-10-03 12:12:11 +02001333 if (!ascii->state.compact && compact->utf8 == unicode->data.any)
1334 printf("shared ");
Victor Stinner293f3f52014-07-01 08:57:10 +02001335 printf("utf8=%p (%" PY_FORMAT_SIZE_T "u)",
Zackery Spytz1a2252e2019-05-06 10:56:51 -06001336 (void *)compact->utf8, compact->utf8_length);
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001337 }
Victor Stinnera849a4b2011-10-03 12:12:11 +02001338 printf(", data=%p\n", data);
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001339}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001340#endif
1341
1342PyObject *
1343PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
1344{
1345 PyObject *obj;
1346 PyCompactUnicodeObject *unicode;
1347 void *data;
Victor Stinner8f825062012-04-27 13:55:39 +02001348 enum PyUnicode_Kind kind;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001349 int is_sharing, is_ascii;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001350 Py_ssize_t char_size;
1351 Py_ssize_t struct_size;
1352
1353 /* Optimization for empty strings */
1354 if (size == 0 && unicode_empty != NULL) {
1355 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001356 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001357 }
1358
Victor Stinner9e9d6892011-10-04 01:02:02 +02001359 is_ascii = 0;
1360 is_sharing = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001361 struct_size = sizeof(PyCompactUnicodeObject);
1362 if (maxchar < 128) {
Victor Stinner8f825062012-04-27 13:55:39 +02001363 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001364 char_size = 1;
1365 is_ascii = 1;
1366 struct_size = sizeof(PyASCIIObject);
1367 }
1368 else if (maxchar < 256) {
Victor Stinner8f825062012-04-27 13:55:39 +02001369 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001370 char_size = 1;
1371 }
1372 else if (maxchar < 65536) {
Victor Stinner8f825062012-04-27 13:55:39 +02001373 kind = PyUnicode_2BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001374 char_size = 2;
1375 if (sizeof(wchar_t) == 2)
1376 is_sharing = 1;
1377 }
1378 else {
Victor Stinnerc9590ad2012-03-04 01:34:37 +01001379 if (maxchar > MAX_UNICODE) {
1380 PyErr_SetString(PyExc_SystemError,
1381 "invalid maximum character passed to PyUnicode_New");
1382 return NULL;
1383 }
Victor Stinner8f825062012-04-27 13:55:39 +02001384 kind = PyUnicode_4BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001385 char_size = 4;
1386 if (sizeof(wchar_t) == 4)
1387 is_sharing = 1;
1388 }
1389
1390 /* Ensure we won't overflow the size. */
1391 if (size < 0) {
1392 PyErr_SetString(PyExc_SystemError,
1393 "Negative size passed to PyUnicode_New");
1394 return NULL;
1395 }
1396 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1397 return PyErr_NoMemory();
1398
1399 /* Duplicated allocation code from _PyObject_New() instead of a call to
1400 * PyObject_New() so we are able to allocate space for the object and
1401 * it's data buffer.
1402 */
1403 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
1404 if (obj == NULL)
1405 return PyErr_NoMemory();
1406 obj = PyObject_INIT(obj, &PyUnicode_Type);
1407 if (obj == NULL)
1408 return NULL;
1409
1410 unicode = (PyCompactUnicodeObject *)obj;
1411 if (is_ascii)
1412 data = ((PyASCIIObject*)obj) + 1;
1413 else
1414 data = unicode + 1;
1415 _PyUnicode_LENGTH(unicode) = size;
1416 _PyUnicode_HASH(unicode) = -1;
1417 _PyUnicode_STATE(unicode).interned = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001418 _PyUnicode_STATE(unicode).kind = kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001419 _PyUnicode_STATE(unicode).compact = 1;
1420 _PyUnicode_STATE(unicode).ready = 1;
1421 _PyUnicode_STATE(unicode).ascii = is_ascii;
1422 if (is_ascii) {
1423 ((char*)data)[size] = 0;
1424 _PyUnicode_WSTR(unicode) = NULL;
1425 }
Victor Stinner8f825062012-04-27 13:55:39 +02001426 else if (kind == PyUnicode_1BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001427 ((char*)data)[size] = 0;
1428 _PyUnicode_WSTR(unicode) = NULL;
1429 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001430 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001431 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001432 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001433 else {
1434 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001435 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001436 if (kind == PyUnicode_2BYTE_KIND)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001437 ((Py_UCS2*)data)[size] = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001438 else /* kind == PyUnicode_4BYTE_KIND */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001439 ((Py_UCS4*)data)[size] = 0;
1440 if (is_sharing) {
1441 _PyUnicode_WSTR_LENGTH(unicode) = size;
1442 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
1443 }
1444 else {
1445 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1446 _PyUnicode_WSTR(unicode) = NULL;
1447 }
1448 }
Victor Stinner8f825062012-04-27 13:55:39 +02001449#ifdef Py_DEBUG
Victor Stinnerafffce42012-10-03 23:03:17 +02001450 unicode_fill_invalid((PyObject*)unicode, 0);
Victor Stinner8f825062012-04-27 13:55:39 +02001451#endif
Victor Stinner7931d9a2011-11-04 00:22:48 +01001452 assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001453 return obj;
1454}
1455
1456#if SIZEOF_WCHAR_T == 2
1457/* Helper function to convert a 16-bits wchar_t representation to UCS4, this
1458 will decode surrogate pairs, the other conversions are implemented as macros
Georg Brandl7597add2011-10-05 16:36:47 +02001459 for efficiency.
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001460
1461 This function assumes that unicode can hold one more code point than wstr
1462 characters for a terminating null character. */
Victor Stinnerc53be962011-10-02 21:33:54 +02001463static void
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001464unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001465 PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001466{
1467 const wchar_t *iter;
1468 Py_UCS4 *ucs4_out;
1469
Victor Stinner910337b2011-10-03 03:20:16 +02001470 assert(unicode != NULL);
1471 assert(_PyUnicode_CHECK(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001472 assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
1473 ucs4_out = PyUnicode_4BYTE_DATA(unicode);
1474
1475 for (iter = begin; iter < end; ) {
1476 assert(ucs4_out < (PyUnicode_4BYTE_DATA(unicode) +
1477 _PyUnicode_GET_LENGTH(unicode)));
Victor Stinner551ac952011-11-29 22:58:13 +01001478 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1479 && (iter+1) < end
1480 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001481 {
Victor Stinner551ac952011-11-29 22:58:13 +01001482 *ucs4_out++ = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001483 iter += 2;
1484 }
1485 else {
1486 *ucs4_out++ = *iter;
1487 iter++;
1488 }
1489 }
1490 assert(ucs4_out == (PyUnicode_4BYTE_DATA(unicode) +
1491 _PyUnicode_GET_LENGTH(unicode)));
1492
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001493}
1494#endif
1495
Victor Stinnercd9950f2011-10-02 00:34:53 +02001496static int
Victor Stinner488fa492011-12-12 00:01:39 +01001497unicode_check_modifiable(PyObject *unicode)
Victor Stinnercd9950f2011-10-02 00:34:53 +02001498{
Victor Stinner488fa492011-12-12 00:01:39 +01001499 if (!unicode_modifiable(unicode)) {
Victor Stinner01698042011-10-04 00:04:26 +02001500 PyErr_SetString(PyExc_SystemError,
Victor Stinner488fa492011-12-12 00:01:39 +01001501 "Cannot modify a string currently used");
Victor Stinnercd9950f2011-10-02 00:34:53 +02001502 return -1;
1503 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02001504 return 0;
1505}
1506
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001507static int
1508_copy_characters(PyObject *to, Py_ssize_t to_start,
1509 PyObject *from, Py_ssize_t from_start,
1510 Py_ssize_t how_many, int check_maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001511{
Victor Stinnera0702ab2011-09-29 14:14:38 +02001512 unsigned int from_kind, to_kind;
1513 void *from_data, *to_data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001514
Victor Stinneree4544c2012-05-09 22:24:08 +02001515 assert(0 <= how_many);
1516 assert(0 <= from_start);
1517 assert(0 <= to_start);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001518 assert(PyUnicode_Check(from));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001519 assert(PyUnicode_IS_READY(from));
Victor Stinneree4544c2012-05-09 22:24:08 +02001520 assert(from_start + how_many <= PyUnicode_GET_LENGTH(from));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001521
Victor Stinnerd3f08822012-05-29 12:57:52 +02001522 assert(PyUnicode_Check(to));
1523 assert(PyUnicode_IS_READY(to));
1524 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1525
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001526 if (how_many == 0)
1527 return 0;
1528
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001529 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001530 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001531 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001532 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001533
Victor Stinnerf1852262012-06-16 16:38:26 +02001534#ifdef Py_DEBUG
1535 if (!check_maxchar
1536 && PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to))
1537 {
1538 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1539 Py_UCS4 ch;
1540 Py_ssize_t i;
1541 for (i=0; i < how_many; i++) {
1542 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1543 assert(ch <= to_maxchar);
1544 }
1545 }
1546#endif
1547
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001548 if (from_kind == to_kind) {
Victor Stinnerf1852262012-06-16 16:38:26 +02001549 if (check_maxchar
1550 && !PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))
1551 {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001552 /* Writing Latin-1 characters into an ASCII string requires to
1553 check that all written characters are pure ASCII */
Victor Stinnerf1852262012-06-16 16:38:26 +02001554 Py_UCS4 max_char;
1555 max_char = ucs1lib_find_max_char(from_data,
1556 (Py_UCS1*)from_data + how_many);
1557 if (max_char >= 128)
1558 return -1;
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001559 }
Christian Heimesf051e432016-09-13 20:22:02 +02001560 memcpy((char*)to_data + to_kind * to_start,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001561 (char*)from_data + from_kind * from_start,
1562 to_kind * how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001563 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001564 else if (from_kind == PyUnicode_1BYTE_KIND
1565 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001566 {
1567 _PyUnicode_CONVERT_BYTES(
1568 Py_UCS1, Py_UCS2,
1569 PyUnicode_1BYTE_DATA(from) + from_start,
1570 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1571 PyUnicode_2BYTE_DATA(to) + to_start
1572 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001573 }
Victor Stinner157f83f2011-09-28 21:41:31 +02001574 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001575 && to_kind == PyUnicode_4BYTE_KIND)
1576 {
1577 _PyUnicode_CONVERT_BYTES(
1578 Py_UCS1, Py_UCS4,
1579 PyUnicode_1BYTE_DATA(from) + from_start,
1580 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1581 PyUnicode_4BYTE_DATA(to) + to_start
1582 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001583 }
1584 else if (from_kind == PyUnicode_2BYTE_KIND
1585 && to_kind == PyUnicode_4BYTE_KIND)
1586 {
1587 _PyUnicode_CONVERT_BYTES(
1588 Py_UCS2, Py_UCS4,
1589 PyUnicode_2BYTE_DATA(from) + from_start,
1590 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1591 PyUnicode_4BYTE_DATA(to) + to_start
1592 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001593 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001594 else {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001595 assert (PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to));
1596
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001597 if (!check_maxchar) {
1598 if (from_kind == PyUnicode_2BYTE_KIND
1599 && to_kind == PyUnicode_1BYTE_KIND)
1600 {
1601 _PyUnicode_CONVERT_BYTES(
1602 Py_UCS2, Py_UCS1,
1603 PyUnicode_2BYTE_DATA(from) + from_start,
1604 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1605 PyUnicode_1BYTE_DATA(to) + to_start
1606 );
1607 }
1608 else if (from_kind == PyUnicode_4BYTE_KIND
1609 && to_kind == PyUnicode_1BYTE_KIND)
1610 {
1611 _PyUnicode_CONVERT_BYTES(
1612 Py_UCS4, Py_UCS1,
1613 PyUnicode_4BYTE_DATA(from) + from_start,
1614 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1615 PyUnicode_1BYTE_DATA(to) + to_start
1616 );
1617 }
1618 else if (from_kind == PyUnicode_4BYTE_KIND
1619 && to_kind == PyUnicode_2BYTE_KIND)
1620 {
1621 _PyUnicode_CONVERT_BYTES(
1622 Py_UCS4, Py_UCS2,
1623 PyUnicode_4BYTE_DATA(from) + from_start,
1624 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1625 PyUnicode_2BYTE_DATA(to) + to_start
1626 );
1627 }
1628 else {
Barry Warsawb2e57942017-09-14 18:13:16 -07001629 Py_UNREACHABLE();
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001630 }
1631 }
Victor Stinnerf1852262012-06-16 16:38:26 +02001632 else {
Victor Stinnera0702ab2011-09-29 14:14:38 +02001633 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001634 Py_UCS4 ch;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001635 Py_ssize_t i;
1636
Victor Stinnera0702ab2011-09-29 14:14:38 +02001637 for (i=0; i < how_many; i++) {
1638 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001639 if (ch > to_maxchar)
1640 return -1;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001641 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1642 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001643 }
1644 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001645 return 0;
1646}
1647
Victor Stinnerd3f08822012-05-29 12:57:52 +02001648void
1649_PyUnicode_FastCopyCharacters(
1650 PyObject *to, Py_ssize_t to_start,
1651 PyObject *from, Py_ssize_t from_start, Py_ssize_t how_many)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001652{
1653 (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1654}
1655
1656Py_ssize_t
1657PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1658 PyObject *from, Py_ssize_t from_start,
1659 Py_ssize_t how_many)
1660{
1661 int err;
1662
1663 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1664 PyErr_BadInternalCall();
1665 return -1;
1666 }
1667
Benjamin Petersonbac79492012-01-14 13:34:47 -05001668 if (PyUnicode_READY(from) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001669 return -1;
Benjamin Petersonbac79492012-01-14 13:34:47 -05001670 if (PyUnicode_READY(to) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001671 return -1;
1672
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001673 if ((size_t)from_start > (size_t)PyUnicode_GET_LENGTH(from)) {
Victor Stinnerd3f08822012-05-29 12:57:52 +02001674 PyErr_SetString(PyExc_IndexError, "string index out of range");
1675 return -1;
1676 }
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001677 if ((size_t)to_start > (size_t)PyUnicode_GET_LENGTH(to)) {
Victor Stinnerd3f08822012-05-29 12:57:52 +02001678 PyErr_SetString(PyExc_IndexError, "string index out of range");
1679 return -1;
1680 }
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001681 if (how_many < 0) {
1682 PyErr_SetString(PyExc_SystemError, "how_many cannot be negative");
1683 return -1;
1684 }
1685 how_many = Py_MIN(PyUnicode_GET_LENGTH(from)-from_start, how_many);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001686 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1687 PyErr_Format(PyExc_SystemError,
Victor Stinnera33bce02014-07-04 22:47:46 +02001688 "Cannot write %zi characters at %zi "
1689 "in a string of %zi characters",
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001690 how_many, to_start, PyUnicode_GET_LENGTH(to));
1691 return -1;
1692 }
1693
1694 if (how_many == 0)
1695 return 0;
1696
Victor Stinner488fa492011-12-12 00:01:39 +01001697 if (unicode_check_modifiable(to))
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001698 return -1;
1699
1700 err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1701 if (err) {
1702 PyErr_Format(PyExc_SystemError,
1703 "Cannot copy %s characters "
1704 "into a string of %s characters",
1705 unicode_kind_name(from),
1706 unicode_kind_name(to));
1707 return -1;
1708 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001709 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001710}
1711
Victor Stinner17222162011-09-28 22:15:37 +02001712/* Find the maximum code point and count the number of surrogate pairs so a
1713 correct string length can be computed before converting a string to UCS4.
1714 This function counts single surrogates as a character and not as a pair.
1715
1716 Return 0 on success, or -1 on error. */
1717static int
1718find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1719 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001720{
1721 const wchar_t *iter;
Victor Stinner8faf8212011-12-08 22:14:11 +01001722 Py_UCS4 ch;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001723
Victor Stinnerc53be962011-10-02 21:33:54 +02001724 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001725 *num_surrogates = 0;
1726 *maxchar = 0;
1727
1728 for (iter = begin; iter < end; ) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001729#if SIZEOF_WCHAR_T == 2
Victor Stinnercf77da92013-03-06 01:09:24 +01001730 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1731 && (iter+1) < end
1732 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
1733 {
1734 ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
1735 ++(*num_surrogates);
1736 iter += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001737 }
1738 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001739#endif
Victor Stinner8faf8212011-12-08 22:14:11 +01001740 {
1741 ch = *iter;
1742 iter++;
1743 }
1744 if (ch > *maxchar) {
1745 *maxchar = ch;
1746 if (*maxchar > MAX_UNICODE) {
1747 PyErr_Format(PyExc_ValueError,
1748 "character U+%x is not in range [U+0000; U+10ffff]",
1749 ch);
1750 return -1;
1751 }
1752 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001753 }
1754 return 0;
1755}
1756
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001757int
1758_PyUnicode_Ready(PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001759{
1760 wchar_t *end;
1761 Py_UCS4 maxchar = 0;
1762 Py_ssize_t num_surrogates;
1763#if SIZEOF_WCHAR_T == 2
1764 Py_ssize_t length_wo_surrogates;
1765#endif
1766
Georg Brandl7597add2011-10-05 16:36:47 +02001767 /* _PyUnicode_Ready() is only intended for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +02001768 strings were created using _PyObject_New() and where no canonical
1769 representation (the str field) has been set yet aka strings
1770 which are not yet ready. */
Victor Stinner910337b2011-10-03 03:20:16 +02001771 assert(_PyUnicode_CHECK(unicode));
1772 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001773 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerc3c74152011-10-02 20:39:55 +02001774 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001775 assert(_PyUnicode_UTF8(unicode) == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +02001776 /* Actually, it should neither be interned nor be anything else: */
1777 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001778
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001779 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +02001780 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +02001781 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001782 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001783
1784 if (maxchar < 256) {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001785 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1786 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001787 PyErr_NoMemory();
1788 return -1;
1789 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001790 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001791 _PyUnicode_WSTR(unicode), end,
1792 PyUnicode_1BYTE_DATA(unicode));
1793 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1794 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1795 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1796 if (maxchar < 128) {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001797 _PyUnicode_STATE(unicode).ascii = 1;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001798 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001799 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001800 }
1801 else {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001802 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001803 _PyUnicode_UTF8(unicode) = NULL;
1804 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001805 }
1806 PyObject_FREE(_PyUnicode_WSTR(unicode));
1807 _PyUnicode_WSTR(unicode) = NULL;
1808 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1809 }
1810 /* In this case we might have to convert down from 4-byte native
1811 wchar_t to 2-byte unicode. */
1812 else if (maxchar < 65536) {
1813 assert(num_surrogates == 0 &&
1814 "FindMaxCharAndNumSurrogatePairs() messed up");
1815
Victor Stinner506f5922011-09-28 22:34:18 +02001816#if SIZEOF_WCHAR_T == 2
1817 /* We can share representations and are done. */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001818 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Victor Stinner506f5922011-09-28 22:34:18 +02001819 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1820 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1821 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001822 _PyUnicode_UTF8(unicode) = NULL;
1823 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001824#else
1825 /* sizeof(wchar_t) == 4 */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001826 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
Victor Stinner506f5922011-09-28 22:34:18 +02001827 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
Victor Stinnerc3c74152011-10-02 20:39:55 +02001828 if (!_PyUnicode_DATA_ANY(unicode)) {
Victor Stinner506f5922011-09-28 22:34:18 +02001829 PyErr_NoMemory();
1830 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001831 }
Victor Stinner506f5922011-09-28 22:34:18 +02001832 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1833 _PyUnicode_WSTR(unicode), end,
1834 PyUnicode_2BYTE_DATA(unicode));
1835 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1836 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1837 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001838 _PyUnicode_UTF8(unicode) = NULL;
1839 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001840 PyObject_FREE(_PyUnicode_WSTR(unicode));
1841 _PyUnicode_WSTR(unicode) = NULL;
1842 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1843#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001844 }
1845 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
1846 else {
1847#if SIZEOF_WCHAR_T == 2
1848 /* in case the native representation is 2-bytes, we need to allocate a
1849 new normalized 4-byte version. */
1850 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
Serhiy Storchakae55181f2015-02-20 21:34:06 +02001851 if (length_wo_surrogates > PY_SSIZE_T_MAX / 4 - 1) {
1852 PyErr_NoMemory();
1853 return -1;
1854 }
Victor Stinnerc3c74152011-10-02 20:39:55 +02001855 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1856 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001857 PyErr_NoMemory();
1858 return -1;
1859 }
1860 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1861 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001862 _PyUnicode_UTF8(unicode) = NULL;
1863 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner126c5592011-10-03 04:17:10 +02001864 /* unicode_convert_wchar_to_ucs4() requires a ready string */
1865 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerc53be962011-10-02 21:33:54 +02001866 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001867 PyObject_FREE(_PyUnicode_WSTR(unicode));
1868 _PyUnicode_WSTR(unicode) = NULL;
1869 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1870#else
1871 assert(num_surrogates == 0);
1872
Victor Stinnerc3c74152011-10-02 20:39:55 +02001873 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001874 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001875 _PyUnicode_UTF8(unicode) = NULL;
1876 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001877 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1878#endif
1879 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1880 }
1881 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001882 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001883 return 0;
1884}
1885
Alexander Belopolsky40018472011-02-26 01:02:56 +00001886static void
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001887unicode_dealloc(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001888{
Walter Dörwald16807132007-05-25 13:52:07 +00001889 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001890 case SSTATE_NOT_INTERNED:
1891 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001892
Benjamin Peterson29060642009-01-31 22:14:21 +00001893 case SSTATE_INTERNED_MORTAL:
1894 /* revive dead object temporarily for DelItem */
1895 Py_REFCNT(unicode) = 3;
Victor Stinner7931d9a2011-11-04 00:22:48 +01001896 if (PyDict_DelItem(interned, unicode) != 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00001897 Py_FatalError(
1898 "deletion of interned string failed");
1899 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001900
Benjamin Peterson29060642009-01-31 22:14:21 +00001901 case SSTATE_INTERNED_IMMORTAL:
1902 Py_FatalError("Immortal interned string died.");
Stefan Krahf432a322017-08-21 13:09:59 +02001903 /* fall through */
Walter Dörwald16807132007-05-25 13:52:07 +00001904
Benjamin Peterson29060642009-01-31 22:14:21 +00001905 default:
1906 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +00001907 }
1908
Victor Stinner03490912011-10-03 23:45:12 +02001909 if (_PyUnicode_HAS_WSTR_MEMORY(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001910 PyObject_DEL(_PyUnicode_WSTR(unicode));
Victor Stinner829c0ad2011-10-03 01:08:02 +02001911 if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001912 PyObject_DEL(_PyUnicode_UTF8(unicode));
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001913 if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode))
1914 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001915
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001916 Py_TYPE(unicode)->tp_free(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001917}
1918
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001919#ifdef Py_DEBUG
1920static int
1921unicode_is_singleton(PyObject *unicode)
1922{
1923 PyASCIIObject *ascii = (PyASCIIObject *)unicode;
1924 if (unicode == unicode_empty)
1925 return 1;
1926 if (ascii->state.kind != PyUnicode_WCHAR_KIND && ascii->length == 1)
1927 {
1928 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0);
1929 if (ch < 256 && unicode_latin1[ch] == unicode)
1930 return 1;
1931 }
1932 return 0;
1933}
1934#endif
1935
Alexander Belopolsky40018472011-02-26 01:02:56 +00001936static int
Victor Stinner488fa492011-12-12 00:01:39 +01001937unicode_modifiable(PyObject *unicode)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001938{
Victor Stinner488fa492011-12-12 00:01:39 +01001939 assert(_PyUnicode_CHECK(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001940 if (Py_REFCNT(unicode) != 1)
1941 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001942 if (_PyUnicode_HASH(unicode) != -1)
1943 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001944 if (PyUnicode_CHECK_INTERNED(unicode))
1945 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001946 if (!PyUnicode_CheckExact(unicode))
1947 return 0;
Victor Stinner77bb47b2011-10-03 20:06:05 +02001948#ifdef Py_DEBUG
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001949 /* singleton refcount is greater than 1 */
1950 assert(!unicode_is_singleton(unicode));
Victor Stinner77bb47b2011-10-03 20:06:05 +02001951#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001952 return 1;
1953}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001954
Victor Stinnerfe226c02011-10-03 03:52:20 +02001955static int
1956unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1957{
1958 PyObject *unicode;
1959 Py_ssize_t old_length;
1960
1961 assert(p_unicode != NULL);
1962 unicode = *p_unicode;
1963
1964 assert(unicode != NULL);
1965 assert(PyUnicode_Check(unicode));
1966 assert(0 <= length);
1967
Victor Stinner910337b2011-10-03 03:20:16 +02001968 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001969 old_length = PyUnicode_WSTR_LENGTH(unicode);
1970 else
1971 old_length = PyUnicode_GET_LENGTH(unicode);
1972 if (old_length == length)
1973 return 0;
1974
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001975 if (length == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +02001976 _Py_INCREF_UNICODE_EMPTY();
1977 if (!unicode_empty)
Benjamin Peterson29060642009-01-31 22:14:21 +00001978 return -1;
Serhiy Storchaka57a01d32016-04-10 18:05:40 +03001979 Py_SETREF(*p_unicode, unicode_empty);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001980 return 0;
1981 }
1982
Victor Stinner488fa492011-12-12 00:01:39 +01001983 if (!unicode_modifiable(unicode)) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001984 PyObject *copy = resize_copy(unicode, length);
1985 if (copy == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001986 return -1;
Serhiy Storchaka57a01d32016-04-10 18:05:40 +03001987 Py_SETREF(*p_unicode, copy);
Benjamin Peterson29060642009-01-31 22:14:21 +00001988 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001989 }
1990
Victor Stinnerfe226c02011-10-03 03:52:20 +02001991 if (PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001992 PyObject *new_unicode = resize_compact(unicode, length);
1993 if (new_unicode == NULL)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001994 return -1;
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001995 *p_unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001996 return 0;
Benjamin Peterson4bfce8f2011-10-03 19:35:07 -04001997 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001998 return resize_inplace(unicode, length);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001999}
2000
Alexander Belopolsky40018472011-02-26 01:02:56 +00002001int
Victor Stinnerfe226c02011-10-03 03:52:20 +02002002PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00002003{
Victor Stinnerfe226c02011-10-03 03:52:20 +02002004 PyObject *unicode;
2005 if (p_unicode == NULL) {
2006 PyErr_BadInternalCall();
2007 return -1;
2008 }
2009 unicode = *p_unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01002010 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
Victor Stinnerfe226c02011-10-03 03:52:20 +02002011 {
2012 PyErr_BadInternalCall();
2013 return -1;
2014 }
2015 return unicode_resize(p_unicode, length);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00002016}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002017
Serhiy Storchakad65c9492015-11-02 14:10:23 +02002018/* Copy an ASCII or latin1 char* string into a Python Unicode string.
Victor Stinnerc5166102012-02-22 13:55:02 +01002019
Victor Stinnerb429d3b2012-02-22 21:22:20 +01002020 WARNING: The function doesn't copy the terminating null character and
2021 doesn't check the maximum character (may write a latin1 character in an
2022 ASCII string). */
Victor Stinner184252a2012-06-16 02:57:41 +02002023static void
2024unicode_write_cstr(PyObject *unicode, Py_ssize_t index,
2025 const char *str, Py_ssize_t len)
Victor Stinnerc5166102012-02-22 13:55:02 +01002026{
2027 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
2028 void *data = PyUnicode_DATA(unicode);
Victor Stinner184252a2012-06-16 02:57:41 +02002029 const char *end = str + len;
Victor Stinnerc5166102012-02-22 13:55:02 +01002030
2031 switch (kind) {
2032 case PyUnicode_1BYTE_KIND: {
Victor Stinnerc5166102012-02-22 13:55:02 +01002033 assert(index + len <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner8c6db452012-10-06 00:40:45 +02002034#ifdef Py_DEBUG
2035 if (PyUnicode_IS_ASCII(unicode)) {
2036 Py_UCS4 maxchar = ucs1lib_find_max_char(
2037 (const Py_UCS1*)str,
2038 (const Py_UCS1*)str + len);
2039 assert(maxchar < 128);
2040 }
2041#endif
Antoine Pitrouba6bafc2012-02-22 16:41:50 +01002042 memcpy((char *) data + index, str, len);
Victor Stinner184252a2012-06-16 02:57:41 +02002043 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01002044 }
2045 case PyUnicode_2BYTE_KIND: {
2046 Py_UCS2 *start = (Py_UCS2 *)data + index;
2047 Py_UCS2 *ucs2 = start;
2048 assert(index <= PyUnicode_GET_LENGTH(unicode));
2049
Victor Stinner184252a2012-06-16 02:57:41 +02002050 for (; str < end; ++ucs2, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01002051 *ucs2 = (Py_UCS2)*str;
2052
2053 assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner184252a2012-06-16 02:57:41 +02002054 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01002055 }
2056 default: {
2057 Py_UCS4 *start = (Py_UCS4 *)data + index;
2058 Py_UCS4 *ucs4 = start;
2059 assert(kind == PyUnicode_4BYTE_KIND);
2060 assert(index <= PyUnicode_GET_LENGTH(unicode));
2061
Victor Stinner184252a2012-06-16 02:57:41 +02002062 for (; str < end; ++ucs4, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01002063 *ucs4 = (Py_UCS4)*str;
2064
2065 assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinnerc5166102012-02-22 13:55:02 +01002066 }
2067 }
2068}
2069
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002070static PyObject*
2071get_latin1_char(unsigned char ch)
2072{
Victor Stinnera464fc12011-10-02 20:39:30 +02002073 PyObject *unicode = unicode_latin1[ch];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002074 if (!unicode) {
Victor Stinnera464fc12011-10-02 20:39:30 +02002075 unicode = PyUnicode_New(1, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002076 if (!unicode)
2077 return NULL;
2078 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002079 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002080 unicode_latin1[ch] = unicode;
2081 }
2082 Py_INCREF(unicode);
Victor Stinnera464fc12011-10-02 20:39:30 +02002083 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002084}
2085
Victor Stinner985a82a2014-01-03 12:53:47 +01002086static PyObject*
2087unicode_char(Py_UCS4 ch)
2088{
2089 PyObject *unicode;
2090
2091 assert(ch <= MAX_UNICODE);
2092
Victor Stinnerf3b46b42014-01-03 13:16:00 +01002093 if (ch < 256)
2094 return get_latin1_char(ch);
2095
Victor Stinner985a82a2014-01-03 12:53:47 +01002096 unicode = PyUnicode_New(1, ch);
2097 if (unicode == NULL)
2098 return NULL;
Serhiy Storchaka2e58f1a2016-10-09 23:44:48 +03002099
2100 assert(PyUnicode_KIND(unicode) != PyUnicode_1BYTE_KIND);
2101 if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
Victor Stinner985a82a2014-01-03 12:53:47 +01002102 PyUnicode_2BYTE_DATA(unicode)[0] = (Py_UCS2)ch;
Serhiy Storchaka2e58f1a2016-10-09 23:44:48 +03002103 } else {
Victor Stinner985a82a2014-01-03 12:53:47 +01002104 assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
2105 PyUnicode_4BYTE_DATA(unicode)[0] = ch;
2106 }
2107 assert(_PyUnicode_CheckConsistency(unicode, 1));
2108 return unicode;
2109}
2110
Alexander Belopolsky40018472011-02-26 01:02:56 +00002111PyObject *
2112PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002113{
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02002114 if (u == NULL)
2115 return (PyObject*)_PyUnicode_New(size);
2116
2117 if (size < 0) {
2118 PyErr_BadInternalCall();
2119 return NULL;
2120 }
2121
2122 return PyUnicode_FromWideChar(u, size);
2123}
2124
2125PyObject *
2126PyUnicode_FromWideChar(const wchar_t *u, Py_ssize_t size)
2127{
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002128 PyObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002129 Py_UCS4 maxchar = 0;
2130 Py_ssize_t num_surrogates;
2131
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02002132 if (u == NULL && size != 0) {
2133 PyErr_BadInternalCall();
2134 return NULL;
2135 }
2136
2137 if (size == -1) {
2138 size = wcslen(u);
2139 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002140
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002141 /* If the Unicode data is known at construction time, we can apply
2142 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002143
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002144 /* Optimization for empty strings */
Serhiy Storchaka678db842013-01-26 12:16:36 +02002145 if (size == 0)
2146 _Py_RETURN_UNICODE_EMPTY();
Tim Petersced69f82003-09-16 20:30:58 +00002147
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002148 /* Single character Unicode objects in the Latin-1 range are
2149 shared when using this constructor */
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002150 if (size == 1 && (Py_UCS4)*u < 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002151 return get_latin1_char((unsigned char)*u);
2152
2153 /* If not empty and not single character, copy the Unicode data
2154 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02002155 if (find_maxchar_surrogates(u, u + size,
2156 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002157 return NULL;
2158
Victor Stinner8faf8212011-12-08 22:14:11 +01002159 unicode = PyUnicode_New(size - num_surrogates, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002160 if (!unicode)
2161 return NULL;
2162
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002163 switch (PyUnicode_KIND(unicode)) {
2164 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02002165 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002166 u, u + size, PyUnicode_1BYTE_DATA(unicode));
2167 break;
2168 case PyUnicode_2BYTE_KIND:
2169#if Py_UNICODE_SIZE == 2
Christian Heimesf051e432016-09-13 20:22:02 +02002170 memcpy(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002171#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02002172 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002173 u, u + size, PyUnicode_2BYTE_DATA(unicode));
2174#endif
2175 break;
2176 case PyUnicode_4BYTE_KIND:
2177#if SIZEOF_WCHAR_T == 2
2178 /* This is the only case which has to process surrogates, thus
2179 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02002180 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002181#else
2182 assert(num_surrogates == 0);
Christian Heimesf051e432016-09-13 20:22:02 +02002183 memcpy(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002184#endif
2185 break;
2186 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07002187 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002188 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002189
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002190 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002191}
2192
Alexander Belopolsky40018472011-02-26 01:02:56 +00002193PyObject *
2194PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00002195{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002196 if (size < 0) {
2197 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00002198 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00002199 return NULL;
2200 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002201 if (u != NULL)
2202 return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
2203 else
2204 return (PyObject *)_PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00002205}
2206
Alexander Belopolsky40018472011-02-26 01:02:56 +00002207PyObject *
2208PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00002209{
2210 size_t size = strlen(u);
2211 if (size > PY_SSIZE_T_MAX) {
2212 PyErr_SetString(PyExc_OverflowError, "input too long");
2213 return NULL;
2214 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002215 return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002216}
2217
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002218PyObject *
2219_PyUnicode_FromId(_Py_Identifier *id)
2220{
2221 if (!id->object) {
Victor Stinnerd1cd99b2012-02-07 23:05:55 +01002222 id->object = PyUnicode_DecodeUTF8Stateful(id->string,
2223 strlen(id->string),
2224 NULL, NULL);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002225 if (!id->object)
2226 return NULL;
2227 PyUnicode_InternInPlace(&id->object);
2228 assert(!id->next);
2229 id->next = static_strings;
2230 static_strings = id;
2231 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002232 return id->object;
2233}
2234
2235void
2236_PyUnicode_ClearStaticStrings()
2237{
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002238 _Py_Identifier *tmp, *s = static_strings;
2239 while (s) {
Serhiy Storchaka505ff752014-02-09 13:33:53 +02002240 Py_CLEAR(s->object);
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002241 tmp = s->next;
2242 s->next = NULL;
2243 s = tmp;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002244 }
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002245 static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002246}
2247
Benjamin Peterson0df54292012-03-26 14:50:32 -04002248/* Internal function, doesn't check maximum character */
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002249
Victor Stinnerd3f08822012-05-29 12:57:52 +02002250PyObject*
2251_PyUnicode_FromASCII(const char *buffer, Py_ssize_t size)
Victor Stinner702c7342011-10-05 13:50:52 +02002252{
Victor Stinnerd3f08822012-05-29 12:57:52 +02002253 const unsigned char *s = (const unsigned char *)buffer;
Victor Stinner785938e2011-12-11 20:09:03 +01002254 PyObject *unicode;
Victor Stinnere6b2d442011-12-11 21:54:30 +01002255 if (size == 1) {
Victor Stinner0617b6e2011-10-05 23:26:01 +02002256#ifdef Py_DEBUG
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002257 assert((unsigned char)s[0] < 128);
Victor Stinner0617b6e2011-10-05 23:26:01 +02002258#endif
Antoine Pitrou7c46da72011-10-06 22:07:51 +02002259 return get_latin1_char(s[0]);
Victor Stinnere6b2d442011-12-11 21:54:30 +01002260 }
Victor Stinner785938e2011-12-11 20:09:03 +01002261 unicode = PyUnicode_New(size, 127);
2262 if (!unicode)
Victor Stinner702c7342011-10-05 13:50:52 +02002263 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01002264 memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
2265 assert(_PyUnicode_CheckConsistency(unicode, 1));
2266 return unicode;
Victor Stinner702c7342011-10-05 13:50:52 +02002267}
2268
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002269static Py_UCS4
2270kind_maxchar_limit(unsigned int kind)
2271{
Benjamin Petersonead6b532011-12-20 17:23:42 -06002272 switch (kind) {
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002273 case PyUnicode_1BYTE_KIND:
2274 return 0x80;
2275 case PyUnicode_2BYTE_KIND:
2276 return 0x100;
2277 case PyUnicode_4BYTE_KIND:
2278 return 0x10000;
2279 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07002280 Py_UNREACHABLE();
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002281 }
2282}
2283
Victor Stinner702c7342011-10-05 13:50:52 +02002284static PyObject*
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002285_PyUnicode_FromUCS1(const Py_UCS1* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00002286{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002287 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002288 unsigned char max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002289
Serhiy Storchaka678db842013-01-26 12:16:36 +02002290 if (size == 0)
2291 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002292 assert(size > 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02002293 if (size == 1)
2294 return get_latin1_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002295
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002296 max_char = ucs1lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002297 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002298 if (!res)
2299 return NULL;
2300 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002301 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002302 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00002303}
2304
Victor Stinnere57b1c02011-09-28 22:20:48 +02002305static PyObject*
2306_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002307{
2308 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002309 Py_UCS2 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002310
Serhiy Storchaka678db842013-01-26 12:16:36 +02002311 if (size == 0)
2312 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002313 assert(size > 0);
Victor Stinner985a82a2014-01-03 12:53:47 +01002314 if (size == 1)
2315 return unicode_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002316
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002317 max_char = ucs2lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002318 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002319 if (!res)
2320 return NULL;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002321 if (max_char >= 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002322 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002323 else {
2324 _PyUnicode_CONVERT_BYTES(
2325 Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
2326 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002327 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002328 return res;
2329}
2330
Victor Stinnere57b1c02011-09-28 22:20:48 +02002331static PyObject*
2332_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002333{
2334 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002335 Py_UCS4 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002336
Serhiy Storchaka678db842013-01-26 12:16:36 +02002337 if (size == 0)
2338 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002339 assert(size > 0);
Victor Stinner985a82a2014-01-03 12:53:47 +01002340 if (size == 1)
2341 return unicode_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002342
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002343 max_char = ucs4lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002344 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002345 if (!res)
2346 return NULL;
Antoine Pitrou950468e2011-10-11 22:45:48 +02002347 if (max_char < 256)
2348 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
2349 PyUnicode_1BYTE_DATA(res));
2350 else if (max_char < 0x10000)
2351 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
2352 PyUnicode_2BYTE_DATA(res));
2353 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002354 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002355 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002356 return res;
2357}
2358
2359PyObject*
2360PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
2361{
Victor Stinnercfed46e2011-11-22 01:29:14 +01002362 if (size < 0) {
2363 PyErr_SetString(PyExc_ValueError, "size must be positive");
2364 return NULL;
2365 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002366 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002367 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002368 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002369 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002370 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002371 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002372 return _PyUnicode_FromUCS4(buffer, size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002373 default:
Victor Stinnerb9275c12011-10-05 14:01:42 +02002374 PyErr_SetString(PyExc_SystemError, "invalid kind");
2375 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002376 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002377}
2378
Victor Stinnerece58de2012-04-23 23:36:38 +02002379Py_UCS4
2380_PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end)
2381{
2382 enum PyUnicode_Kind kind;
2383 void *startptr, *endptr;
2384
2385 assert(PyUnicode_IS_READY(unicode));
2386 assert(0 <= start);
2387 assert(end <= PyUnicode_GET_LENGTH(unicode));
2388 assert(start <= end);
2389
2390 if (start == 0 && end == PyUnicode_GET_LENGTH(unicode))
2391 return PyUnicode_MAX_CHAR_VALUE(unicode);
2392
2393 if (start == end)
2394 return 127;
2395
Victor Stinner94d558b2012-04-27 22:26:58 +02002396 if (PyUnicode_IS_ASCII(unicode))
2397 return 127;
2398
Victor Stinnerece58de2012-04-23 23:36:38 +02002399 kind = PyUnicode_KIND(unicode);
Benjamin Petersonf3b7d862012-04-23 18:07:01 -04002400 startptr = PyUnicode_DATA(unicode);
Benjamin Petersonb9f4c9d2012-04-23 21:45:40 -04002401 endptr = (char *)startptr + end * kind;
2402 startptr = (char *)startptr + start * kind;
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002403 switch(kind) {
2404 case PyUnicode_1BYTE_KIND:
2405 return ucs1lib_find_max_char(startptr, endptr);
2406 case PyUnicode_2BYTE_KIND:
2407 return ucs2lib_find_max_char(startptr, endptr);
2408 case PyUnicode_4BYTE_KIND:
2409 return ucs4lib_find_max_char(startptr, endptr);
Victor Stinnerece58de2012-04-23 23:36:38 +02002410 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07002411 Py_UNREACHABLE();
Victor Stinnerece58de2012-04-23 23:36:38 +02002412 }
2413}
2414
Victor Stinner25a4b292011-10-06 12:31:55 +02002415/* Ensure that a string uses the most efficient storage, if it is not the
2416 case: create a new string with of the right kind. Write NULL into *p_unicode
2417 on error. */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02002418static void
Victor Stinner25a4b292011-10-06 12:31:55 +02002419unicode_adjust_maxchar(PyObject **p_unicode)
2420{
2421 PyObject *unicode, *copy;
2422 Py_UCS4 max_char;
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002423 Py_ssize_t len;
Victor Stinner25a4b292011-10-06 12:31:55 +02002424 unsigned int kind;
2425
2426 assert(p_unicode != NULL);
2427 unicode = *p_unicode;
2428 assert(PyUnicode_IS_READY(unicode));
2429 if (PyUnicode_IS_ASCII(unicode))
2430 return;
2431
2432 len = PyUnicode_GET_LENGTH(unicode);
2433 kind = PyUnicode_KIND(unicode);
2434 if (kind == PyUnicode_1BYTE_KIND) {
2435 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002436 max_char = ucs1lib_find_max_char(u, u + len);
2437 if (max_char >= 128)
2438 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002439 }
2440 else if (kind == PyUnicode_2BYTE_KIND) {
2441 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002442 max_char = ucs2lib_find_max_char(u, u + len);
2443 if (max_char >= 256)
2444 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002445 }
2446 else {
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002447 const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
Victor Stinner25a4b292011-10-06 12:31:55 +02002448 assert(kind == PyUnicode_4BYTE_KIND);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002449 max_char = ucs4lib_find_max_char(u, u + len);
2450 if (max_char >= 0x10000)
2451 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002452 }
Victor Stinner25a4b292011-10-06 12:31:55 +02002453 copy = PyUnicode_New(len, max_char);
Victor Stinnerca439ee2012-06-16 03:17:34 +02002454 if (copy != NULL)
2455 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, len);
Victor Stinner25a4b292011-10-06 12:31:55 +02002456 Py_DECREF(unicode);
2457 *p_unicode = copy;
2458}
2459
Victor Stinner034f6cf2011-09-30 02:26:44 +02002460PyObject*
Victor Stinnerbf6e5602011-12-12 01:53:47 +01002461_PyUnicode_Copy(PyObject *unicode)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002462{
Victor Stinner87af4f22011-11-21 23:03:47 +01002463 Py_ssize_t length;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002464 PyObject *copy;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002465
Victor Stinner034f6cf2011-09-30 02:26:44 +02002466 if (!PyUnicode_Check(unicode)) {
2467 PyErr_BadInternalCall();
2468 return NULL;
2469 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05002470 if (PyUnicode_READY(unicode) == -1)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002471 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002472
Victor Stinner87af4f22011-11-21 23:03:47 +01002473 length = PyUnicode_GET_LENGTH(unicode);
2474 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002475 if (!copy)
2476 return NULL;
2477 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
2478
Christian Heimesf051e432016-09-13 20:22:02 +02002479 memcpy(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
Victor Stinner87af4f22011-11-21 23:03:47 +01002480 length * PyUnicode_KIND(unicode));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002481 assert(_PyUnicode_CheckConsistency(copy, 1));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002482 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02002483}
2484
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002485
Victor Stinnerbc603d12011-10-02 01:00:40 +02002486/* Widen Unicode objects to larger buffers. Don't write terminating null
2487 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002488
2489void*
2490_PyUnicode_AsKind(PyObject *s, unsigned int kind)
2491{
Victor Stinnerbc603d12011-10-02 01:00:40 +02002492 Py_ssize_t len;
2493 void *result;
2494 unsigned int skind;
2495
Benjamin Petersonbac79492012-01-14 13:34:47 -05002496 if (PyUnicode_READY(s) == -1)
Victor Stinnerbc603d12011-10-02 01:00:40 +02002497 return NULL;
2498
2499 len = PyUnicode_GET_LENGTH(s);
2500 skind = PyUnicode_KIND(s);
2501 if (skind >= kind) {
Victor Stinner01698042011-10-04 00:04:26 +02002502 PyErr_SetString(PyExc_SystemError, "invalid widening attempt");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002503 return NULL;
2504 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002505 switch (kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02002506 case PyUnicode_2BYTE_KIND:
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002507 result = PyMem_New(Py_UCS2, len);
Victor Stinnerbc603d12011-10-02 01:00:40 +02002508 if (!result)
2509 return PyErr_NoMemory();
2510 assert(skind == PyUnicode_1BYTE_KIND);
2511 _PyUnicode_CONVERT_BYTES(
2512 Py_UCS1, Py_UCS2,
2513 PyUnicode_1BYTE_DATA(s),
2514 PyUnicode_1BYTE_DATA(s) + len,
2515 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002516 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002517 case PyUnicode_4BYTE_KIND:
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002518 result = PyMem_New(Py_UCS4, len);
Victor Stinnerbc603d12011-10-02 01:00:40 +02002519 if (!result)
2520 return PyErr_NoMemory();
2521 if (skind == PyUnicode_2BYTE_KIND) {
2522 _PyUnicode_CONVERT_BYTES(
2523 Py_UCS2, Py_UCS4,
2524 PyUnicode_2BYTE_DATA(s),
2525 PyUnicode_2BYTE_DATA(s) + len,
2526 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002527 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02002528 else {
2529 assert(skind == PyUnicode_1BYTE_KIND);
2530 _PyUnicode_CONVERT_BYTES(
2531 Py_UCS1, Py_UCS4,
2532 PyUnicode_1BYTE_DATA(s),
2533 PyUnicode_1BYTE_DATA(s) + len,
2534 result);
2535 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002536 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002537 default:
2538 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002539 }
Victor Stinner01698042011-10-04 00:04:26 +02002540 PyErr_SetString(PyExc_SystemError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002541 return NULL;
2542}
2543
2544static Py_UCS4*
2545as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2546 int copy_null)
2547{
2548 int kind;
2549 void *data;
2550 Py_ssize_t len, targetlen;
2551 if (PyUnicode_READY(string) == -1)
2552 return NULL;
2553 kind = PyUnicode_KIND(string);
2554 data = PyUnicode_DATA(string);
2555 len = PyUnicode_GET_LENGTH(string);
2556 targetlen = len;
2557 if (copy_null)
2558 targetlen++;
2559 if (!target) {
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002560 target = PyMem_New(Py_UCS4, targetlen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002561 if (!target) {
2562 PyErr_NoMemory();
2563 return NULL;
2564 }
2565 }
2566 else {
2567 if (targetsize < targetlen) {
2568 PyErr_Format(PyExc_SystemError,
2569 "string is longer than the buffer");
2570 if (copy_null && 0 < targetsize)
2571 target[0] = 0;
2572 return NULL;
2573 }
2574 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002575 if (kind == PyUnicode_1BYTE_KIND) {
2576 Py_UCS1 *start = (Py_UCS1 *) data;
2577 _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002578 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002579 else if (kind == PyUnicode_2BYTE_KIND) {
2580 Py_UCS2 *start = (Py_UCS2 *) data;
2581 _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2582 }
2583 else {
2584 assert(kind == PyUnicode_4BYTE_KIND);
Christian Heimesf051e432016-09-13 20:22:02 +02002585 memcpy(target, data, len * sizeof(Py_UCS4));
Antoine Pitrou950468e2011-10-11 22:45:48 +02002586 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002587 if (copy_null)
2588 target[len] = 0;
2589 return target;
2590}
2591
2592Py_UCS4*
2593PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2594 int copy_null)
2595{
Antoine Pitroude20b0b2011-11-10 21:47:38 +01002596 if (target == NULL || targetsize < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002597 PyErr_BadInternalCall();
2598 return NULL;
2599 }
2600 return as_ucs4(string, target, targetsize, copy_null);
2601}
2602
2603Py_UCS4*
2604PyUnicode_AsUCS4Copy(PyObject *string)
2605{
2606 return as_ucs4(string, NULL, 0, 1);
2607}
2608
/* Maximum number of characters required for the output of %lld or %p.

   A long long needs at most ceil(log10(256) * SIZEOF_LONG_LONG) digits and
   53/22 is an upper bound for log10(256).  (SIZEOF_LONG_LONG*53 - 1) / 22 + 1
   is that ceiling in integer arithmetic; one more character is added for
   the sign. */
#define MAX_LONG_LONG_CHARS (((SIZEOF_LONG_LONG) * 53 - 1) / 22 + 2)
Victor Stinner96865452011-03-01 23:44:09 +00002613
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002614static int
2615unicode_fromformat_write_str(_PyUnicodeWriter *writer, PyObject *str,
2616 Py_ssize_t width, Py_ssize_t precision)
2617{
2618 Py_ssize_t length, fill, arglen;
2619 Py_UCS4 maxchar;
2620
2621 if (PyUnicode_READY(str) == -1)
2622 return -1;
2623
2624 length = PyUnicode_GET_LENGTH(str);
2625 if ((precision == -1 || precision >= length)
2626 && width <= length)
2627 return _PyUnicodeWriter_WriteStr(writer, str);
2628
2629 if (precision != -1)
2630 length = Py_MIN(precision, length);
2631
2632 arglen = Py_MAX(length, width);
2633 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
2634 maxchar = _PyUnicode_FindMaxChar(str, 0, length);
2635 else
2636 maxchar = writer->maxchar;
2637
2638 if (_PyUnicodeWriter_Prepare(writer, arglen, maxchar) == -1)
2639 return -1;
2640
2641 if (width > length) {
2642 fill = width - length;
2643 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, ' ') == -1)
2644 return -1;
2645 writer->pos += fill;
2646 }
2647
2648 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
2649 str, 0, length);
2650 writer->pos += length;
2651 return 0;
2652}
2653
2654static int
Victor Stinner998b8062018-09-12 00:23:25 +02002655unicode_fromformat_write_cstr(_PyUnicodeWriter *writer, const char *str,
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002656 Py_ssize_t width, Py_ssize_t precision)
2657{
2658 /* UTF-8 */
2659 Py_ssize_t length;
2660 PyObject *unicode;
2661 int res;
2662
Serhiy Storchakad586ccb2019-01-12 10:30:35 +02002663 if (precision == -1) {
2664 length = strlen(str);
2665 }
2666 else {
2667 length = 0;
2668 while (length < precision && str[length]) {
2669 length++;
2670 }
2671 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002672 unicode = PyUnicode_DecodeUTF8Stateful(str, length, "replace", NULL);
2673 if (unicode == NULL)
2674 return -1;
2675
2676 res = unicode_fromformat_write_str(writer, unicode, width, -1);
2677 Py_DECREF(unicode);
2678 return res;
2679}
2680
Victor Stinner96865452011-03-01 23:44:09 +00002681static const char*
Victor Stinnere215d962012-10-06 23:03:36 +02002682unicode_fromformat_arg(_PyUnicodeWriter *writer,
2683 const char *f, va_list *vargs)
Victor Stinner96865452011-03-01 23:44:09 +00002684{
Victor Stinnere215d962012-10-06 23:03:36 +02002685 const char *p;
2686 Py_ssize_t len;
2687 int zeropad;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002688 Py_ssize_t width;
2689 Py_ssize_t precision;
Victor Stinnere215d962012-10-06 23:03:36 +02002690 int longflag;
2691 int longlongflag;
2692 int size_tflag;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002693 Py_ssize_t fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002694
2695 p = f;
2696 f++;
Victor Stinner4c63a972012-10-06 23:55:33 +02002697 zeropad = 0;
2698 if (*f == '0') {
2699 zeropad = 1;
2700 f++;
2701 }
Victor Stinner96865452011-03-01 23:44:09 +00002702
2703 /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002704 width = -1;
2705 if (Py_ISDIGIT((unsigned)*f)) {
2706 width = *f - '0';
Victor Stinner96865452011-03-01 23:44:09 +00002707 f++;
Victor Stinnere215d962012-10-06 23:03:36 +02002708 while (Py_ISDIGIT((unsigned)*f)) {
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002709 if (width > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
Victor Stinner3921e902012-10-06 23:05:00 +02002710 PyErr_SetString(PyExc_ValueError,
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002711 "width too big");
Victor Stinner3921e902012-10-06 23:05:00 +02002712 return NULL;
2713 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002714 width = (width * 10) + (*f - '0');
Victor Stinnere215d962012-10-06 23:03:36 +02002715 f++;
2716 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002717 }
2718 precision = -1;
2719 if (*f == '.') {
2720 f++;
2721 if (Py_ISDIGIT((unsigned)*f)) {
2722 precision = (*f - '0');
2723 f++;
2724 while (Py_ISDIGIT((unsigned)*f)) {
2725 if (precision > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
2726 PyErr_SetString(PyExc_ValueError,
2727 "precision too big");
2728 return NULL;
2729 }
2730 precision = (precision * 10) + (*f - '0');
2731 f++;
2732 }
2733 }
Victor Stinner96865452011-03-01 23:44:09 +00002734 if (*f == '%') {
2735 /* "%.3%s" => f points to "3" */
2736 f--;
2737 }
2738 }
2739 if (*f == '\0') {
Victor Stinnere215d962012-10-06 23:03:36 +02002740 /* bogus format "%.123" => go backward, f points to "3" */
Victor Stinner96865452011-03-01 23:44:09 +00002741 f--;
2742 }
Victor Stinner96865452011-03-01 23:44:09 +00002743
2744 /* Handle %ld, %lu, %lld and %llu. */
2745 longflag = 0;
2746 longlongflag = 0;
Victor Stinnere7faec12011-03-02 00:01:53 +00002747 size_tflag = 0;
Victor Stinner96865452011-03-01 23:44:09 +00002748 if (*f == 'l') {
Victor Stinner6d970f42011-03-02 00:04:25 +00002749 if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
Victor Stinner96865452011-03-01 23:44:09 +00002750 longflag = 1;
2751 ++f;
2752 }
Victor Stinner96865452011-03-01 23:44:09 +00002753 else if (f[1] == 'l' &&
Victor Stinner6d970f42011-03-02 00:04:25 +00002754 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002755 longlongflag = 1;
2756 f += 2;
2757 }
Victor Stinner96865452011-03-01 23:44:09 +00002758 }
2759 /* handle the size_t flag. */
Victor Stinner6d970f42011-03-02 00:04:25 +00002760 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002761 size_tflag = 1;
2762 ++f;
2763 }
Victor Stinnere215d962012-10-06 23:03:36 +02002764
2765 if (f[1] == '\0')
2766 writer->overallocate = 0;
2767
2768 switch (*f) {
2769 case 'c':
2770 {
2771 int ordinal = va_arg(*vargs, int);
Victor Stinnerff5a8482012-10-06 23:05:45 +02002772 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Serhiy Storchakac89533f2013-06-23 20:21:16 +03002773 PyErr_SetString(PyExc_OverflowError,
Victor Stinnerff5a8482012-10-06 23:05:45 +02002774 "character argument not in range(0x110000)");
2775 return NULL;
2776 }
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02002777 if (_PyUnicodeWriter_WriteCharInline(writer, ordinal) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002778 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002779 break;
2780 }
2781
2782 case 'i':
2783 case 'd':
2784 case 'u':
2785 case 'x':
2786 {
2787 /* used by sprintf */
Victor Stinner15a11362012-10-06 23:48:20 +02002788 char buffer[MAX_LONG_LONG_CHARS];
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002789 Py_ssize_t arglen;
Victor Stinnere215d962012-10-06 23:03:36 +02002790
2791 if (*f == 'u') {
Victor Stinnere215d962012-10-06 23:03:36 +02002792 if (longflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002793 len = sprintf(buffer, "%lu",
Victor Stinnere215d962012-10-06 23:03:36 +02002794 va_arg(*vargs, unsigned long));
Victor Stinnere215d962012-10-06 23:03:36 +02002795 else if (longlongflag)
Benjamin Peterson47ff0732016-09-08 09:15:54 -07002796 len = sprintf(buffer, "%llu",
Benjamin Petersonaf580df2016-09-06 10:46:49 -07002797 va_arg(*vargs, unsigned long long));
Victor Stinnere215d962012-10-06 23:03:36 +02002798 else if (size_tflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002799 len = sprintf(buffer, "%" PY_FORMAT_SIZE_T "u",
Victor Stinnere215d962012-10-06 23:03:36 +02002800 va_arg(*vargs, size_t));
2801 else
Victor Stinner3aa979e2014-11-18 21:40:51 +01002802 len = sprintf(buffer, "%u",
Victor Stinnere215d962012-10-06 23:03:36 +02002803 va_arg(*vargs, unsigned int));
2804 }
2805 else if (*f == 'x') {
Victor Stinner3aa979e2014-11-18 21:40:51 +01002806 len = sprintf(buffer, "%x", va_arg(*vargs, int));
Victor Stinnere215d962012-10-06 23:03:36 +02002807 }
2808 else {
Victor Stinnere215d962012-10-06 23:03:36 +02002809 if (longflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002810 len = sprintf(buffer, "%li",
Victor Stinnere215d962012-10-06 23:03:36 +02002811 va_arg(*vargs, long));
Victor Stinnere215d962012-10-06 23:03:36 +02002812 else if (longlongflag)
Benjamin Peterson47ff0732016-09-08 09:15:54 -07002813 len = sprintf(buffer, "%lli",
Benjamin Petersonaf580df2016-09-06 10:46:49 -07002814 va_arg(*vargs, long long));
Victor Stinnere215d962012-10-06 23:03:36 +02002815 else if (size_tflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002816 len = sprintf(buffer, "%" PY_FORMAT_SIZE_T "i",
Victor Stinnere215d962012-10-06 23:03:36 +02002817 va_arg(*vargs, Py_ssize_t));
2818 else
Victor Stinner3aa979e2014-11-18 21:40:51 +01002819 len = sprintf(buffer, "%i",
Victor Stinnere215d962012-10-06 23:03:36 +02002820 va_arg(*vargs, int));
2821 }
2822 assert(len >= 0);
2823
Victor Stinnere215d962012-10-06 23:03:36 +02002824 if (precision < len)
2825 precision = len;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002826
2827 arglen = Py_MAX(precision, width);
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002828 if (_PyUnicodeWriter_Prepare(writer, arglen, 127) == -1)
2829 return NULL;
2830
Victor Stinnere215d962012-10-06 23:03:36 +02002831 if (width > precision) {
2832 Py_UCS4 fillchar;
2833 fill = width - precision;
2834 fillchar = zeropad?'0':' ';
Victor Stinner15a11362012-10-06 23:48:20 +02002835 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, fillchar) == -1)
2836 return NULL;
2837 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002838 }
Victor Stinner15a11362012-10-06 23:48:20 +02002839 if (precision > len) {
Victor Stinnere215d962012-10-06 23:03:36 +02002840 fill = precision - len;
Victor Stinner15a11362012-10-06 23:48:20 +02002841 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, '0') == -1)
2842 return NULL;
2843 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002844 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002845
Victor Stinner4a587072013-11-19 12:54:53 +01002846 if (_PyUnicodeWriter_WriteASCIIString(writer, buffer, len) < 0)
2847 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002848 break;
2849 }
2850
2851 case 'p':
2852 {
2853 char number[MAX_LONG_LONG_CHARS];
2854
2855 len = sprintf(number, "%p", va_arg(*vargs, void*));
2856 assert(len >= 0);
2857
2858 /* %p is ill-defined: ensure leading 0x. */
2859 if (number[1] == 'X')
2860 number[1] = 'x';
2861 else if (number[1] != 'x') {
2862 memmove(number + 2, number,
2863 strlen(number) + 1);
2864 number[0] = '0';
2865 number[1] = 'x';
2866 len += 2;
2867 }
2868
Victor Stinner4a587072013-11-19 12:54:53 +01002869 if (_PyUnicodeWriter_WriteASCIIString(writer, number, len) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002870 return NULL;
2871 break;
2872 }
2873
2874 case 's':
2875 {
2876 /* UTF-8 */
2877 const char *s = va_arg(*vargs, const char*);
Victor Stinner998b8062018-09-12 00:23:25 +02002878 if (unicode_fromformat_write_cstr(writer, s, width, precision) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002879 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002880 break;
2881 }
2882
2883 case 'U':
2884 {
2885 PyObject *obj = va_arg(*vargs, PyObject *);
2886 assert(obj && _PyUnicode_CHECK(obj));
2887
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002888 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002889 return NULL;
2890 break;
2891 }
2892
2893 case 'V':
2894 {
2895 PyObject *obj = va_arg(*vargs, PyObject *);
2896 const char *str = va_arg(*vargs, const char *);
Victor Stinnere215d962012-10-06 23:03:36 +02002897 if (obj) {
2898 assert(_PyUnicode_CHECK(obj));
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002899 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002900 return NULL;
2901 }
2902 else {
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002903 assert(str != NULL);
Victor Stinner998b8062018-09-12 00:23:25 +02002904 if (unicode_fromformat_write_cstr(writer, str, width, precision) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002905 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002906 }
2907 break;
2908 }
2909
2910 case 'S':
2911 {
2912 PyObject *obj = va_arg(*vargs, PyObject *);
2913 PyObject *str;
2914 assert(obj);
2915 str = PyObject_Str(obj);
2916 if (!str)
2917 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002918 if (unicode_fromformat_write_str(writer, str, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002919 Py_DECREF(str);
2920 return NULL;
2921 }
2922 Py_DECREF(str);
2923 break;
2924 }
2925
2926 case 'R':
2927 {
2928 PyObject *obj = va_arg(*vargs, PyObject *);
2929 PyObject *repr;
2930 assert(obj);
2931 repr = PyObject_Repr(obj);
2932 if (!repr)
2933 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002934 if (unicode_fromformat_write_str(writer, repr, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002935 Py_DECREF(repr);
2936 return NULL;
2937 }
2938 Py_DECREF(repr);
2939 break;
2940 }
2941
2942 case 'A':
2943 {
2944 PyObject *obj = va_arg(*vargs, PyObject *);
2945 PyObject *ascii;
2946 assert(obj);
2947 ascii = PyObject_ASCII(obj);
2948 if (!ascii)
2949 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002950 if (unicode_fromformat_write_str(writer, ascii, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002951 Py_DECREF(ascii);
2952 return NULL;
2953 }
2954 Py_DECREF(ascii);
2955 break;
2956 }
2957
2958 case '%':
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02002959 if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002960 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002961 break;
2962
2963 default:
2964 /* if we stumble upon an unknown formatting code, copy the rest
2965 of the format string to the output string. (we cannot just
2966 skip the code, since there's no way to know what's in the
2967 argument list) */
2968 len = strlen(p);
Victor Stinner4a587072013-11-19 12:54:53 +01002969 if (_PyUnicodeWriter_WriteLatin1String(writer, p, len) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002970 return NULL;
2971 f = p+len;
2972 return f;
2973 }
2974
2975 f++;
Victor Stinner96865452011-03-01 23:44:09 +00002976 return f;
2977}
2978
Walter Dörwaldd2034312007-05-18 16:29:38 +00002979PyObject *
2980PyUnicode_FromFormatV(const char *format, va_list vargs)
2981{
Victor Stinnere215d962012-10-06 23:03:36 +02002982 va_list vargs2;
2983 const char *f;
2984 _PyUnicodeWriter writer;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002985
Victor Stinner8f674cc2013-04-17 23:02:17 +02002986 _PyUnicodeWriter_Init(&writer);
2987 writer.min_length = strlen(format) + 100;
2988 writer.overallocate = 1;
Victor Stinnere215d962012-10-06 23:03:36 +02002989
Benjamin Peterson0c212142016-09-20 20:39:33 -07002990 // Copy varags to be able to pass a reference to a subfunction.
2991 va_copy(vargs2, vargs);
Victor Stinnere215d962012-10-06 23:03:36 +02002992
2993 for (f = format; *f; ) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00002994 if (*f == '%') {
Victor Stinnere215d962012-10-06 23:03:36 +02002995 f = unicode_fromformat_arg(&writer, f, &vargs2);
2996 if (f == NULL)
2997 goto fail;
Victor Stinner1205f272010-09-11 00:54:47 +00002998 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002999 else {
Victor Stinnere215d962012-10-06 23:03:36 +02003000 const char *p;
3001 Py_ssize_t len;
Walter Dörwaldd2034312007-05-18 16:29:38 +00003002
Victor Stinnere215d962012-10-06 23:03:36 +02003003 p = f;
3004 do
3005 {
3006 if ((unsigned char)*p > 127) {
3007 PyErr_Format(PyExc_ValueError,
3008 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
3009 "string, got a non-ASCII byte: 0x%02x",
3010 (unsigned char)*p);
Victor Stinner1ddf53d2016-09-21 14:13:14 +02003011 goto fail;
Victor Stinnere215d962012-10-06 23:03:36 +02003012 }
3013 p++;
3014 }
3015 while (*p != '\0' && *p != '%');
3016 len = p - f;
3017
3018 if (*p == '\0')
3019 writer.overallocate = 0;
Victor Stinner4a587072013-11-19 12:54:53 +01003020
3021 if (_PyUnicodeWriter_WriteASCIIString(&writer, f, len) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02003022 goto fail;
Victor Stinnere215d962012-10-06 23:03:36 +02003023
3024 f = p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00003025 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00003026 }
Christian Heimes2f2fee12016-09-21 11:37:27 +02003027 va_end(vargs2);
Victor Stinnere215d962012-10-06 23:03:36 +02003028 return _PyUnicodeWriter_Finish(&writer);
3029
3030 fail:
Christian Heimes2f2fee12016-09-21 11:37:27 +02003031 va_end(vargs2);
Victor Stinnere215d962012-10-06 23:03:36 +02003032 _PyUnicodeWriter_Dealloc(&writer);
Benjamin Peterson14339b62009-01-31 16:36:08 +00003033 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00003034}
3035
Walter Dörwaldd2034312007-05-18 16:29:38 +00003036PyObject *
3037PyUnicode_FromFormat(const char *format, ...)
3038{
Benjamin Peterson14339b62009-01-31 16:36:08 +00003039 PyObject* ret;
3040 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00003041
3042#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00003043 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00003044#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00003045 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00003046#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00003047 ret = PyUnicode_FromFormatV(format, vargs);
3048 va_end(vargs);
3049 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00003050}
3051
Serhiy Storchakac46db922018-10-23 22:58:24 +03003052static Py_ssize_t
3053unicode_get_widechar_size(PyObject *unicode)
3054{
3055 Py_ssize_t res;
3056
3057 assert(unicode != NULL);
3058 assert(_PyUnicode_CHECK(unicode));
3059
3060 if (_PyUnicode_WSTR(unicode) != NULL) {
3061 return PyUnicode_WSTR_LENGTH(unicode);
3062 }
3063 assert(PyUnicode_IS_READY(unicode));
3064
3065 res = _PyUnicode_LENGTH(unicode);
3066#if SIZEOF_WCHAR_T == 2
3067 if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
3068 const Py_UCS4 *s = PyUnicode_4BYTE_DATA(unicode);
3069 const Py_UCS4 *end = s + res;
3070 for (; s < end; ++s) {
3071 if (*s > 0xFFFF) {
3072 ++res;
3073 }
3074 }
3075 }
3076#endif
3077 return res;
3078}
3079
3080static void
3081unicode_copy_as_widechar(PyObject *unicode, wchar_t *w, Py_ssize_t size)
3082{
3083 const wchar_t *wstr;
3084
3085 assert(unicode != NULL);
3086 assert(_PyUnicode_CHECK(unicode));
3087
3088 wstr = _PyUnicode_WSTR(unicode);
3089 if (wstr != NULL) {
3090 memcpy(w, wstr, size * sizeof(wchar_t));
3091 return;
3092 }
3093 assert(PyUnicode_IS_READY(unicode));
3094
3095 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
3096 const Py_UCS1 *s = PyUnicode_1BYTE_DATA(unicode);
3097 for (; size--; ++s, ++w) {
3098 *w = *s;
3099 }
3100 }
3101 else {
3102#if SIZEOF_WCHAR_T == 4
3103 assert(PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND);
3104 const Py_UCS2 *s = PyUnicode_2BYTE_DATA(unicode);
3105 for (; size--; ++s, ++w) {
3106 *w = *s;
3107 }
3108#else
3109 assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
3110 const Py_UCS4 *s = PyUnicode_4BYTE_DATA(unicode);
3111 for (; size--; ++s, ++w) {
3112 Py_UCS4 ch = *s;
3113 if (ch > 0xFFFF) {
3114 assert(ch <= MAX_UNICODE);
3115 /* encode surrogate pair in this case */
3116 *w++ = Py_UNICODE_HIGH_SURROGATE(ch);
3117 if (!size--)
3118 break;
3119 *w = Py_UNICODE_LOW_SURROGATE(ch);
3120 }
3121 else {
3122 *w = ch;
3123 }
3124 }
3125#endif
3126 }
3127}
3128
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003129#ifdef HAVE_WCHAR_H
3130
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003131/* Convert a Unicode object to a wide character string.
Victor Stinner5593d8a2010-10-02 11:11:27 +00003132
Victor Stinnerd88d9832011-09-06 02:00:05 +02003133 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00003134 character) required to convert the unicode object. Ignore size argument.
3135
Victor Stinnerd88d9832011-09-06 02:00:05 +02003136 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00003137 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02003138 the null character). */
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003139Py_ssize_t
3140PyUnicode_AsWideChar(PyObject *unicode,
3141 wchar_t *w,
3142 Py_ssize_t size)
Victor Stinner137c34c2010-09-29 10:25:54 +00003143{
Victor Stinner5593d8a2010-10-02 11:11:27 +00003144 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003145
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003146 if (unicode == NULL) {
3147 PyErr_BadInternalCall();
3148 return -1;
3149 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003150 if (!PyUnicode_Check(unicode)) {
3151 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003152 return -1;
Victor Stinner5593d8a2010-10-02 11:11:27 +00003153 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003154
3155 res = unicode_get_widechar_size(unicode);
3156 if (w == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003157 return res + 1;
Serhiy Storchakac46db922018-10-23 22:58:24 +03003158 }
3159
3160 if (size > res) {
3161 size = res + 1;
3162 }
3163 else {
3164 res = size;
3165 }
3166 unicode_copy_as_widechar(unicode, w, size);
3167 return res;
Victor Stinner137c34c2010-09-29 10:25:54 +00003168}
3169
Victor Stinner137c34c2010-09-29 10:25:54 +00003170wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00003171PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00003172 Py_ssize_t *size)
3173{
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003174 wchar_t *buffer;
Victor Stinner137c34c2010-09-29 10:25:54 +00003175 Py_ssize_t buflen;
3176
3177 if (unicode == NULL) {
3178 PyErr_BadInternalCall();
3179 return NULL;
3180 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003181 if (!PyUnicode_Check(unicode)) {
3182 PyErr_BadArgument();
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003183 return NULL;
3184 }
3185
Serhiy Storchakac46db922018-10-23 22:58:24 +03003186 buflen = unicode_get_widechar_size(unicode);
3187 buffer = (wchar_t *) PyMem_NEW(wchar_t, (buflen + 1));
Victor Stinner137c34c2010-09-29 10:25:54 +00003188 if (buffer == NULL) {
3189 PyErr_NoMemory();
3190 return NULL;
3191 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003192 unicode_copy_as_widechar(unicode, buffer, buflen + 1);
3193 if (size != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00003194 *size = buflen;
Serhiy Storchakac46db922018-10-23 22:58:24 +03003195 }
3196 else if (wcslen(buffer) != (size_t)buflen) {
3197 PyMem_FREE(buffer);
3198 PyErr_SetString(PyExc_ValueError,
3199 "embedded null character");
3200 return NULL;
3201 }
Victor Stinner137c34c2010-09-29 10:25:54 +00003202 return buffer;
3203}
3204
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003205#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003206
Alexander Belopolsky40018472011-02-26 01:02:56 +00003207PyObject *
3208PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003209{
Victor Stinner8faf8212011-12-08 22:14:11 +01003210 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003211 PyErr_SetString(PyExc_ValueError,
3212 "chr() arg not in range(0x110000)");
3213 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003214 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00003215
Victor Stinner985a82a2014-01-03 12:53:47 +01003216 return unicode_char((Py_UCS4)ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003217}
3218
Alexander Belopolsky40018472011-02-26 01:02:56 +00003219PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02003220PyUnicode_FromObject(PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003221{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003222 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00003223 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003224 if (PyUnicode_CheckExact(obj)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05003225 if (PyUnicode_READY(obj) == -1)
Victor Stinnerd3a83d52011-10-01 03:09:33 +02003226 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00003227 Py_INCREF(obj);
3228 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003229 }
3230 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003231 /* For a Unicode subtype that's not a Unicode object,
3232 return a true Unicode object with the same data. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +01003233 return _PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003234 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003235 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003236 "Can't convert '%.100s' object to str implicitly",
3237 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00003238 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003239}
3240
Alexander Belopolsky40018472011-02-26 01:02:56 +00003241PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02003242PyUnicode_FromEncodedObject(PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003243 const char *encoding,
3244 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003245{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003246 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003247 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00003248
Guido van Rossumd57fd912000-03-10 22:53:23 +00003249 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003250 PyErr_BadInternalCall();
3251 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003252 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003253
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003254 /* Decoding bytes objects is the most common case and should be fast */
3255 if (PyBytes_Check(obj)) {
Victor Stinner22eb6892019-06-26 00:51:05 +02003256 if (PyBytes_GET_SIZE(obj) == 0) {
3257 if (unicode_check_encoding_errors(encoding, errors) < 0) {
3258 return NULL;
3259 }
Serhiy Storchaka05997252013-01-26 12:14:02 +02003260 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner22eb6892019-06-26 00:51:05 +02003261 }
3262 return PyUnicode_Decode(
Serhiy Storchaka05997252013-01-26 12:14:02 +02003263 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
3264 encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003265 }
3266
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003267 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003268 PyErr_SetString(PyExc_TypeError,
3269 "decoding str is not supported");
3270 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00003271 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003272
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003273 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
3274 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
3275 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003276 "decoding to str: need a bytes-like object, %.80s found",
3277 Py_TYPE(obj)->tp_name);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003278 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00003279 }
Tim Petersced69f82003-09-16 20:30:58 +00003280
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003281 if (buffer.len == 0) {
Serhiy Storchaka05997252013-01-26 12:14:02 +02003282 PyBuffer_Release(&buffer);
Victor Stinner22eb6892019-06-26 00:51:05 +02003283 if (unicode_check_encoding_errors(encoding, errors) < 0) {
3284 return NULL;
3285 }
Serhiy Storchaka05997252013-01-26 12:14:02 +02003286 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +00003287 }
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00003288
Serhiy Storchaka05997252013-01-26 12:14:02 +02003289 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003290 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003291 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003292}
3293
/* Normalize an encoding name: similar to encodings.normalize_encoding(), but
   also convert to lowercase.

   Alphanumeric characters and '.' are kept (lowercased).  Any run of other
   characters is replaced by a single '_' when it sits between two kept
   characters; leading and trailing runs are dropped.

   `lower` receives the null-terminated result and must have room for
   `lower_len` bytes.

   Return 1 on success, or 0 on error (lower_len is 0, or the normalized
   name is longer than lower_len-1). */
int
_Py_normalize_encoding(const char *encoding,
                       char *lower,
                       size_t lower_len)
{
    assert(encoding != NULL);

    /* Without this guard, lower_len - 1 would wrap around and the final
       terminator would be written outside the (empty) buffer. */
    if (lower_len == 0) {
        return 0;
    }

    char *out = lower;
    /* Last usable slot: reserved for the terminating null byte. */
    char *const out_end = &lower[lower_len - 1];
    /* Set while skipping a run of separator characters. */
    int pending_sep = 0;

    for (const char *in = encoding; *in != '\0'; in++) {
        char c = *in;

        if (!(Py_ISALNUM(c) || c == '.')) {
            pending_sep = 1;
            continue;
        }

        /* Emit one '_' for the skipped run, but never as the first
           output character. */
        if (pending_sep && out != lower) {
            if (out == out_end) {
                return 0;
            }
            *out++ = '_';
        }
        pending_sep = 0;

        if (out == out_end) {
            return 0;
        }
        *out++ = Py_TOLOWER(c);
    }

    *out = '\0';
    return 1;
}
3342
Alexander Belopolsky40018472011-02-26 01:02:56 +00003343PyObject *
3344PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003345 Py_ssize_t size,
3346 const char *encoding,
3347 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00003348{
3349 PyObject *buffer = NULL, *unicode;
3350 Py_buffer info;
Victor Stinner942889a2016-09-05 15:40:10 -07003351 char buflower[11]; /* strlen("iso-8859-1\0") == 11, longest shortcut */
3352
Victor Stinner22eb6892019-06-26 00:51:05 +02003353 if (unicode_check_encoding_errors(encoding, errors) < 0) {
3354 return NULL;
3355 }
3356
Victor Stinnered076ed2019-06-26 01:49:32 +02003357 if (size == 0) {
3358 _Py_RETURN_UNICODE_EMPTY();
3359 }
3360
Victor Stinner942889a2016-09-05 15:40:10 -07003361 if (encoding == NULL) {
3362 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3363 }
Victor Stinner600d3be2010-06-10 12:00:55 +00003364
Fred Drakee4315f52000-05-09 19:53:39 +00003365 /* Shortcuts for common default encodings */
Victor Stinner942889a2016-09-05 15:40:10 -07003366 if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3367 char *lower = buflower;
3368
3369 /* Fast paths */
3370 if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3371 lower += 3;
3372 if (*lower == '_') {
3373 /* Match "utf8" and "utf_8" */
3374 lower++;
3375 }
3376
3377 if (lower[0] == '8' && lower[1] == 0) {
3378 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3379 }
3380 else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3381 return PyUnicode_DecodeUTF16(s, size, errors, 0);
3382 }
3383 else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3384 return PyUnicode_DecodeUTF32(s, size, errors, 0);
3385 }
3386 }
3387 else {
3388 if (strcmp(lower, "ascii") == 0
3389 || strcmp(lower, "us_ascii") == 0) {
3390 return PyUnicode_DecodeASCII(s, size, errors);
3391 }
Steve Dowercc16be82016-09-08 10:35:16 -07003392 #ifdef MS_WINDOWS
Victor Stinner942889a2016-09-05 15:40:10 -07003393 else if (strcmp(lower, "mbcs") == 0) {
3394 return PyUnicode_DecodeMBCS(s, size, errors);
3395 }
3396 #endif
3397 else if (strcmp(lower, "latin1") == 0
3398 || strcmp(lower, "latin_1") == 0
3399 || strcmp(lower, "iso_8859_1") == 0
3400 || strcmp(lower, "iso8859_1") == 0) {
3401 return PyUnicode_DecodeLatin1(s, size, errors);
3402 }
3403 }
Victor Stinner37296e82010-06-10 13:36:23 +00003404 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003405
3406 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003407 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00003408 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003409 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00003410 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003411 if (buffer == NULL)
3412 goto onError;
Nick Coghlanc72e4e62013-11-22 22:39:36 +10003413 unicode = _PyCodec_DecodeText(buffer, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003414 if (unicode == NULL)
3415 goto onError;
3416 if (!PyUnicode_Check(unicode)) {
3417 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003418 "'%.400s' decoder returned '%.400s' instead of 'str'; "
Nick Coghlan8b097b42013-11-13 23:49:21 +10003419 "use codecs.decode() to decode to arbitrary types",
Victor Stinner998b8062018-09-12 00:23:25 +02003420 encoding,
3421 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003422 Py_DECREF(unicode);
3423 goto onError;
3424 }
3425 Py_DECREF(buffer);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003426 return unicode_result(unicode);
Tim Petersced69f82003-09-16 20:30:58 +00003427
Benjamin Peterson29060642009-01-31 22:14:21 +00003428 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003429 Py_XDECREF(buffer);
3430 return NULL;
3431}
3432
Alexander Belopolsky40018472011-02-26 01:02:56 +00003433PyObject *
3434PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003435 const char *encoding,
3436 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003437{
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003438 if (!PyUnicode_Check(unicode)) {
3439 PyErr_BadArgument();
Serhiy Storchaka77eede32016-10-25 10:07:51 +03003440 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003441 }
3442
Serhiy Storchaka00939072016-10-27 21:05:49 +03003443 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3444 "PyUnicode_AsDecodedObject() is deprecated; "
3445 "use PyCodec_Decode() to decode from str", 1) < 0)
3446 return NULL;
3447
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003448 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003449 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003450
3451 /* Decode via the codec registry */
Serhiy Storchaka77eede32016-10-25 10:07:51 +03003452 return PyCodec_Decode(unicode, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003453}
3454
Alexander Belopolsky40018472011-02-26 01:02:56 +00003455PyObject *
3456PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003457 const char *encoding,
3458 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003459{
3460 PyObject *v;
3461
3462 if (!PyUnicode_Check(unicode)) {
3463 PyErr_BadArgument();
3464 goto onError;
3465 }
3466
Serhiy Storchaka00939072016-10-27 21:05:49 +03003467 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3468 "PyUnicode_AsDecodedUnicode() is deprecated; "
3469 "use PyCodec_Decode() to decode from str to str", 1) < 0)
3470 return NULL;
3471
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003472 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003473 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003474
3475 /* Decode via the codec registry */
3476 v = PyCodec_Decode(unicode, encoding, errors);
3477 if (v == NULL)
3478 goto onError;
3479 if (!PyUnicode_Check(v)) {
3480 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003481 "'%.400s' decoder returned '%.400s' instead of 'str'; "
Nick Coghlan8b097b42013-11-13 23:49:21 +10003482 "use codecs.decode() to decode to arbitrary types",
Victor Stinner998b8062018-09-12 00:23:25 +02003483 encoding,
3484 Py_TYPE(unicode)->tp_name);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003485 Py_DECREF(v);
3486 goto onError;
3487 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003488 return unicode_result(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003489
Benjamin Peterson29060642009-01-31 22:14:21 +00003490 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003491 return NULL;
3492}
3493
Alexander Belopolsky40018472011-02-26 01:02:56 +00003494PyObject *
3495PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003496 Py_ssize_t size,
3497 const char *encoding,
3498 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003499{
3500 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00003501
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02003502 unicode = PyUnicode_FromWideChar(s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003503 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003504 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003505 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
3506 Py_DECREF(unicode);
3507 return v;
3508}
3509
Alexander Belopolsky40018472011-02-26 01:02:56 +00003510PyObject *
3511PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003512 const char *encoding,
3513 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003514{
3515 PyObject *v;
3516
3517 if (!PyUnicode_Check(unicode)) {
3518 PyErr_BadArgument();
3519 goto onError;
3520 }
3521
Serhiy Storchaka00939072016-10-27 21:05:49 +03003522 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3523 "PyUnicode_AsEncodedObject() is deprecated; "
3524 "use PyUnicode_AsEncodedString() to encode from str to bytes "
3525 "or PyCodec_Encode() for generic encoding", 1) < 0)
3526 return NULL;
3527
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003528 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003529 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003530
3531 /* Encode via the codec registry */
3532 v = PyCodec_Encode(unicode, encoding, errors);
3533 if (v == NULL)
3534 goto onError;
3535 return v;
3536
Benjamin Peterson29060642009-01-31 22:14:21 +00003537 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003538 return NULL;
3539}
3540
Victor Stinner1b579672011-12-17 05:47:23 +01003541
Victor Stinner2cba6b82018-01-10 22:46:15 +01003542static PyObject *
Victor Stinner709d23d2019-05-02 14:56:30 -04003543unicode_encode_locale(PyObject *unicode, _Py_error_handler error_handler,
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003544 int current_locale)
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003545{
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003546 Py_ssize_t wlen;
3547 wchar_t *wstr = PyUnicode_AsWideCharString(unicode, &wlen);
3548 if (wstr == NULL) {
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003549 return NULL;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003550 }
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003551
Victor Stinnerbde9d6b2018-11-28 10:26:20 +01003552 if ((size_t)wlen != wcslen(wstr)) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003553 PyErr_SetString(PyExc_ValueError, "embedded null character");
Victor Stinnerbde9d6b2018-11-28 10:26:20 +01003554 PyMem_Free(wstr);
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003555 return NULL;
3556 }
3557
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003558 char *str;
3559 size_t error_pos;
3560 const char *reason;
3561 int res = _Py_EncodeLocaleEx(wstr, &str, &error_pos, &reason,
Victor Stinner3d4226a2018-08-29 22:21:32 +02003562 current_locale, error_handler);
Victor Stinnerbde9d6b2018-11-28 10:26:20 +01003563 PyMem_Free(wstr);
3564
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003565 if (res != 0) {
3566 if (res == -2) {
3567 PyObject *exc;
3568 exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnns",
3569 "locale", unicode,
3570 (Py_ssize_t)error_pos,
3571 (Py_ssize_t)(error_pos+1),
3572 reason);
3573 if (exc != NULL) {
3574 PyCodec_StrictErrors(exc);
3575 Py_DECREF(exc);
3576 }
Victor Stinner2cba6b82018-01-10 22:46:15 +01003577 }
Victor Stinner3d4226a2018-08-29 22:21:32 +02003578 else if (res == -3) {
3579 PyErr_SetString(PyExc_ValueError, "unsupported error handler");
3580 }
Victor Stinner2cba6b82018-01-10 22:46:15 +01003581 else {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003582 PyErr_NoMemory();
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003583 }
Victor Stinnerbde9d6b2018-11-28 10:26:20 +01003584 return NULL;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003585 }
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003586
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003587 PyObject *bytes = PyBytes_FromString(str);
3588 PyMem_RawFree(str);
3589 return bytes;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003590}
3591
Victor Stinnerad158722010-10-27 00:25:46 +00003592PyObject *
Victor Stinner2cba6b82018-01-10 22:46:15 +01003593PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
3594{
Victor Stinner709d23d2019-05-02 14:56:30 -04003595 _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
3596 return unicode_encode_locale(unicode, error_handler, 1);
Victor Stinner2cba6b82018-01-10 22:46:15 +01003597}
3598
3599PyObject *
Victor Stinnerad158722010-10-27 00:25:46 +00003600PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00003601{
Victor Stinnercaba55b2018-08-03 15:33:52 +02003602 PyInterpreterState *interp = _PyInterpreterState_GET_UNSAFE();
Victor Stinnere2510952019-05-02 11:28:57 -04003603#ifdef _Py_FORCE_UTF8_FS_ENCODING
Victor Stinner709d23d2019-05-02 14:56:30 -04003604 if (interp->fs_codec.encoding) {
3605 return unicode_encode_utf8(unicode,
3606 interp->fs_codec.error_handler,
3607 interp->fs_codec.errors);
3608 }
3609 else {
Victor Stinner331a6a52019-05-27 16:39:22 +02003610 const wchar_t *filesystem_errors = interp->config.filesystem_errors;
Victor Stinner709d23d2019-05-02 14:56:30 -04003611 _Py_error_handler errors;
Victor Stinner331a6a52019-05-27 16:39:22 +02003612 errors = get_error_handler_wide(filesystem_errors);
Victor Stinner709d23d2019-05-02 14:56:30 -04003613 assert(errors != _Py_ERROR_UNKNOWN);
3614 return unicode_encode_utf8(unicode, errors, NULL);
3615 }
Victor Stinnerb2457ef2018-08-29 13:25:36 +02003616#else
Victor Stinner793b5312011-04-27 00:24:21 +02003617 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3618 cannot use it to encode and decode filenames before it is loaded. Load
3619 the Python codec requires to encode at least its own filename. Use the C
Victor Stinnerb2457ef2018-08-29 13:25:36 +02003620 implementation of the locale codec until the codec registry is
Victor Stinner22eb6892019-06-26 00:51:05 +02003621 initialized and the Python codec is loaded.
3622 See _PyUnicode_InitEncodings(). */
Victor Stinner709d23d2019-05-02 14:56:30 -04003623 if (interp->fs_codec.encoding) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00003624 return PyUnicode_AsEncodedString(unicode,
Victor Stinner709d23d2019-05-02 14:56:30 -04003625 interp->fs_codec.encoding,
3626 interp->fs_codec.errors);
Victor Stinnerc39211f2010-09-29 16:35:47 +00003627 }
3628 else {
Victor Stinner331a6a52019-05-27 16:39:22 +02003629 const wchar_t *filesystem_errors = interp->config.filesystem_errors;
Victor Stinner709d23d2019-05-02 14:56:30 -04003630 _Py_error_handler errors;
Victor Stinner331a6a52019-05-27 16:39:22 +02003631 errors = get_error_handler_wide(filesystem_errors);
Victor Stinner709d23d2019-05-02 14:56:30 -04003632 assert(errors != _Py_ERROR_UNKNOWN);
3633 return unicode_encode_locale(unicode, errors, 0);
Victor Stinnerc39211f2010-09-29 16:35:47 +00003634 }
Victor Stinnerad158722010-10-27 00:25:46 +00003635#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00003636}
3637
/* Encode a str object to a bytes object using the named codec.

   unicode:  must be a str, otherwise PyErr_BadArgument() is raised.
   encoding: codec name; NULL means UTF-8.
   errors:   error handler name, forwarded to the selected encoder.

   A few common encodings (UTF-8/16/32, ASCII, Latin-1 and, on Windows,
   mbcs) are dispatched directly to their C encoders after normalizing
   the encoding name; everything else goes through the codec registry.

   Returns a bytes object, or NULL with an exception set.  If a registry
   codec returns a bytearray, a RuntimeWarning is issued and the content
   is copied into a new bytes object; any other result type raises
   TypeError. */
PyObject *
PyUnicode_AsEncodedString(PyObject *unicode,
                          const char *encoding,
                          const char *errors)
{
    PyObject *v;
    char buflower[11]; /* sizeof("iso_8859_1") == 11, longest shortcut */

    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }

    if (unicode_check_encoding_errors(encoding, errors) < 0) {
        return NULL;
    }

    if (encoding == NULL) {
        return _PyUnicode_AsUTF8String(unicode, errors);
    }

    /* Shortcuts for common default encodings.  Names that do not fit in
       buflower cannot be one of the shortcuts and fall through to the
       codec registry below. */
    if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
        char *lower = buflower;

        /* Fast paths */
        if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
            lower += 3;
            if (*lower == '_') {
                /* Match "utf8" and "utf_8" */
                lower++;
            }

            if (lower[0] == '8' && lower[1] == 0) {
                return _PyUnicode_AsUTF8String(unicode, errors);
            }
            else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
                return _PyUnicode_EncodeUTF16(unicode, errors, 0);
            }
            else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
                return _PyUnicode_EncodeUTF32(unicode, errors, 0);
            }
        }
        else {
            if (strcmp(lower, "ascii") == 0
                || strcmp(lower, "us_ascii") == 0) {
                return _PyUnicode_AsASCIIString(unicode, errors);
            }
#ifdef MS_WINDOWS
            else if (strcmp(lower, "mbcs") == 0) {
                return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
            }
#endif
            else if (strcmp(lower, "latin1") == 0 ||
                     strcmp(lower, "latin_1") == 0 ||
                     strcmp(lower, "iso_8859_1") == 0 ||
                     strcmp(lower, "iso8859_1") == 0) {
                return _PyUnicode_AsLatin1String(unicode, errors);
            }
        }
    }

    /* Encode via the codec registry */
    v = _PyCodec_EncodeText(unicode, encoding, errors);
    if (v == NULL)
        return NULL;

    /* The normal path */
    if (PyBytes_Check(v))
        return v;

    /* If the codec returns a buffer, raise a warning and convert to bytes */
    if (PyByteArray_Check(v)) {
        int error;
        PyObject *b;

        error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
                                 "encoder %s returned bytearray instead of bytes; "
                                 "use codecs.encode() to encode to arbitrary types",
                                 encoding);
        if (error) {
            /* The warning was turned into an exception. */
            Py_DECREF(v);
            return NULL;
        }

        b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v),
                                      PyByteArray_GET_SIZE(v));
        Py_DECREF(v);
        return b;
    }

    /* Neither bytes nor bytearray: reject the codec's result. */
    PyErr_Format(PyExc_TypeError,
                 "'%.400s' encoder returned '%.400s' instead of 'bytes'; "
                 "use codecs.encode() to encode to arbitrary types",
                 encoding,
                 Py_TYPE(v)->tp_name);
    Py_DECREF(v);
    return NULL;
}
3737
Alexander Belopolsky40018472011-02-26 01:02:56 +00003738PyObject *
3739PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003740 const char *encoding,
3741 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003742{
3743 PyObject *v;
3744
3745 if (!PyUnicode_Check(unicode)) {
3746 PyErr_BadArgument();
3747 goto onError;
3748 }
3749
Serhiy Storchaka00939072016-10-27 21:05:49 +03003750 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3751 "PyUnicode_AsEncodedUnicode() is deprecated; "
3752 "use PyCodec_Encode() to encode from str to str", 1) < 0)
3753 return NULL;
3754
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003755 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003756 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003757
3758 /* Encode via the codec registry */
3759 v = PyCodec_Encode(unicode, encoding, errors);
3760 if (v == NULL)
3761 goto onError;
3762 if (!PyUnicode_Check(v)) {
3763 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003764 "'%.400s' encoder returned '%.400s' instead of 'str'; "
Nick Coghlan8b097b42013-11-13 23:49:21 +10003765 "use codecs.encode() to encode to arbitrary types",
Victor Stinner998b8062018-09-12 00:23:25 +02003766 encoding,
3767 Py_TYPE(v)->tp_name);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003768 Py_DECREF(v);
3769 goto onError;
3770 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003771 return v;
Tim Petersced69f82003-09-16 20:30:58 +00003772
Benjamin Peterson29060642009-01-31 22:14:21 +00003773 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003774 return NULL;
3775}
3776
Victor Stinner2cba6b82018-01-10 22:46:15 +01003777static PyObject*
Victor Stinner709d23d2019-05-02 14:56:30 -04003778unicode_decode_locale(const char *str, Py_ssize_t len,
3779 _Py_error_handler errors, int current_locale)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003780{
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003781 if (str[len] != '\0' || (size_t)len != strlen(str)) {
3782 PyErr_SetString(PyExc_ValueError, "embedded null byte");
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003783 return NULL;
3784 }
3785
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003786 wchar_t *wstr;
3787 size_t wlen;
3788 const char *reason;
3789 int res = _Py_DecodeLocaleEx(str, &wstr, &wlen, &reason,
Victor Stinner709d23d2019-05-02 14:56:30 -04003790 current_locale, errors);
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003791 if (res != 0) {
3792 if (res == -2) {
3793 PyObject *exc;
3794 exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nns",
3795 "locale", str, len,
3796 (Py_ssize_t)wlen,
3797 (Py_ssize_t)(wlen + 1),
3798 reason);
3799 if (exc != NULL) {
3800 PyCodec_StrictErrors(exc);
3801 Py_DECREF(exc);
3802 }
Victor Stinner2cba6b82018-01-10 22:46:15 +01003803 }
Victor Stinner3d4226a2018-08-29 22:21:32 +02003804 else if (res == -3) {
3805 PyErr_SetString(PyExc_ValueError, "unsupported error handler");
3806 }
Victor Stinner2cba6b82018-01-10 22:46:15 +01003807 else {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003808 PyErr_NoMemory();
Victor Stinner2cba6b82018-01-10 22:46:15 +01003809 }
Victor Stinner2f197072011-12-17 07:08:30 +01003810 return NULL;
Victor Stinner2f197072011-12-17 07:08:30 +01003811 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003812
3813 PyObject *unicode = PyUnicode_FromWideChar(wstr, wlen);
3814 PyMem_RawFree(wstr);
3815 return unicode;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003816}
3817
3818PyObject*
Victor Stinner2cba6b82018-01-10 22:46:15 +01003819PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
3820 const char *errors)
3821{
Victor Stinner709d23d2019-05-02 14:56:30 -04003822 _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
3823 return unicode_decode_locale(str, len, error_handler, 1);
Victor Stinner2cba6b82018-01-10 22:46:15 +01003824}
3825
3826PyObject*
Victor Stinner1b579672011-12-17 05:47:23 +01003827PyUnicode_DecodeLocale(const char *str, const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003828{
3829 Py_ssize_t size = (Py_ssize_t)strlen(str);
Victor Stinner709d23d2019-05-02 14:56:30 -04003830 _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
3831 return unicode_decode_locale(str, size, error_handler, 1);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003832}
3833
3834
3835PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00003836PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003837 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00003838 return PyUnicode_DecodeFSDefaultAndSize(s, size);
3839}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003840
Christian Heimes5894ba72007-11-04 11:43:14 +00003841PyObject*
3842PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3843{
Victor Stinnercaba55b2018-08-03 15:33:52 +02003844 PyInterpreterState *interp = _PyInterpreterState_GET_UNSAFE();
Victor Stinnere2510952019-05-02 11:28:57 -04003845#ifdef _Py_FORCE_UTF8_FS_ENCODING
Victor Stinner709d23d2019-05-02 14:56:30 -04003846 if (interp->fs_codec.encoding) {
3847 return unicode_decode_utf8(s, size,
3848 interp->fs_codec.error_handler,
3849 interp->fs_codec.errors,
3850 NULL);
3851 }
3852 else {
Victor Stinner331a6a52019-05-27 16:39:22 +02003853 const wchar_t *filesystem_errors = interp->config.filesystem_errors;
Victor Stinner709d23d2019-05-02 14:56:30 -04003854 _Py_error_handler errors;
Victor Stinner331a6a52019-05-27 16:39:22 +02003855 errors = get_error_handler_wide(filesystem_errors);
Victor Stinner709d23d2019-05-02 14:56:30 -04003856 assert(errors != _Py_ERROR_UNKNOWN);
3857 return unicode_decode_utf8(s, size, errors, NULL, NULL);
3858 }
Victor Stinnerb2457ef2018-08-29 13:25:36 +02003859#else
Victor Stinner793b5312011-04-27 00:24:21 +02003860 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3861 cannot use it to encode and decode filenames before it is loaded. Load
3862 the Python codec requires to encode at least its own filename. Use the C
Victor Stinnerb2457ef2018-08-29 13:25:36 +02003863 implementation of the locale codec until the codec registry is
Victor Stinner22eb6892019-06-26 00:51:05 +02003864 initialized and the Python codec is loaded.
3865 See _PyUnicode_InitEncodings(). */
Victor Stinner709d23d2019-05-02 14:56:30 -04003866 if (interp->fs_codec.encoding) {
Steve Dower78057b42016-11-06 19:35:08 -08003867 return PyUnicode_Decode(s, size,
Victor Stinner709d23d2019-05-02 14:56:30 -04003868 interp->fs_codec.encoding,
3869 interp->fs_codec.errors);
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003870 }
3871 else {
Victor Stinner331a6a52019-05-27 16:39:22 +02003872 const wchar_t *filesystem_errors = interp->config.filesystem_errors;
Victor Stinner709d23d2019-05-02 14:56:30 -04003873 _Py_error_handler errors;
Victor Stinner331a6a52019-05-27 16:39:22 +02003874 errors = get_error_handler_wide(filesystem_errors);
Victor Stinner709d23d2019-05-02 14:56:30 -04003875 return unicode_decode_locale(s, size, errors, 0);
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003876 }
Victor Stinnerad158722010-10-27 00:25:46 +00003877#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003878}
3879
Martin v. Löwis011e8422009-05-05 04:43:17 +00003880
3881int
3882PyUnicode_FSConverter(PyObject* arg, void* addr)
3883{
Brett Cannonec6ce872016-09-06 15:50:29 -07003884 PyObject *path = NULL;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003885 PyObject *output = NULL;
3886 Py_ssize_t size;
3887 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003888 if (arg == NULL) {
3889 Py_DECREF(*(PyObject**)addr);
Benjamin Petersona4d33b32015-11-15 21:57:39 -08003890 *(PyObject**)addr = NULL;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003891 return 1;
3892 }
Brett Cannonec6ce872016-09-06 15:50:29 -07003893 path = PyOS_FSPath(arg);
3894 if (path == NULL) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03003895 return 0;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003896 }
Brett Cannonec6ce872016-09-06 15:50:29 -07003897 if (PyBytes_Check(path)) {
3898 output = path;
3899 }
3900 else { // PyOS_FSPath() guarantees its returned value is bytes or str.
3901 output = PyUnicode_EncodeFSDefault(path);
3902 Py_DECREF(path);
3903 if (!output) {
3904 return 0;
3905 }
3906 assert(PyBytes_Check(output));
3907 }
3908
Victor Stinner0ea2a462010-04-30 00:22:08 +00003909 size = PyBytes_GET_SIZE(output);
3910 data = PyBytes_AS_STRING(output);
Victor Stinner12174a52014-08-15 23:17:38 +02003911 if ((size_t)size != strlen(data)) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003912 PyErr_SetString(PyExc_ValueError, "embedded null byte");
Martin v. Löwis011e8422009-05-05 04:43:17 +00003913 Py_DECREF(output);
3914 return 0;
3915 }
3916 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003917 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003918}
3919
3920
/* Converter turning a path-like argument into a str object.

   arg:  the object to convert, or NULL for the cleanup call.
   addr: address of a PyObject* that receives the result.

   Cleanup call (arg == NULL): the object stored at *addr is released,
   the slot is set to NULL, and 1 is returned.

   Conversion: objects supporting the buffer protocol are used directly;
   anything else goes through PyOS_FSPath().  A str is used as is; bytes
   (or, with a DeprecationWarning, any other buffer object) are decoded
   with PyUnicode_DecodeFSDefaultAndSize().  A result containing an
   embedded null character is rejected with ValueError.  Returns
   Py_CLEANUP_SUPPORTED on success and 0 on failure with an exception
   set. */
int
PyUnicode_FSDecoder(PyObject* arg, void* addr)
{
    int is_buffer = 0;
    PyObject *path = NULL;
    PyObject *output = NULL;
    if (arg == NULL) {
        Py_DECREF(*(PyObject**)addr);
        *(PyObject**)addr = NULL;
        return 1;
    }

    is_buffer = PyObject_CheckBuffer(arg);
    if (!is_buffer) {
        path = PyOS_FSPath(arg);
        if (path == NULL) {
            return 0;
        }
    }
    else {
        /* Take our own reference so that both branches own `path`. */
        path = arg;
        Py_INCREF(arg);
    }

    if (PyUnicode_Check(path)) {
        /* Ownership of the reference moves from `path` to `output`. */
        output = path;
    }
    else if (PyBytes_Check(path) || is_buffer) {
        PyObject *path_bytes = NULL;

        /* Buffer objects other than bytes are still accepted, but only
           after a DeprecationWarning; fail if it is raised as an error. */
        if (!PyBytes_Check(path) &&
            PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
            "path should be string, bytes, or os.PathLike, not %.200s",
            Py_TYPE(arg)->tp_name)) {
            Py_DECREF(path);
            return 0;
        }
        path_bytes = PyBytes_FromObject(path);
        Py_DECREF(path);
        if (!path_bytes) {
            return 0;
        }
        output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(path_bytes),
                                                  PyBytes_GET_SIZE(path_bytes));
        Py_DECREF(path_bytes);
        if (!output) {
            return 0;
        }
    }
    else {
        PyErr_Format(PyExc_TypeError,
                     "path should be string, bytes, or os.PathLike, not %.200s",
                     Py_TYPE(arg)->tp_name);
        Py_DECREF(path);
        return 0;
    }
    if (PyUnicode_READY(output) == -1) {
        Py_DECREF(output);
        return 0;
    }
    /* Search the whole string for a null character. */
    if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
                 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
        PyErr_SetString(PyExc_ValueError, "embedded null character");
        Py_DECREF(output);
        return 0;
    }
    *(PyObject**)addr = output;
    return Py_CLEANUP_SUPPORTED;
}
3990
3991
/* Return a pointer to the UTF-8 representation of a str object.

   unicode: must be a str, otherwise PyErr_BadArgument() is raised.
   psize:   if not NULL, receives the UTF-8 length stored on the object.

   If the object has no UTF-8 representation yet, one is produced with
   _PyUnicode_AsUTF8String(), copied into a buffer allocated with
   PyObject_MALLOC() and stored on the object.  The returned pointer
   refers to the buffer held by the object, so the caller must not free
   it.  Returns NULL with an exception set on failure. */
const char *
PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
{
    PyObject *bytes;

    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }
    if (PyUnicode_READY(unicode) == -1)
        return NULL;

    if (PyUnicode_UTF8(unicode) == NULL) {
        assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
        bytes = _PyUnicode_AsUTF8String(unicode, NULL);
        if (bytes == NULL)
            return NULL;
        /* +1 leaves room for the terminating NUL byte. */
        _PyUnicode_UTF8(unicode) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
        if (_PyUnicode_UTF8(unicode) == NULL) {
            PyErr_NoMemory();
            Py_DECREF(bytes);
            return NULL;
        }
        _PyUnicode_UTF8_LENGTH(unicode) = PyBytes_GET_SIZE(bytes);
        /* Copy length + 1 bytes so the NUL terminator of the bytes
           object's buffer comes along. */
        memcpy(_PyUnicode_UTF8(unicode),
                  PyBytes_AS_STRING(bytes),
                  _PyUnicode_UTF8_LENGTH(unicode) + 1);
        Py_DECREF(bytes);
    }

    if (psize)
        *psize = PyUnicode_UTF8_LENGTH(unicode);
    return PyUnicode_UTF8(unicode);
}
4026
Serhiy Storchaka2a404b62017-01-22 23:07:07 +02004027const char *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004028PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00004029{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004030 return PyUnicode_AsUTF8AndSize(unicode, NULL);
4031}
4032
/* Return a pointer to the wchar_t (Py_UNICODE) representation of a str
   object.

   unicode: must be a str, otherwise PyErr_BadArgument() is raised.
   size:    if not NULL, receives PyUnicode_WSTR_LENGTH(unicode).

   If the object has no wstr buffer yet, one is allocated with
   PyObject_MALLOC(), filled with unicode_copy_as_widechar() and stored
   on the object.  The returned pointer refers to the buffer held by the
   object, so the caller must not free it.  Returns NULL with an
   exception set on failure. */
Py_UNICODE *
PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
{
    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }
    Py_UNICODE *w = _PyUnicode_WSTR(unicode);
    if (w == NULL) {
        /* Non-ASCII compact unicode object */
        assert(_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND);
        assert(PyUnicode_IS_READY(unicode));

        Py_ssize_t wlen = unicode_get_widechar_size(unicode);
        /* Guard the byte-size computation below ((wlen + 1) wchar_t
           elements) against overflow. */
        if ((size_t)wlen > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
            PyErr_NoMemory();
            return NULL;
        }
        w = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) * (wlen + 1));
        if (w == NULL) {
            PyErr_NoMemory();
            return NULL;
        }
        /* wlen + 1 elements: the extra one is for the terminating null. */
        unicode_copy_as_widechar(unicode, w, wlen + 1);
        _PyUnicode_WSTR(unicode) = w;
        /* NOTE(review): presumably compact ASCII objects have no separate
           wstr length field to update -- confirm against the object
           layout. */
        if (!PyUnicode_IS_COMPACT_ASCII(unicode)) {
            _PyUnicode_WSTR_LENGTH(unicode) = wlen;
        }
    }
    if (size != NULL)
        *size = PyUnicode_WSTR_LENGTH(unicode);
    return w;
}
4066
Alexander Belopolsky40018472011-02-26 01:02:56 +00004067Py_UNICODE *
4068PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004069{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004070 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004071}
4072
Serhiy Storchakaf7eae0a2017-06-28 08:30:06 +03004073const Py_UNICODE *
4074_PyUnicode_AsUnicode(PyObject *unicode)
4075{
4076 Py_ssize_t size;
4077 const Py_UNICODE *wstr;
4078
4079 wstr = PyUnicode_AsUnicodeAndSize(unicode, &size);
4080 if (wstr && wcslen(wstr) != (size_t)size) {
4081 PyErr_SetString(PyExc_ValueError, "embedded null character");
4082 return NULL;
4083 }
4084 return wstr;
4085}
4086
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004087
Alexander Belopolsky40018472011-02-26 01:02:56 +00004088Py_ssize_t
4089PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004090{
4091 if (!PyUnicode_Check(unicode)) {
4092 PyErr_BadArgument();
4093 goto onError;
4094 }
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02004095 if (_PyUnicode_WSTR(unicode) == NULL) {
4096 if (PyUnicode_AsUnicode(unicode) == NULL)
4097 goto onError;
4098 }
4099 return PyUnicode_WSTR_LENGTH(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004100
Benjamin Peterson29060642009-01-31 22:14:21 +00004101 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004102 return -1;
4103}
4104
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004105Py_ssize_t
4106PyUnicode_GetLength(PyObject *unicode)
4107{
Victor Stinner07621332012-06-16 04:53:46 +02004108 if (!PyUnicode_Check(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004109 PyErr_BadArgument();
4110 return -1;
4111 }
Victor Stinner07621332012-06-16 04:53:46 +02004112 if (PyUnicode_READY(unicode) == -1)
4113 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004114 return PyUnicode_GET_LENGTH(unicode);
4115}
4116
4117Py_UCS4
4118PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
4119{
Victor Stinner69ed0f42013-04-09 21:48:24 +02004120 void *data;
4121 int kind;
4122
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +03004123 if (!PyUnicode_Check(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02004124 PyErr_BadArgument();
4125 return (Py_UCS4)-1;
4126 }
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +03004127 if (PyUnicode_READY(unicode) == -1) {
4128 return (Py_UCS4)-1;
4129 }
Victor Stinnerc4b49542011-12-11 22:44:26 +01004130 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02004131 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004132 return (Py_UCS4)-1;
4133 }
Victor Stinner69ed0f42013-04-09 21:48:24 +02004134 data = PyUnicode_DATA(unicode);
4135 kind = PyUnicode_KIND(unicode);
4136 return PyUnicode_READ(kind, data, index);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004137}
4138
4139int
4140PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
4141{
4142 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02004143 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004144 return -1;
4145 }
Victor Stinner488fa492011-12-12 00:01:39 +01004146 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerc4b49542011-12-11 22:44:26 +01004147 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02004148 PyErr_SetString(PyExc_IndexError, "string index out of range");
4149 return -1;
4150 }
Victor Stinner488fa492011-12-12 00:01:39 +01004151 if (unicode_check_modifiable(unicode))
Victor Stinnercd9950f2011-10-02 00:34:53 +02004152 return -1;
Victor Stinnerc9590ad2012-03-04 01:34:37 +01004153 if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
4154 PyErr_SetString(PyExc_ValueError, "character out of range");
4155 return -1;
4156 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004157 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
4158 index, ch);
4159 return 0;
4160}
4161
/* Return the name of the default encoding as a NUL-terminated C string
   with static storage duration.  The value is always "utf-8"; the caller
   must not modify or free it. */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
4167
Victor Stinner554f3f02010-06-16 23:33:54 +00004168/* create or adjust a UnicodeDecodeError */
4169static void
4170make_decode_exception(PyObject **exceptionObject,
4171 const char *encoding,
4172 const char *input, Py_ssize_t length,
4173 Py_ssize_t startpos, Py_ssize_t endpos,
4174 const char *reason)
4175{
4176 if (*exceptionObject == NULL) {
4177 *exceptionObject = PyUnicodeDecodeError_Create(
4178 encoding, input, length, startpos, endpos, reason);
4179 }
4180 else {
4181 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
4182 goto onError;
4183 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
4184 goto onError;
4185 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
4186 goto onError;
4187 }
4188 return;
4189
4190onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02004191 Py_CLEAR(*exceptionObject);
Victor Stinner554f3f02010-06-16 23:33:54 +00004192}
4193
Steve Dowercc16be82016-09-08 10:35:16 -07004194#ifdef MS_WINDOWS
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004195static int
4196widechar_resize(wchar_t **buf, Py_ssize_t *size, Py_ssize_t newsize)
4197{
4198 if (newsize > *size) {
4199 wchar_t *newbuf = *buf;
4200 if (PyMem_Resize(newbuf, wchar_t, newsize) == NULL) {
4201 PyErr_NoMemory();
4202 return -1;
4203 }
4204 *buf = newbuf;
4205 }
4206 *size = newsize;
4207 return 0;
4208}
4209
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004210/* error handling callback helper:
4211 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00004212 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004213 and adjust various state variables.
4214 return 0 on success, -1 on error
4215*/
4216
Alexander Belopolsky40018472011-02-26 01:02:56 +00004217static int
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004218unicode_decode_call_errorhandler_wchar(
4219 const char *errors, PyObject **errorHandler,
4220 const char *encoding, const char *reason,
4221 const char **input, const char **inend, Py_ssize_t *startinpos,
4222 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004223 wchar_t **buf, Py_ssize_t *bufsize, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004224{
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004225 static const char *argparse = "Un;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004226
4227 PyObject *restuple = NULL;
4228 PyObject *repunicode = NULL;
Victor Stinner596a6c42011-11-09 00:02:18 +01004229 Py_ssize_t outsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004230 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004231 Py_ssize_t requiredsize;
4232 Py_ssize_t newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004233 PyObject *inputobj = NULL;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004234 wchar_t *repwstr;
4235 Py_ssize_t repwlen;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004236
4237 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004238 *errorHandler = PyCodec_LookupError(errors);
4239 if (*errorHandler == NULL)
4240 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004241 }
4242
Victor Stinner554f3f02010-06-16 23:33:54 +00004243 make_decode_exception(exceptionObject,
4244 encoding,
4245 *input, *inend - *input,
4246 *startinpos, *endinpos,
4247 reason);
4248 if (*exceptionObject == NULL)
4249 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004250
Victor Stinnerde4ae3d2016-12-04 22:59:09 +01004251 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004252 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004253 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004254 if (!PyTuple_Check(restuple)) {
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004255 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00004256 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004257 }
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004258 if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00004259 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004260
4261 /* Copy back the bytes variables, which might have been modified by the
4262 callback */
4263 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4264 if (!inputobj)
4265 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004266 *input = PyBytes_AS_STRING(inputobj);
4267 insize = PyBytes_GET_SIZE(inputobj);
4268 *inend = *input + insize;
4269 /* we can DECREF safely, as the exception has another reference,
4270 so the object won't go away. */
4271 Py_DECREF(inputobj);
4272
4273 if (newpos<0)
4274 newpos = insize+newpos;
4275 if (newpos<0 || newpos>insize) {
Victor Stinnera33bce02014-07-04 22:47:46 +02004276 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004277 goto onError;
4278 }
4279
4280 repwstr = PyUnicode_AsUnicodeAndSize(repunicode, &repwlen);
4281 if (repwstr == NULL)
4282 goto onError;
4283 /* need more space? (at least enough for what we
4284 have+the replacement+the rest of the string (starting
4285 at the new input position), so we won't have to check space
4286 when there are no errors in the rest of the string) */
Benjamin Peterson2b76ce62014-09-29 18:50:06 -04004287 requiredsize = *outpos;
4288 if (requiredsize > PY_SSIZE_T_MAX - repwlen)
4289 goto overflow;
4290 requiredsize += repwlen;
4291 if (requiredsize > PY_SSIZE_T_MAX - (insize - newpos))
4292 goto overflow;
4293 requiredsize += insize - newpos;
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004294 outsize = *bufsize;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004295 if (requiredsize > outsize) {
Benjamin Peterson2b76ce62014-09-29 18:50:06 -04004296 if (outsize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*outsize)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004297 requiredsize = 2*outsize;
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004298 if (widechar_resize(buf, bufsize, requiredsize) < 0) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004299 goto onError;
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004300 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004301 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004302 wcsncpy(*buf + *outpos, repwstr, repwlen);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004303 *outpos += repwlen;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004304 *endinpos = newpos;
4305 *inptr = *input + newpos;
4306
4307 /* we made it! */
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004308 Py_DECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004309 return 0;
4310
Benjamin Peterson2b76ce62014-09-29 18:50:06 -04004311 overflow:
4312 PyErr_SetString(PyExc_OverflowError,
4313 "decoded result is too long for a Python string");
4314
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004315 onError:
4316 Py_XDECREF(restuple);
4317 return -1;
4318}
Steve Dowercc16be82016-09-08 10:35:16 -07004319#endif /* MS_WINDOWS */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004320
4321static int
4322unicode_decode_call_errorhandler_writer(
4323 const char *errors, PyObject **errorHandler,
4324 const char *encoding, const char *reason,
4325 const char **input, const char **inend, Py_ssize_t *startinpos,
4326 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
4327 _PyUnicodeWriter *writer /* PyObject **output, Py_ssize_t *outpos */)
4328{
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004329 static const char *argparse = "Un;decoding error handler must return (str, int) tuple";
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004330
4331 PyObject *restuple = NULL;
4332 PyObject *repunicode = NULL;
4333 Py_ssize_t insize;
4334 Py_ssize_t newpos;
Victor Stinner170ca6f2013-04-18 00:25:28 +02004335 Py_ssize_t replen;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004336 Py_ssize_t remain;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004337 PyObject *inputobj = NULL;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004338 int need_to_grow = 0;
4339 const char *new_inptr;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004340
4341 if (*errorHandler == NULL) {
4342 *errorHandler = PyCodec_LookupError(errors);
4343 if (*errorHandler == NULL)
4344 goto onError;
4345 }
4346
4347 make_decode_exception(exceptionObject,
4348 encoding,
4349 *input, *inend - *input,
4350 *startinpos, *endinpos,
4351 reason);
4352 if (*exceptionObject == NULL)
4353 goto onError;
4354
Victor Stinnerde4ae3d2016-12-04 22:59:09 +01004355 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004356 if (restuple == NULL)
4357 goto onError;
4358 if (!PyTuple_Check(restuple)) {
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004359 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004360 goto onError;
4361 }
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004362 if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004363 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004364
4365 /* Copy back the bytes variables, which might have been modified by the
4366 callback */
4367 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4368 if (!inputobj)
4369 goto onError;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004370 remain = *inend - *input - *endinpos;
Christian Heimes72b710a2008-05-26 13:28:38 +00004371 *input = PyBytes_AS_STRING(inputobj);
4372 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004373 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00004374 /* we can DECREF safely, as the exception has another reference,
4375 so the object won't go away. */
4376 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004377
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004378 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004379 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004380 if (newpos<0 || newpos>insize) {
Victor Stinnera33bce02014-07-04 22:47:46 +02004381 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00004382 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004383 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004384
Victor Stinner170ca6f2013-04-18 00:25:28 +02004385 replen = PyUnicode_GET_LENGTH(repunicode);
Serhiy Storchaka7e4b9052015-01-26 01:22:54 +02004386 if (replen > 1) {
4387 writer->min_length += replen - 1;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004388 need_to_grow = 1;
4389 }
4390 new_inptr = *input + newpos;
4391 if (*inend - new_inptr > remain) {
4392 /* We don't know the decoding algorithm here so we make the worst
4393 assumption that one byte decodes to one unicode character.
4394 If unfortunately one byte could decode to more unicode characters,
4395 the decoder may write out-of-bound then. Is it possible for the
4396 algorithms using this function? */
4397 writer->min_length += *inend - new_inptr - remain;
4398 need_to_grow = 1;
4399 }
4400 if (need_to_grow) {
Victor Stinner8f674cc2013-04-17 23:02:17 +02004401 writer->overallocate = 1;
Serhiy Storchakab7e2d672018-02-13 08:27:33 +02004402 if (_PyUnicodeWriter_Prepare(writer, writer->min_length - writer->pos,
Serhiy Storchaka7e4b9052015-01-26 01:22:54 +02004403 PyUnicode_MAX_CHAR_VALUE(repunicode)) == -1)
4404 goto onError;
4405 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004406 if (_PyUnicodeWriter_WriteStr(writer, repunicode) == -1)
Victor Stinner376cfa12013-04-17 23:58:16 +02004407 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004408
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004409 *endinpos = newpos;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004410 *inptr = new_inptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004411
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004412 /* we made it! */
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004413 Py_DECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004414 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004415
Benjamin Peterson29060642009-01-31 22:14:21 +00004416 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004417 Py_XDECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004418 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004419}
4420
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004421/* --- UTF-7 Codec -------------------------------------------------------- */
4422
Antoine Pitrou244651a2009-05-04 18:56:13 +00004423/* See RFC2152 for details. We encode conservatively and decode liberally. */
4424
/* Base-64 helper macros for the UTF-7 codec.  All of them may evaluate
   their argument more than once, so pass side-effect-free expressions. */

/* IS_BASE64: non-zero if c is one of the 64 base-64 alphabet characters
   (A-Z, a-z, 0-9, '+', '/'). */

#define IS_BASE64(c) \
    ((c) == '+' || (c) == '/' ||        \
     ((c) >= '0' && (c) <= '9') ||      \
     ((c) >= 'a' && (c) <= 'z') ||      \
     ((c) >= 'A' && (c) <= 'Z'))

/* FROM_BASE64: the 6-bit value of the base-64 character c.  Only
   meaningful when IS_BASE64(c) holds; any other input yields 63. */

#define FROM_BASE64(c) \
    ((c) == '+' ? 62 :                                  \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :      \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :      \
     ((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' : 63)

/* TO_BASE64: the base-64 character for the bottom 6 bits of n. */

#define TO_BASE64(n) \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZ"       \
     "abcdefghijklmnopqrstuvwxyz"       \
     "0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: non-zero if this byte, met outside a shift sequence in a
 * UTF-7 string, decodes as itself.  Decoding is permissive: every ASCII
 * byte is taken literally except '+', which begins a base64 section. */

#define DECODE_DIRECT(c) \
    ((c) != '+' && (c) <= 127)
4455
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array, indexed by ASCII code,
 * identifies these different sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is only ever read, so it is declared const.
 */

static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: non-zero if character c should be encoded as itself.
 * The answer depends on whether we are encoding set O as itself
 * (directO), and also on whether we are encoding whitespace as itself
 * (directWS).  RFC2152 makes it clear that the answers to these
 * questions vary between applications, so this code needs to be flexible.
 *
 * NUL and anything >= 128 are never direct; the range test also guards
 * the table lookup.  directO and directWS are parenthesized so that an
 * argument such as `a || b` cannot re-associate with the `&&` inside the
 * macro.  c may be evaluated several times. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004501
Alexander Belopolsky40018472011-02-26 01:02:56 +00004502PyObject *
4503PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004504 Py_ssize_t size,
4505 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004506{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004507 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
4508}
4509
Antoine Pitrou244651a2009-05-04 18:56:13 +00004510/* The decoder. The only state we preserve is our read position,
4511 * i.e. how many characters we have consumed. So if we end in the
4512 * middle of a shift sequence we have to back off the read position
4513 * and the output to the beginning of the sequence, otherwise we lose
4514 * all the shift state (seen bits, number of bits seen, high
4515 * surrogate). */
4516
Alexander Belopolsky40018472011-02-26 01:02:56 +00004517PyObject *
4518PyUnicode_DecodeUTF7Stateful(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004519 Py_ssize_t size,
4520 const char *errors,
4521 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004522{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004523 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004524 Py_ssize_t startinpos;
4525 Py_ssize_t endinpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004526 const char *e;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004527 _PyUnicodeWriter writer;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004528 const char *errmsg = "";
4529 int inShift = 0;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004530 Py_ssize_t shiftOutStart;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004531 unsigned int base64bits = 0;
4532 unsigned long base64buffer = 0;
Victor Stinner24729f32011-11-10 20:31:37 +01004533 Py_UCS4 surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004534 PyObject *errorHandler = NULL;
4535 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004536
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004537 if (size == 0) {
4538 if (consumed)
4539 *consumed = 0;
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02004540 _Py_RETURN_UNICODE_EMPTY();
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004541 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004542
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004543 /* Start off assuming it's all ASCII. Widen later as necessary. */
Victor Stinner8f674cc2013-04-17 23:02:17 +02004544 _PyUnicodeWriter_Init(&writer);
4545 writer.min_length = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004546
4547 shiftOutStart = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004548 e = s + size;
4549
4550 while (s < e) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004551 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00004552 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00004553 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004554
Antoine Pitrou244651a2009-05-04 18:56:13 +00004555 if (inShift) { /* in a base-64 section */
4556 if (IS_BASE64(ch)) { /* consume a base-64 character */
4557 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
4558 base64bits += 6;
4559 s++;
4560 if (base64bits >= 16) {
4561 /* we have enough bits for a UTF-16 value */
Victor Stinner24729f32011-11-10 20:31:37 +01004562 Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
Antoine Pitrou244651a2009-05-04 18:56:13 +00004563 base64bits -= 16;
4564 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
Serhiy Storchaka35804e42013-10-19 20:38:19 +03004565 assert(outCh <= 0xffff);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004566 if (surrogate) {
4567 /* expecting a second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004568 if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
4569 Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004570 if (_PyUnicodeWriter_WriteCharInline(&writer, ch2) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004571 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004572 surrogate = 0;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01004573 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004574 }
4575 else {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004576 if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
Antoine Pitrou78edf752011-11-15 01:44:16 +01004577 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004578 surrogate = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004579 }
4580 }
Victor Stinner551ac952011-11-29 22:58:13 +01004581 if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004582 /* first surrogate */
4583 surrogate = outCh;
4584 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004585 else {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004586 if (_PyUnicodeWriter_WriteCharInline(&writer, outCh) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004587 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004588 }
4589 }
4590 }
4591 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004592 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004593 if (base64bits > 0) { /* left-over bits */
4594 if (base64bits >= 6) {
4595 /* We've seen at least one base-64 character */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004596 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004597 errmsg = "partial character in shift sequence";
4598 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004599 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004600 else {
4601 /* Some bits remain; they should be zero */
4602 if (base64buffer != 0) {
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004603 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004604 errmsg = "non-zero padding bits in shift sequence";
4605 goto utf7Error;
4606 }
4607 }
4608 }
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004609 if (surrogate && DECODE_DIRECT(ch)) {
4610 if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
4611 goto onError;
4612 }
4613 surrogate = 0;
4614 if (ch == '-') {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004615 /* '-' is absorbed; other terminating
4616 characters are preserved */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004617 s++;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004618 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004619 }
4620 }
4621 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004622 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004623 s++; /* consume '+' */
4624 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004625 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004626 if (_PyUnicodeWriter_WriteCharInline(&writer, '+') < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004627 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004628 }
Zackery Spytze349bf22018-08-18 22:43:38 -06004629 else if (s < e && !IS_BASE64(*s)) {
4630 s++;
4631 errmsg = "ill-formed sequence";
4632 goto utf7Error;
4633 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004634 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004635 inShift = 1;
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004636 surrogate = 0;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004637 shiftOutStart = writer.pos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004638 base64bits = 0;
Serhiy Storchaka35804e42013-10-19 20:38:19 +03004639 base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004640 }
4641 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004642 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004643 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004644 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004645 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004646 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004647 else {
4648 startinpos = s-starts;
4649 s++;
4650 errmsg = "unexpected special character";
4651 goto utf7Error;
4652 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004653 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004654utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004655 endinpos = s-starts;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004656 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00004657 errors, &errorHandler,
4658 "utf7", errmsg,
4659 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004660 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00004661 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004662 }
4663
Antoine Pitrou244651a2009-05-04 18:56:13 +00004664 /* end of string */
4665
4666 if (inShift && !consumed) { /* in shift sequence, no more to follow */
4667 /* if we're in an inconsistent state, that's an error */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004668 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004669 if (surrogate ||
4670 (base64bits >= 6) ||
4671 (base64bits > 0 && base64buffer != 0)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004672 endinpos = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004673 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrou244651a2009-05-04 18:56:13 +00004674 errors, &errorHandler,
4675 "utf7", "unterminated shift sequence",
4676 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004677 &writer))
Antoine Pitrou244651a2009-05-04 18:56:13 +00004678 goto onError;
4679 if (s < e)
4680 goto restart;
4681 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004682 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004683
4684 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004685 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004686 if (inShift) {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004687 *consumed = startinpos;
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004688 if (writer.pos != shiftOutStart && writer.maxchar > 127) {
Serhiy Storchaka016a3f32014-02-08 14:01:29 +02004689 PyObject *result = PyUnicode_FromKindAndData(
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004690 writer.kind, writer.data, shiftOutStart);
4691 Py_XDECREF(errorHandler);
4692 Py_XDECREF(exc);
4693 _PyUnicodeWriter_Dealloc(&writer);
4694 return result;
Serhiy Storchaka016a3f32014-02-08 14:01:29 +02004695 }
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004696 writer.pos = shiftOutStart; /* back off output */
Antoine Pitrou244651a2009-05-04 18:56:13 +00004697 }
4698 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004699 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004700 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004701 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004702
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004703 Py_XDECREF(errorHandler);
4704 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004705 return _PyUnicodeWriter_Finish(&writer);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004706
Benjamin Peterson29060642009-01-31 22:14:21 +00004707 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004708 Py_XDECREF(errorHandler);
4709 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004710 _PyUnicodeWriter_Dealloc(&writer);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004711 return NULL;
4712}
4713
4714
Alexander Belopolsky40018472011-02-26 01:02:56 +00004715PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004716_PyUnicode_EncodeUTF7(PyObject *str,
4717 int base64SetO,
4718 int base64WhiteSpace,
4719 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004720{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004721 int kind;
4722 void *data;
4723 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004724 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004725 int inShift = 0;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004726 Py_ssize_t i;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004727 unsigned int base64bits = 0;
4728 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004729 char * out;
4730 char * start;
4731
Benjamin Petersonbac79492012-01-14 13:34:47 -05004732 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004733 return NULL;
4734 kind = PyUnicode_KIND(str);
4735 data = PyUnicode_DATA(str);
4736 len = PyUnicode_GET_LENGTH(str);
4737
4738 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004739 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004740
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004741 /* It might be possible to tighten this worst case */
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004742 if (len > PY_SSIZE_T_MAX / 8)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004743 return PyErr_NoMemory();
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004744 v = PyBytes_FromStringAndSize(NULL, len * 8);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004745 if (v == NULL)
4746 return NULL;
4747
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004748 start = out = PyBytes_AS_STRING(v);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004749 for (i = 0; i < len; ++i) {
Victor Stinner0e368262011-11-10 20:12:49 +01004750 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004751
Antoine Pitrou244651a2009-05-04 18:56:13 +00004752 if (inShift) {
4753 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4754 /* shifting out */
4755 if (base64bits) { /* output remaining bits */
4756 *out++ = TO_BASE64(base64buffer << (6-base64bits));
4757 base64buffer = 0;
4758 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004759 }
4760 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004761 /* Characters not in the BASE64 set implicitly unshift the sequence
4762 so no '-' is required, except if the character is itself a '-' */
4763 if (IS_BASE64(ch) || ch == '-') {
4764 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004765 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004766 *out++ = (char) ch;
4767 }
4768 else {
4769 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00004770 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004771 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004772 else { /* not in a shift sequence */
4773 if (ch == '+') {
4774 *out++ = '+';
4775 *out++ = '-';
4776 }
4777 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4778 *out++ = (char) ch;
4779 }
4780 else {
4781 *out++ = '+';
4782 inShift = 1;
4783 goto encode_char;
4784 }
4785 }
4786 continue;
4787encode_char:
Antoine Pitrou244651a2009-05-04 18:56:13 +00004788 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01004789 assert(ch <= MAX_UNICODE);
Victor Stinner0d3721d2011-11-22 03:27:53 +01004790
Antoine Pitrou244651a2009-05-04 18:56:13 +00004791 /* code first surrogate */
4792 base64bits += 16;
Victor Stinner76df43d2012-10-30 01:42:39 +01004793 base64buffer = (base64buffer << 16) | Py_UNICODE_HIGH_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004794 while (base64bits >= 6) {
4795 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4796 base64bits -= 6;
4797 }
4798 /* prepare second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004799 ch = Py_UNICODE_LOW_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004800 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004801 base64bits += 16;
4802 base64buffer = (base64buffer << 16) | ch;
4803 while (base64bits >= 6) {
4804 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4805 base64bits -= 6;
4806 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00004807 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004808 if (base64bits)
4809 *out++= TO_BASE64(base64buffer << (6-base64bits) );
4810 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004811 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004812 if (_PyBytes_Resize(&v, out - start) < 0)
4813 return NULL;
4814 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004815}
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004816PyObject *
4817PyUnicode_EncodeUTF7(const Py_UNICODE *s,
4818 Py_ssize_t size,
4819 int base64SetO,
4820 int base64WhiteSpace,
4821 const char *errors)
4822{
4823 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02004824 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004825 if (tmp == NULL)
4826 return NULL;
Victor Stinner0e368262011-11-10 20:12:49 +01004827 result = _PyUnicode_EncodeUTF7(tmp, base64SetO,
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004828 base64WhiteSpace, errors);
4829 Py_DECREF(tmp);
4830 return result;
4831}
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004832
Antoine Pitrou244651a2009-05-04 18:56:13 +00004833#undef IS_BASE64
4834#undef FROM_BASE64
4835#undef TO_BASE64
4836#undef DECODE_DIRECT
4837#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004838
Guido van Rossumd57fd912000-03-10 22:53:23 +00004839/* --- UTF-8 Codec -------------------------------------------------------- */
4840
Alexander Belopolsky40018472011-02-26 01:02:56 +00004841PyObject *
4842PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004843 Py_ssize_t size,
4844 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004845{
Walter Dörwald69652032004-09-07 20:24:22 +00004846 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4847}
4848
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004849#include "stringlib/asciilib.h"
4850#include "stringlib/codecs.h"
4851#include "stringlib/undef.h"
4852
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004853#include "stringlib/ucs1lib.h"
4854#include "stringlib/codecs.h"
4855#include "stringlib/undef.h"
4856
4857#include "stringlib/ucs2lib.h"
4858#include "stringlib/codecs.h"
4859#include "stringlib/undef.h"
4860
4861#include "stringlib/ucs4lib.h"
4862#include "stringlib/codecs.h"
4863#include "stringlib/undef.h"
4864
Antoine Pitrouab868312009-01-10 15:40:25 +00004865/* Mask to quickly check whether a C 'long' contains a
4866 non-ASCII, UTF8-encoded char. */
4867#if (SIZEOF_LONG == 8)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004868# define ASCII_CHAR_MASK 0x8080808080808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004869#elif (SIZEOF_LONG == 4)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004870# define ASCII_CHAR_MASK 0x80808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004871#else
4872# error C 'long' size should be either 4 or 8!
4873#endif
4874
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004875static Py_ssize_t
4876ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004877{
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004878 const char *p = start;
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004879 const char *aligned_end = (const char *) _Py_ALIGN_DOWN(end, SIZEOF_LONG);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004880
Antoine Pitrou8b0e9842013-05-11 15:58:34 +02004881 /*
4882 * Issue #17237: m68k is a bit different from most architectures in
4883 * that objects do not use "natural alignment" - for example, int and
4884 * long are only aligned at 2-byte boundaries. Therefore the assert()
4885 * won't work; also, tests have shown that skipping the "optimised
4886 * version" will even speed up m68k.
4887 */
4888#if !defined(__m68k__)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004889#if SIZEOF_LONG <= SIZEOF_VOID_P
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004890 assert(_Py_IS_ALIGNED(dest, SIZEOF_LONG));
4891 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004892 /* Fast path, see in STRINGLIB(utf8_decode) for
4893 an explanation. */
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02004894 /* Help allocation */
4895 const char *_p = p;
4896 Py_UCS1 * q = dest;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004897 while (_p < aligned_end) {
4898 unsigned long value = *(const unsigned long *) _p;
4899 if (value & ASCII_CHAR_MASK)
Benjamin Peterson29060642009-01-31 22:14:21 +00004900 break;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004901 *((unsigned long *)q) = value;
4902 _p += SIZEOF_LONG;
4903 q += SIZEOF_LONG;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004904 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004905 p = _p;
4906 while (p < end) {
4907 if ((unsigned char)*p & 0x80)
4908 break;
4909 *q++ = *p++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004910 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004911 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004912 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004913#endif
Antoine Pitrou8b0e9842013-05-11 15:58:34 +02004914#endif
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004915 while (p < end) {
4916 /* Fast path, see in STRINGLIB(utf8_decode) in stringlib/codecs.h
4917 for an explanation. */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004918 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02004919 /* Help allocation */
4920 const char *_p = p;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004921 while (_p < aligned_end) {
4922 unsigned long value = *(unsigned long *) _p;
4923 if (value & ASCII_CHAR_MASK)
4924 break;
4925 _p += SIZEOF_LONG;
4926 }
4927 p = _p;
4928 if (_p == end)
4929 break;
4930 }
4931 if ((unsigned char)*p & 0x80)
4932 break;
4933 ++p;
4934 }
4935 memcpy(dest, start, p - start);
4936 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004937}
Antoine Pitrouab868312009-01-10 15:40:25 +00004938
Victor Stinner709d23d2019-05-02 14:56:30 -04004939static PyObject *
4940unicode_decode_utf8(const char *s, Py_ssize_t size,
4941 _Py_error_handler error_handler, const char *errors,
4942 Py_ssize_t *consumed)
Victor Stinner785938e2011-12-11 20:09:03 +01004943{
Victor Stinner785938e2011-12-11 20:09:03 +01004944 if (size == 0) {
4945 if (consumed)
4946 *consumed = 0;
Serhiy Storchaka678db842013-01-26 12:16:36 +02004947 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner785938e2011-12-11 20:09:03 +01004948 }
4949
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004950 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
4951 if (size == 1 && (unsigned char)s[0] < 128) {
Victor Stinner785938e2011-12-11 20:09:03 +01004952 if (consumed)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004953 *consumed = 1;
4954 return get_latin1_char((unsigned char)s[0]);
Victor Stinner785938e2011-12-11 20:09:03 +01004955 }
4956
Inada Naoki770847a2019-06-24 12:30:24 +09004957 const char *starts = s;
4958 const char *end = s + size;
Victor Stinner785938e2011-12-11 20:09:03 +01004959
Inada Naoki770847a2019-06-24 12:30:24 +09004960 // fast path: try ASCII string.
4961 PyObject *u = PyUnicode_New(size, 127);
4962 if (u == NULL) {
4963 return NULL;
4964 }
4965 s += ascii_decode(s, end, PyUnicode_DATA(u));
4966 if (s == end) {
4967 return u;
4968 }
4969
4970 // Use _PyUnicodeWriter after fast path is failed.
4971 _PyUnicodeWriter writer;
4972 _PyUnicodeWriter_InitWithBuffer(&writer, u);
4973 writer.pos = s - starts;
4974
4975 Py_ssize_t startinpos, endinpos;
4976 const char *errmsg = "";
4977 PyObject *error_handler_obj = NULL;
4978 PyObject *exc = NULL;
4979
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004980 while (s < end) {
4981 Py_UCS4 ch;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004982 int kind = writer.kind;
Victor Stinner1d65d912015-10-05 13:43:50 +02004983
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004984 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004985 if (PyUnicode_IS_ASCII(writer.buffer))
4986 ch = asciilib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004987 else
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004988 ch = ucs1lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004989 } else if (kind == PyUnicode_2BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004990 ch = ucs2lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004991 } else {
4992 assert(kind == PyUnicode_4BYTE_KIND);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004993 ch = ucs4lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004994 }
4995
4996 switch (ch) {
4997 case 0:
4998 if (s == end || consumed)
4999 goto End;
5000 errmsg = "unexpected end of data";
5001 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02005002 endinpos = end - starts;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005003 break;
5004 case 1:
5005 errmsg = "invalid start byte";
5006 startinpos = s - starts;
5007 endinpos = startinpos + 1;
5008 break;
5009 case 2:
Serhiy Storchaka894263b2019-06-25 11:54:18 +03005010 if (consumed && (unsigned char)s[0] == 0xED && end - s == 2
5011 && (unsigned char)s[1] >= 0xA0 && (unsigned char)s[1] <= 0xBF)
5012 {
5013 /* Truncated surrogate code in range D800-DFFF */
Serhiy Storchaka7a465cb2019-03-30 08:23:38 +02005014 goto End;
5015 }
Serhiy Storchaka894263b2019-06-25 11:54:18 +03005016 /* fall through */
5017 case 3:
5018 case 4:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005019 errmsg = "invalid continuation byte";
5020 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02005021 endinpos = startinpos + ch - 1;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005022 break;
5023 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005024 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005025 goto onError;
5026 continue;
5027 }
5028
Victor Stinner1d65d912015-10-05 13:43:50 +02005029 if (error_handler == _Py_ERROR_UNKNOWN)
Victor Stinner3d4226a2018-08-29 22:21:32 +02005030 error_handler = _Py_GetErrorHandler(errors);
Victor Stinner1d65d912015-10-05 13:43:50 +02005031
5032 switch (error_handler) {
5033 case _Py_ERROR_IGNORE:
5034 s += (endinpos - startinpos);
5035 break;
5036
5037 case _Py_ERROR_REPLACE:
5038 if (_PyUnicodeWriter_WriteCharInline(&writer, 0xfffd) < 0)
5039 goto onError;
5040 s += (endinpos - startinpos);
5041 break;
5042
5043 case _Py_ERROR_SURROGATEESCAPE:
Victor Stinner74e8fac2015-10-05 13:49:26 +02005044 {
5045 Py_ssize_t i;
5046
Victor Stinner1d65d912015-10-05 13:43:50 +02005047 if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
5048 goto onError;
Victor Stinner74e8fac2015-10-05 13:49:26 +02005049 for (i=startinpos; i<endinpos; i++) {
Victor Stinner1d65d912015-10-05 13:43:50 +02005050 ch = (Py_UCS4)(unsigned char)(starts[i]);
5051 PyUnicode_WRITE(writer.kind, writer.data, writer.pos,
5052 ch + 0xdc00);
5053 writer.pos++;
5054 }
5055 s += (endinpos - startinpos);
5056 break;
Victor Stinner74e8fac2015-10-05 13:49:26 +02005057 }
Victor Stinner1d65d912015-10-05 13:43:50 +02005058
5059 default:
5060 if (unicode_decode_call_errorhandler_writer(
5061 errors, &error_handler_obj,
5062 "utf-8", errmsg,
5063 &starts, &end, &startinpos, &endinpos, &exc, &s,
5064 &writer))
5065 goto onError;
5066 }
Victor Stinner785938e2011-12-11 20:09:03 +01005067 }
5068
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005069End:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005070 if (consumed)
5071 *consumed = s - starts;
5072
Victor Stinner1d65d912015-10-05 13:43:50 +02005073 Py_XDECREF(error_handler_obj);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005074 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005075 return _PyUnicodeWriter_Finish(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005076
5077onError:
Victor Stinner1d65d912015-10-05 13:43:50 +02005078 Py_XDECREF(error_handler_obj);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005079 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005080 _PyUnicodeWriter_Dealloc(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005081 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01005082}
5083
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005084
Victor Stinner709d23d2019-05-02 14:56:30 -04005085PyObject *
5086PyUnicode_DecodeUTF8Stateful(const char *s,
5087 Py_ssize_t size,
5088 const char *errors,
5089 Py_ssize_t *consumed)
5090{
5091 return unicode_decode_utf8(s, size, _Py_ERROR_UNKNOWN, errors, consumed);
5092}
5093
5094
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005095/* UTF-8 decoder: use surrogateescape error handler if 'surrogateescape' is
5096 non-zero, use strict error handler otherwise.
Victor Stinner0d92c4f2012-11-12 23:32:21 +01005097
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005098 On success, write a pointer to a newly allocated wide character string into
5099 *wstr (use PyMem_RawFree() to free the memory) and write the output length
5100 (in number of wchar_t units) into *wlen (if wlen is set).
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005101
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005102 On memory allocation failure, return -1.
5103
5104 On decoding error (if surrogateescape is zero), return -2. If wlen is
5105 non-NULL, write the start of the illegal byte sequence into *wlen. If reason
5106 is not NULL, write the decoding error message into *reason. */
5107int
5108_Py_DecodeUTF8Ex(const char *s, Py_ssize_t size, wchar_t **wstr, size_t *wlen,
Victor Stinner3d4226a2018-08-29 22:21:32 +02005109 const char **reason, _Py_error_handler errors)
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005110{
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005111 const char *orig_s = s;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005112 const char *e;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005113 wchar_t *unicode;
5114 Py_ssize_t outpos;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005115
Victor Stinner3d4226a2018-08-29 22:21:32 +02005116 int surrogateescape = 0;
5117 int surrogatepass = 0;
5118 switch (errors)
5119 {
5120 case _Py_ERROR_STRICT:
5121 break;
5122 case _Py_ERROR_SURROGATEESCAPE:
5123 surrogateescape = 1;
5124 break;
5125 case _Py_ERROR_SURROGATEPASS:
5126 surrogatepass = 1;
5127 break;
5128 default:
5129 return -3;
5130 }
5131
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005132 /* Note: size will always be longer than the resulting Unicode
5133 character count */
Victor Stinner91106cd2017-12-13 12:29:09 +01005134 if (PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) < (size + 1)) {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005135 return -1;
Victor Stinner91106cd2017-12-13 12:29:09 +01005136 }
5137
Victor Stinner6f8eeee2013-07-07 22:57:45 +02005138 unicode = PyMem_RawMalloc((size + 1) * sizeof(wchar_t));
Victor Stinner91106cd2017-12-13 12:29:09 +01005139 if (!unicode) {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005140 return -1;
Victor Stinner91106cd2017-12-13 12:29:09 +01005141 }
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005142
5143 /* Unpack UTF-8 encoded data */
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005144 e = s + size;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005145 outpos = 0;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005146 while (s < e) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005147 Py_UCS4 ch;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005148#if SIZEOF_WCHAR_T == 4
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005149 ch = ucs4lib_utf8_decode(&s, e, (Py_UCS4 *)unicode, &outpos);
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005150#else
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005151 ch = ucs2lib_utf8_decode(&s, e, (Py_UCS2 *)unicode, &outpos);
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005152#endif
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005153 if (ch > 0xFF) {
5154#if SIZEOF_WCHAR_T == 4
Barry Warsawb2e57942017-09-14 18:13:16 -07005155 Py_UNREACHABLE();
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005156#else
Serhiy Storchakab6266432016-11-12 14:28:06 +02005157 assert(ch > 0xFFFF && ch <= MAX_UNICODE);
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005158 /* write a surrogate pair */
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005159 unicode[outpos++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
5160 unicode[outpos++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
5161#endif
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005162 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005163 else {
Victor Stinner3d4226a2018-08-29 22:21:32 +02005164 if (!ch && s == e) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005165 break;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005166 }
Victor Stinner3d4226a2018-08-29 22:21:32 +02005167
5168 if (surrogateescape) {
5169 unicode[outpos++] = 0xDC00 + (unsigned char)*s++;
5170 }
5171 else {
5172 /* Is it a valid three-byte code? */
5173 if (surrogatepass
5174 && (e - s) >= 3
5175 && (s[0] & 0xf0) == 0xe0
5176 && (s[1] & 0xc0) == 0x80
5177 && (s[2] & 0xc0) == 0x80)
5178 {
5179 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
5180 s += 3;
5181 unicode[outpos++] = ch;
5182 }
5183 else {
5184 PyMem_RawFree(unicode );
5185 if (reason != NULL) {
5186 switch (ch) {
5187 case 0:
5188 *reason = "unexpected end of data";
5189 break;
5190 case 1:
5191 *reason = "invalid start byte";
5192 break;
5193 /* 2, 3, 4 */
5194 default:
5195 *reason = "invalid continuation byte";
5196 break;
5197 }
5198 }
5199 if (wlen != NULL) {
5200 *wlen = s - orig_s;
5201 }
5202 return -2;
5203 }
5204 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005205 }
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005206 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005207 unicode[outpos] = L'\0';
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005208 if (wlen) {
5209 *wlen = outpos;
Victor Stinner91106cd2017-12-13 12:29:09 +01005210 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005211 *wstr = unicode;
5212 return 0;
5213}
5214
Victor Stinner5f9cf232019-03-19 01:46:25 +01005215
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005216wchar_t*
Victor Stinner5f9cf232019-03-19 01:46:25 +01005217_Py_DecodeUTF8_surrogateescape(const char *arg, Py_ssize_t arglen,
5218 size_t *wlen)
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005219{
5220 wchar_t *wstr;
Victor Stinner5f9cf232019-03-19 01:46:25 +01005221 int res = _Py_DecodeUTF8Ex(arg, arglen,
5222 &wstr, wlen,
5223 NULL, _Py_ERROR_SURROGATEESCAPE);
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005224 if (res != 0) {
Victor Stinner5f9cf232019-03-19 01:46:25 +01005225 /* _Py_DecodeUTF8Ex() must support _Py_ERROR_SURROGATEESCAPE */
5226 assert(res != -3);
5227 if (wlen) {
5228 *wlen = (size_t)res;
5229 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005230 return NULL;
5231 }
5232 return wstr;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005233}
5234
Antoine Pitrouab868312009-01-10 15:40:25 +00005235
Victor Stinnere47e6982017-12-21 15:45:16 +01005236/* UTF-8 encoder using the surrogateescape error handler .
5237
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005238 On success, return 0 and write the newly allocated character string (use
5239 PyMem_Free() to free the memory) into *str.
Victor Stinnere47e6982017-12-21 15:45:16 +01005240
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005241 On encoding failure, return -2 and write the position of the invalid
5242 surrogate character into *error_pos (if error_pos is set) and the decoding
5243 error message into *reason (if reason is set).
Victor Stinnere47e6982017-12-21 15:45:16 +01005244
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005245 On memory allocation failure, return -1. */
5246int
5247_Py_EncodeUTF8Ex(const wchar_t *text, char **str, size_t *error_pos,
Victor Stinner3d4226a2018-08-29 22:21:32 +02005248 const char **reason, int raw_malloc, _Py_error_handler errors)
Victor Stinnere47e6982017-12-21 15:45:16 +01005249{
5250 const Py_ssize_t max_char_size = 4;
5251 Py_ssize_t len = wcslen(text);
5252
5253 assert(len >= 0);
5254
Victor Stinner3d4226a2018-08-29 22:21:32 +02005255 int surrogateescape = 0;
5256 int surrogatepass = 0;
5257 switch (errors)
5258 {
5259 case _Py_ERROR_STRICT:
5260 break;
5261 case _Py_ERROR_SURROGATEESCAPE:
5262 surrogateescape = 1;
5263 break;
5264 case _Py_ERROR_SURROGATEPASS:
5265 surrogatepass = 1;
5266 break;
5267 default:
5268 return -3;
5269 }
5270
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005271 if (len > PY_SSIZE_T_MAX / max_char_size - 1) {
5272 return -1;
5273 }
Victor Stinnere47e6982017-12-21 15:45:16 +01005274 char *bytes;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005275 if (raw_malloc) {
5276 bytes = PyMem_RawMalloc((len + 1) * max_char_size);
Victor Stinnere47e6982017-12-21 15:45:16 +01005277 }
5278 else {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005279 bytes = PyMem_Malloc((len + 1) * max_char_size);
Victor Stinnere47e6982017-12-21 15:45:16 +01005280 }
5281 if (bytes == NULL) {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005282 return -1;
Victor Stinnere47e6982017-12-21 15:45:16 +01005283 }
5284
5285 char *p = bytes;
5286 Py_ssize_t i;
Victor Stinner3d4226a2018-08-29 22:21:32 +02005287 for (i = 0; i < len; ) {
5288 Py_ssize_t ch_pos = i;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005289 Py_UCS4 ch = text[i];
Victor Stinner3d4226a2018-08-29 22:21:32 +02005290 i++;
5291#if Py_UNICODE_SIZE == 2
5292 if (Py_UNICODE_IS_HIGH_SURROGATE(ch)
5293 && i < len
5294 && Py_UNICODE_IS_LOW_SURROGATE(text[i]))
5295 {
5296 ch = Py_UNICODE_JOIN_SURROGATES(ch, text[i]);
5297 i++;
5298 }
5299#endif
Victor Stinnere47e6982017-12-21 15:45:16 +01005300
5301 if (ch < 0x80) {
5302 /* Encode ASCII */
5303 *p++ = (char) ch;
5304
5305 }
5306 else if (ch < 0x0800) {
5307 /* Encode Latin-1 */
5308 *p++ = (char)(0xc0 | (ch >> 6));
5309 *p++ = (char)(0x80 | (ch & 0x3f));
5310 }
Victor Stinner3d4226a2018-08-29 22:21:32 +02005311 else if (Py_UNICODE_IS_SURROGATE(ch) && !surrogatepass) {
Victor Stinnere47e6982017-12-21 15:45:16 +01005312 /* surrogateescape error handler */
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005313 if (!surrogateescape || !(0xDC80 <= ch && ch <= 0xDCFF)) {
Victor Stinnere47e6982017-12-21 15:45:16 +01005314 if (error_pos != NULL) {
Victor Stinner3d4226a2018-08-29 22:21:32 +02005315 *error_pos = (size_t)ch_pos;
Victor Stinnere47e6982017-12-21 15:45:16 +01005316 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005317 if (reason != NULL) {
5318 *reason = "encoding error";
5319 }
5320 if (raw_malloc) {
5321 PyMem_RawFree(bytes);
5322 }
5323 else {
5324 PyMem_Free(bytes);
5325 }
5326 return -2;
Victor Stinnere47e6982017-12-21 15:45:16 +01005327 }
5328 *p++ = (char)(ch & 0xff);
5329 }
5330 else if (ch < 0x10000) {
5331 *p++ = (char)(0xe0 | (ch >> 12));
5332 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
5333 *p++ = (char)(0x80 | (ch & 0x3f));
5334 }
5335 else { /* ch >= 0x10000 */
5336 assert(ch <= MAX_UNICODE);
5337 /* Encode UCS4 Unicode ordinals */
5338 *p++ = (char)(0xf0 | (ch >> 18));
5339 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
5340 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
5341 *p++ = (char)(0x80 | (ch & 0x3f));
5342 }
5343 }
5344 *p++ = '\0';
5345
5346 size_t final_size = (p - bytes);
Victor Stinner9dd76202017-12-21 16:20:32 +01005347 char *bytes2;
5348 if (raw_malloc) {
5349 bytes2 = PyMem_RawRealloc(bytes, final_size);
5350 }
5351 else {
5352 bytes2 = PyMem_Realloc(bytes, final_size);
5353 }
Victor Stinnere47e6982017-12-21 15:45:16 +01005354 if (bytes2 == NULL) {
5355 if (error_pos != NULL) {
5356 *error_pos = (size_t)-1;
5357 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005358 if (raw_malloc) {
5359 PyMem_RawFree(bytes);
5360 }
5361 else {
5362 PyMem_Free(bytes);
5363 }
5364 return -1;
Victor Stinnere47e6982017-12-21 15:45:16 +01005365 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005366 *str = bytes2;
5367 return 0;
Victor Stinnere47e6982017-12-21 15:45:16 +01005368}
5369
5370
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005371/* Primary internal function which creates utf8 encoded bytes objects.
5372
5373 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00005374 and allocate exactly as much space needed at the end. Else allocate the
5375 maximum possible needed (4 result bytes per Unicode character), and return
5376 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00005377*/
Victor Stinner709d23d2019-05-02 14:56:30 -04005378static PyObject *
5379unicode_encode_utf8(PyObject *unicode, _Py_error_handler error_handler,
5380 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005381{
Victor Stinner6099a032011-12-18 14:22:26 +01005382 enum PyUnicode_Kind kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005383 void *data;
5384 Py_ssize_t size;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00005385
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005386 if (!PyUnicode_Check(unicode)) {
5387 PyErr_BadArgument();
5388 return NULL;
5389 }
5390
5391 if (PyUnicode_READY(unicode) == -1)
5392 return NULL;
5393
Victor Stinnere90fe6a2011-10-01 16:48:13 +02005394 if (PyUnicode_UTF8(unicode))
5395 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
5396 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005397
5398 kind = PyUnicode_KIND(unicode);
5399 data = PyUnicode_DATA(unicode);
5400 size = PyUnicode_GET_LENGTH(unicode);
5401
Benjamin Petersonead6b532011-12-20 17:23:42 -06005402 switch (kind) {
Victor Stinner6099a032011-12-18 14:22:26 +01005403 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07005404 Py_UNREACHABLE();
Victor Stinner6099a032011-12-18 14:22:26 +01005405 case PyUnicode_1BYTE_KIND:
5406 /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
5407 assert(!PyUnicode_IS_ASCII(unicode));
Victor Stinner709d23d2019-05-02 14:56:30 -04005408 return ucs1lib_utf8_encoder(unicode, data, size, error_handler, errors);
Victor Stinner6099a032011-12-18 14:22:26 +01005409 case PyUnicode_2BYTE_KIND:
Victor Stinner709d23d2019-05-02 14:56:30 -04005410 return ucs2lib_utf8_encoder(unicode, data, size, error_handler, errors);
Victor Stinner6099a032011-12-18 14:22:26 +01005411 case PyUnicode_4BYTE_KIND:
Victor Stinner709d23d2019-05-02 14:56:30 -04005412 return ucs4lib_utf8_encoder(unicode, data, size, error_handler, errors);
Tim Peters602f7402002-04-27 18:03:26 +00005413 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005414}
5415
Alexander Belopolsky40018472011-02-26 01:02:56 +00005416PyObject *
Victor Stinner709d23d2019-05-02 14:56:30 -04005417_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
5418{
5419 return unicode_encode_utf8(unicode, _Py_ERROR_UNKNOWN, errors);
5420}
5421
5422
5423PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005424PyUnicode_EncodeUTF8(const Py_UNICODE *s,
5425 Py_ssize_t size,
5426 const char *errors)
5427{
5428 PyObject *v, *unicode;
5429
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02005430 unicode = PyUnicode_FromWideChar(s, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005431 if (unicode == NULL)
5432 return NULL;
5433 v = _PyUnicode_AsUTF8String(unicode, errors);
5434 Py_DECREF(unicode);
5435 return v;
5436}
5437
5438PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005439PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005440{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005441 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005442}
5443
Walter Dörwald41980ca2007-08-16 21:55:45 +00005444/* --- UTF-32 Codec ------------------------------------------------------- */
5445
5446PyObject *
5447PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005448 Py_ssize_t size,
5449 const char *errors,
5450 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005451{
5452 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
5453}
5454
5455PyObject *
5456PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005457 Py_ssize_t size,
5458 const char *errors,
5459 int *byteorder,
5460 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005461{
5462 const char *starts = s;
5463 Py_ssize_t startinpos;
5464 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005465 _PyUnicodeWriter writer;
Mark Dickinson7db923c2010-06-12 09:10:14 +00005466 const unsigned char *q, *e;
Victor Stinnere64322e2012-10-30 23:12:47 +01005467 int le, bo = 0; /* assume native ordering by default */
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005468 const char *encoding;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005469 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00005470 PyObject *errorHandler = NULL;
5471 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00005472
Walter Dörwald41980ca2007-08-16 21:55:45 +00005473 q = (unsigned char *)s;
5474 e = q + size;
5475
5476 if (byteorder)
5477 bo = *byteorder;
5478
5479 /* Check for BOM marks (U+FEFF) in the input and adjust current
5480 byte order setting accordingly. In native mode, the leading BOM
5481 mark is skipped, in all other modes, it is copied to the output
5482 stream as-is (giving a ZWNBSP character). */
Victor Stinnere64322e2012-10-30 23:12:47 +01005483 if (bo == 0 && size >= 4) {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005484 Py_UCS4 bom = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
Victor Stinnere64322e2012-10-30 23:12:47 +01005485 if (bom == 0x0000FEFF) {
5486 bo = -1;
5487 q += 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00005488 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005489 else if (bom == 0xFFFE0000) {
5490 bo = 1;
5491 q += 4;
5492 }
5493 if (byteorder)
5494 *byteorder = bo;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005495 }
5496
Victor Stinnere64322e2012-10-30 23:12:47 +01005497 if (q == e) {
5498 if (consumed)
5499 *consumed = size;
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02005500 _Py_RETURN_UNICODE_EMPTY();
Walter Dörwald41980ca2007-08-16 21:55:45 +00005501 }
5502
Victor Stinnere64322e2012-10-30 23:12:47 +01005503#ifdef WORDS_BIGENDIAN
5504 le = bo < 0;
5505#else
5506 le = bo <= 0;
5507#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005508 encoding = le ? "utf-32-le" : "utf-32-be";
Victor Stinnere64322e2012-10-30 23:12:47 +01005509
Victor Stinner8f674cc2013-04-17 23:02:17 +02005510 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02005511 writer.min_length = (e - q + 3) / 4;
5512 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005513 goto onError;
Victor Stinnere64322e2012-10-30 23:12:47 +01005514
Victor Stinnere64322e2012-10-30 23:12:47 +01005515 while (1) {
5516 Py_UCS4 ch = 0;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005517 Py_UCS4 maxch = PyUnicode_MAX_CHAR_VALUE(writer.buffer);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005518
Victor Stinnere64322e2012-10-30 23:12:47 +01005519 if (e - q >= 4) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005520 enum PyUnicode_Kind kind = writer.kind;
5521 void *data = writer.data;
Victor Stinnere64322e2012-10-30 23:12:47 +01005522 const unsigned char *last = e - 4;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005523 Py_ssize_t pos = writer.pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01005524 if (le) {
5525 do {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005526 ch = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
Victor Stinnere64322e2012-10-30 23:12:47 +01005527 if (ch > maxch)
5528 break;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005529 if (kind != PyUnicode_1BYTE_KIND &&
5530 Py_UNICODE_IS_SURROGATE(ch))
5531 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005532 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01005533 q += 4;
5534 } while (q <= last);
5535 }
5536 else {
5537 do {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005538 ch = ((unsigned int)q[0] << 24) | (q[1] << 16) | (q[2] << 8) | q[3];
Victor Stinnere64322e2012-10-30 23:12:47 +01005539 if (ch > maxch)
5540 break;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005541 if (kind != PyUnicode_1BYTE_KIND &&
5542 Py_UNICODE_IS_SURROGATE(ch))
5543 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005544 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01005545 q += 4;
5546 } while (q <= last);
5547 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005548 writer.pos = pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01005549 }
5550
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005551 if (Py_UNICODE_IS_SURROGATE(ch)) {
Serhiy Storchakad3faf432015-01-18 11:28:37 +02005552 errmsg = "code point in surrogate code point range(0xd800, 0xe000)";
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005553 startinpos = ((const char *)q) - starts;
5554 endinpos = startinpos + 4;
5555 }
5556 else if (ch <= maxch) {
Victor Stinnere64322e2012-10-30 23:12:47 +01005557 if (q == e || consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005558 break;
Victor Stinnere64322e2012-10-30 23:12:47 +01005559 /* remaining bytes at the end? (size should be divisible by 4) */
Benjamin Peterson29060642009-01-31 22:14:21 +00005560 errmsg = "truncated data";
Victor Stinnere64322e2012-10-30 23:12:47 +01005561 startinpos = ((const char *)q) - starts;
5562 endinpos = ((const char *)e) - starts;
Benjamin Peterson29060642009-01-31 22:14:21 +00005563 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005564 else {
5565 if (ch < 0x110000) {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005566 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinnere64322e2012-10-30 23:12:47 +01005567 goto onError;
5568 q += 4;
5569 continue;
5570 }
Serhiy Storchakad3faf432015-01-18 11:28:37 +02005571 errmsg = "code point not in range(0x110000)";
Victor Stinnere64322e2012-10-30 23:12:47 +01005572 startinpos = ((const char *)q) - starts;
5573 endinpos = startinpos + 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00005574 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005575
5576 /* The remaining input chars are ignored if the callback
5577 chooses to skip the input */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005578 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00005579 errors, &errorHandler,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005580 encoding, errmsg,
Benjamin Peterson29060642009-01-31 22:14:21 +00005581 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005582 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005583 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005584 }
5585
Walter Dörwald41980ca2007-08-16 21:55:45 +00005586 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005587 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005588
Walter Dörwald41980ca2007-08-16 21:55:45 +00005589 Py_XDECREF(errorHandler);
5590 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005591 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005592
Benjamin Peterson29060642009-01-31 22:14:21 +00005593 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005594 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005595 Py_XDECREF(errorHandler);
5596 Py_XDECREF(exc);
5597 return NULL;
5598}
5599
5600PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005601_PyUnicode_EncodeUTF32(PyObject *str,
5602 const char *errors,
5603 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005604{
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005605 enum PyUnicode_Kind kind;
5606 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005607 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005608 PyObject *v;
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005609 uint32_t *out;
Christian Heimes743e0cd2012-10-17 23:52:17 +02005610#if PY_LITTLE_ENDIAN
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005611 int native_ordering = byteorder <= 0;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005612#else
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005613 int native_ordering = byteorder >= 0;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005614#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005615 const char *encoding;
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005616 Py_ssize_t nsize, pos;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005617 PyObject *errorHandler = NULL;
5618 PyObject *exc = NULL;
5619 PyObject *rep = NULL;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005620
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005621 if (!PyUnicode_Check(str)) {
5622 PyErr_BadArgument();
5623 return NULL;
5624 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005625 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005626 return NULL;
5627 kind = PyUnicode_KIND(str);
5628 data = PyUnicode_DATA(str);
5629 len = PyUnicode_GET_LENGTH(str);
5630
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005631 if (len > PY_SSIZE_T_MAX / 4 - (byteorder == 0))
Serhiy Storchaka30793282014-01-04 22:44:01 +02005632 return PyErr_NoMemory();
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005633 nsize = len + (byteorder == 0);
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01005634 v = PyBytes_FromStringAndSize(NULL, nsize * 4);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005635 if (v == NULL)
5636 return NULL;
5637
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005638 /* output buffer is 4-bytes aligned */
5639 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 4));
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005640 out = (uint32_t *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005641 if (byteorder == 0)
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005642 *out++ = 0xFEFF;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005643 if (len == 0)
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005644 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005645
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005646 if (byteorder == -1)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005647 encoding = "utf-32-le";
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005648 else if (byteorder == 1)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005649 encoding = "utf-32-be";
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005650 else
5651 encoding = "utf-32";
5652
5653 if (kind == PyUnicode_1BYTE_KIND) {
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005654 ucs1lib_utf32_encode((const Py_UCS1 *)data, len, &out, native_ordering);
5655 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005656 }
5657
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005658 pos = 0;
5659 while (pos < len) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005660 Py_ssize_t repsize, moreunits;
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005661
5662 if (kind == PyUnicode_2BYTE_KIND) {
5663 pos += ucs2lib_utf32_encode((const Py_UCS2 *)data + pos, len - pos,
5664 &out, native_ordering);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005665 }
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005666 else {
5667 assert(kind == PyUnicode_4BYTE_KIND);
5668 pos += ucs4lib_utf32_encode((const Py_UCS4 *)data + pos, len - pos,
5669 &out, native_ordering);
5670 }
5671 if (pos == len)
5672 break;
Guido van Rossum98297ee2007-11-06 21:34:58 +00005673
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005674 rep = unicode_encode_call_errorhandler(
5675 errors, &errorHandler,
5676 encoding, "surrogates not allowed",
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005677 str, &exc, pos, pos + 1, &pos);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005678 if (!rep)
5679 goto error;
5680
5681 if (PyBytes_Check(rep)) {
5682 repsize = PyBytes_GET_SIZE(rep);
5683 if (repsize & 3) {
5684 raise_encode_exception(&exc, encoding,
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005685 str, pos - 1, pos,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005686 "surrogates not allowed");
5687 goto error;
5688 }
5689 moreunits = repsize / 4;
5690 }
5691 else {
5692 assert(PyUnicode_Check(rep));
5693 if (PyUnicode_READY(rep) < 0)
5694 goto error;
5695 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5696 if (!PyUnicode_IS_ASCII(rep)) {
5697 raise_encode_exception(&exc, encoding,
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005698 str, pos - 1, pos,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005699 "surrogates not allowed");
5700 goto error;
5701 }
5702 }
5703
5704 /* four bytes are reserved for each surrogate */
5705 if (moreunits > 1) {
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005706 Py_ssize_t outpos = out - (uint32_t*) PyBytes_AS_STRING(v);
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03005707 if (moreunits >= (PY_SSIZE_T_MAX - PyBytes_GET_SIZE(v)) / 4) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005708 /* integer overflow */
5709 PyErr_NoMemory();
5710 goto error;
5711 }
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03005712 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + 4 * (moreunits - 1)) < 0)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005713 goto error;
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005714 out = (uint32_t*) PyBytes_AS_STRING(v) + outpos;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005715 }
5716
5717 if (PyBytes_Check(rep)) {
Christian Heimesf051e432016-09-13 20:22:02 +02005718 memcpy(out, PyBytes_AS_STRING(rep), repsize);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005719 out += moreunits;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005720 } else /* rep is unicode */ {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005721 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005722 ucs1lib_utf32_encode(PyUnicode_1BYTE_DATA(rep), repsize,
5723 &out, native_ordering);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005724 }
5725
5726 Py_CLEAR(rep);
5727 }
5728
5729 /* Cut back to size actually needed. This is necessary for, for example,
5730 encoding of a string containing isolated surrogates and the 'ignore'
5731 handler is used. */
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005732 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005733 if (nsize != PyBytes_GET_SIZE(v))
5734 _PyBytes_Resize(&v, nsize);
5735 Py_XDECREF(errorHandler);
5736 Py_XDECREF(exc);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005737 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005738 return v;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005739 error:
5740 Py_XDECREF(rep);
5741 Py_XDECREF(errorHandler);
5742 Py_XDECREF(exc);
5743 Py_XDECREF(v);
5744 return NULL;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005745}
5746
Alexander Belopolsky40018472011-02-26 01:02:56 +00005747PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005748PyUnicode_EncodeUTF32(const Py_UNICODE *s,
5749 Py_ssize_t size,
5750 const char *errors,
5751 int byteorder)
5752{
5753 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02005754 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005755 if (tmp == NULL)
5756 return NULL;
5757 result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder);
5758 Py_DECREF(tmp);
5759 return result;
5760}
5761
5762PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005763PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005764{
Victor Stinnerb960b342011-11-20 19:12:52 +01005765 return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005766}
5767
Guido van Rossumd57fd912000-03-10 22:53:23 +00005768/* --- UTF-16 Codec ------------------------------------------------------- */
5769
Tim Peters772747b2001-08-09 22:21:55 +00005770PyObject *
5771PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005772 Py_ssize_t size,
5773 const char *errors,
5774 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005775{
Walter Dörwald69652032004-09-07 20:24:22 +00005776 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5777}
5778
5779PyObject *
5780PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005781 Py_ssize_t size,
5782 const char *errors,
5783 int *byteorder,
5784 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00005785{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005786 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005787 Py_ssize_t startinpos;
5788 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005789 _PyUnicodeWriter writer;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005790 const unsigned char *q, *e;
Tim Peters772747b2001-08-09 22:21:55 +00005791 int bo = 0; /* assume native ordering by default */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005792 int native_ordering;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00005793 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005794 PyObject *errorHandler = NULL;
5795 PyObject *exc = NULL;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005796 const char *encoding;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005797
Tim Peters772747b2001-08-09 22:21:55 +00005798 q = (unsigned char *)s;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005799 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005800
5801 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00005802 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005803
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005804 /* Check for BOM marks (U+FEFF) in the input and adjust current
5805 byte order setting accordingly. In native mode, the leading BOM
5806 mark is skipped, in all other modes, it is copied to the output
5807 stream as-is (giving a ZWNBSP character). */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005808 if (bo == 0 && size >= 2) {
5809 const Py_UCS4 bom = (q[1] << 8) | q[0];
5810 if (bom == 0xFEFF) {
5811 q += 2;
5812 bo = -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00005813 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005814 else if (bom == 0xFFFE) {
5815 q += 2;
5816 bo = 1;
5817 }
5818 if (byteorder)
5819 *byteorder = bo;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005820 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005821
Antoine Pitrou63065d72012-05-15 23:48:04 +02005822 if (q == e) {
5823 if (consumed)
5824 *consumed = size;
Serhiy Storchaka678db842013-01-26 12:16:36 +02005825 _Py_RETURN_UNICODE_EMPTY();
Tim Peters772747b2001-08-09 22:21:55 +00005826 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005827
Christian Heimes743e0cd2012-10-17 23:52:17 +02005828#if PY_LITTLE_ENDIAN
Antoine Pitrou63065d72012-05-15 23:48:04 +02005829 native_ordering = bo <= 0;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005830 encoding = bo <= 0 ? "utf-16-le" : "utf-16-be";
Antoine Pitrouab868312009-01-10 15:40:25 +00005831#else
Antoine Pitrou63065d72012-05-15 23:48:04 +02005832 native_ordering = bo >= 0;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005833 encoding = bo >= 0 ? "utf-16-be" : "utf-16-le";
Antoine Pitrouab868312009-01-10 15:40:25 +00005834#endif
Tim Peters772747b2001-08-09 22:21:55 +00005835
Antoine Pitrou63065d72012-05-15 23:48:04 +02005836 /* Note: size will always be longer than the resulting Unicode
Xiang Zhang2c7fd462018-01-31 20:48:05 +08005837 character count normally. Error handler will take care of
5838 resizing when needed. */
Victor Stinner8f674cc2013-04-17 23:02:17 +02005839 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02005840 writer.min_length = (e - q + 1) / 2;
5841 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005842 goto onError;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005843
Antoine Pitrou63065d72012-05-15 23:48:04 +02005844 while (1) {
5845 Py_UCS4 ch = 0;
5846 if (e - q >= 2) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005847 int kind = writer.kind;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005848 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005849 if (PyUnicode_IS_ASCII(writer.buffer))
Antoine Pitrou63065d72012-05-15 23:48:04 +02005850 ch = asciilib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005851 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005852 native_ordering);
5853 else
5854 ch = ucs1lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005855 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005856 native_ordering);
5857 } else if (kind == PyUnicode_2BYTE_KIND) {
5858 ch = ucs2lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005859 (Py_UCS2*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005860 native_ordering);
5861 } else {
5862 assert(kind == PyUnicode_4BYTE_KIND);
5863 ch = ucs4lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005864 (Py_UCS4*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005865 native_ordering);
Antoine Pitrouab868312009-01-10 15:40:25 +00005866 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005867 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005868
Antoine Pitrou63065d72012-05-15 23:48:04 +02005869 switch (ch)
5870 {
5871 case 0:
5872 /* remaining byte at the end? (size should be even) */
5873 if (q == e || consumed)
5874 goto End;
5875 errmsg = "truncated data";
5876 startinpos = ((const char *)q) - starts;
5877 endinpos = ((const char *)e) - starts;
5878 break;
5879 /* The remaining input chars are ignored if the callback
5880 chooses to skip the input */
5881 case 1:
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02005882 q -= 2;
5883 if (consumed)
Serhiy Storchakaae3b32a2013-01-08 23:40:52 +02005884 goto End;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005885 errmsg = "unexpected end of data";
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02005886 startinpos = ((const char *)q) - starts;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005887 endinpos = ((const char *)e) - starts;
5888 break;
5889 case 2:
5890 errmsg = "illegal encoding";
5891 startinpos = ((const char *)q) - 2 - starts;
5892 endinpos = startinpos + 2;
5893 break;
5894 case 3:
5895 errmsg = "illegal UTF-16 surrogate";
5896 startinpos = ((const char *)q) - 4 - starts;
5897 endinpos = startinpos + 2;
5898 break;
5899 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005900 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005901 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005902 continue;
5903 }
5904
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005905 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrouab868312009-01-10 15:40:25 +00005906 errors,
5907 &errorHandler,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005908 encoding, errmsg,
Antoine Pitrouab868312009-01-10 15:40:25 +00005909 &starts,
5910 (const char **)&e,
5911 &startinpos,
5912 &endinpos,
5913 &exc,
5914 (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005915 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005916 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005917 }
5918
Antoine Pitrou63065d72012-05-15 23:48:04 +02005919End:
Walter Dörwald69652032004-09-07 20:24:22 +00005920 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005921 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00005922
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005923 Py_XDECREF(errorHandler);
5924 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005925 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005926
Benjamin Peterson29060642009-01-31 22:14:21 +00005927 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005928 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005929 Py_XDECREF(errorHandler);
5930 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005931 return NULL;
5932}
5933
Tim Peters772747b2001-08-09 22:21:55 +00005934PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005935_PyUnicode_EncodeUTF16(PyObject *str,
5936 const char *errors,
5937 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005938{
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005939 enum PyUnicode_Kind kind;
5940 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005941 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005942 PyObject *v;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005943 unsigned short *out;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005944 Py_ssize_t pairs;
Christian Heimes743e0cd2012-10-17 23:52:17 +02005945#if PY_BIG_ENDIAN
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005946 int native_ordering = byteorder >= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005947#else
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005948 int native_ordering = byteorder <= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005949#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005950 const char *encoding;
5951 Py_ssize_t nsize, pos;
5952 PyObject *errorHandler = NULL;
5953 PyObject *exc = NULL;
5954 PyObject *rep = NULL;
Tim Peters772747b2001-08-09 22:21:55 +00005955
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005956 if (!PyUnicode_Check(str)) {
5957 PyErr_BadArgument();
5958 return NULL;
5959 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005960 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005961 return NULL;
5962 kind = PyUnicode_KIND(str);
5963 data = PyUnicode_DATA(str);
5964 len = PyUnicode_GET_LENGTH(str);
Victor Stinner0e368262011-11-10 20:12:49 +01005965
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005966 pairs = 0;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005967 if (kind == PyUnicode_4BYTE_KIND) {
5968 const Py_UCS4 *in = (const Py_UCS4 *)data;
5969 const Py_UCS4 *end = in + len;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005970 while (in < end) {
5971 if (*in++ >= 0x10000) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005972 pairs++;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005973 }
5974 }
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005975 }
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005976 if (len > PY_SSIZE_T_MAX / 2 - pairs - (byteorder == 0)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005977 return PyErr_NoMemory();
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005978 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005979 nsize = len + pairs + (byteorder == 0);
5980 v = PyBytes_FromStringAndSize(NULL, nsize * 2);
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005981 if (v == NULL) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005982 return NULL;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005983 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005984
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005985 /* output buffer is 2-bytes aligned */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02005986 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 2));
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005987 out = (unsigned short *)PyBytes_AS_STRING(v);
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005988 if (byteorder == 0) {
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005989 *out++ = 0xFEFF;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005990 }
5991 if (len == 0) {
Guido van Rossum98297ee2007-11-06 21:34:58 +00005992 goto done;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005993 }
Tim Peters772747b2001-08-09 22:21:55 +00005994
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005995 if (kind == PyUnicode_1BYTE_KIND) {
5996 ucs1lib_utf16_encode((const Py_UCS1 *)data, len, &out, native_ordering);
5997 goto done;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005998 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005999
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006000 if (byteorder < 0) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006001 encoding = "utf-16-le";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006002 }
6003 else if (byteorder > 0) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006004 encoding = "utf-16-be";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006005 }
6006 else {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006007 encoding = "utf-16";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006008 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006009
6010 pos = 0;
6011 while (pos < len) {
6012 Py_ssize_t repsize, moreunits;
6013
6014 if (kind == PyUnicode_2BYTE_KIND) {
6015 pos += ucs2lib_utf16_encode((const Py_UCS2 *)data + pos, len - pos,
6016 &out, native_ordering);
6017 }
6018 else {
6019 assert(kind == PyUnicode_4BYTE_KIND);
6020 pos += ucs4lib_utf16_encode((const Py_UCS4 *)data + pos, len - pos,
6021 &out, native_ordering);
6022 }
6023 if (pos == len)
6024 break;
6025
6026 rep = unicode_encode_call_errorhandler(
6027 errors, &errorHandler,
6028 encoding, "surrogates not allowed",
6029 str, &exc, pos, pos + 1, &pos);
6030 if (!rep)
6031 goto error;
6032
6033 if (PyBytes_Check(rep)) {
6034 repsize = PyBytes_GET_SIZE(rep);
6035 if (repsize & 1) {
6036 raise_encode_exception(&exc, encoding,
6037 str, pos - 1, pos,
6038 "surrogates not allowed");
6039 goto error;
6040 }
6041 moreunits = repsize / 2;
6042 }
6043 else {
6044 assert(PyUnicode_Check(rep));
6045 if (PyUnicode_READY(rep) < 0)
6046 goto error;
6047 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
6048 if (!PyUnicode_IS_ASCII(rep)) {
6049 raise_encode_exception(&exc, encoding,
6050 str, pos - 1, pos,
6051 "surrogates not allowed");
6052 goto error;
6053 }
6054 }
6055
6056 /* two bytes are reserved for each surrogate */
6057 if (moreunits > 1) {
6058 Py_ssize_t outpos = out - (unsigned short*) PyBytes_AS_STRING(v);
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03006059 if (moreunits >= (PY_SSIZE_T_MAX - PyBytes_GET_SIZE(v)) / 2) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006060 /* integer overflow */
6061 PyErr_NoMemory();
6062 goto error;
6063 }
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03006064 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + 2 * (moreunits - 1)) < 0)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006065 goto error;
6066 out = (unsigned short*) PyBytes_AS_STRING(v) + outpos;
6067 }
6068
6069 if (PyBytes_Check(rep)) {
Christian Heimesf051e432016-09-13 20:22:02 +02006070 memcpy(out, PyBytes_AS_STRING(rep), repsize);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006071 out += moreunits;
6072 } else /* rep is unicode */ {
6073 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
6074 ucs1lib_utf16_encode(PyUnicode_1BYTE_DATA(rep), repsize,
6075 &out, native_ordering);
6076 }
6077
6078 Py_CLEAR(rep);
6079 }
6080
6081 /* Cut back to size actually needed. This is necessary for, for example,
6082 encoding of a string containing isolated surrogates and the 'ignore' handler
6083 is used. */
6084 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
6085 if (nsize != PyBytes_GET_SIZE(v))
6086 _PyBytes_Resize(&v, nsize);
6087 Py_XDECREF(errorHandler);
6088 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00006089 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006090 return v;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006091 error:
6092 Py_XDECREF(rep);
6093 Py_XDECREF(errorHandler);
6094 Py_XDECREF(exc);
6095 Py_XDECREF(v);
6096 return NULL;
6097#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00006098}
6099
Alexander Belopolsky40018472011-02-26 01:02:56 +00006100PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006101PyUnicode_EncodeUTF16(const Py_UNICODE *s,
6102 Py_ssize_t size,
6103 const char *errors,
6104 int byteorder)
6105{
6106 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02006107 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006108 if (tmp == NULL)
6109 return NULL;
6110 result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder);
6111 Py_DECREF(tmp);
6112 return result;
6113}
6114
6115PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00006116PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006117{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006118 return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006119}
6120
6121/* --- Unicode Escape Codec ----------------------------------------------- */
6122
/* File-local pointer to a _PyUnicode_Name_CAPI structure; starts out NULL.
   NOTE(review): presumably filled in on demand by the \N{...} escape
   handling further down -- confirm against the code that assigns it. */
static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00006124
Alexander Belopolsky40018472011-02-26 01:02:56 +00006125PyObject *
Eric V. Smith42454af2016-10-31 09:22:08 -04006126_PyUnicode_DecodeUnicodeEscape(const char *s,
6127 Py_ssize_t size,
6128 const char *errors,
6129 const char **first_invalid_escape)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006130{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006131 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006132 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006133 const char *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006134 PyObject *errorHandler = NULL;
6135 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006136
Eric V. Smith42454af2016-10-31 09:22:08 -04006137 // so we can remember if we've seen an invalid escape char or not
6138 *first_invalid_escape = NULL;
6139
Victor Stinner62ec3312016-09-06 17:04:34 -07006140 if (size == 0) {
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02006141 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner62ec3312016-09-06 17:04:34 -07006142 }
6143 /* Escaped strings will always be longer than the resulting
6144 Unicode string, so we start with size here and then reduce the
6145 length after conversion to the true value.
6146 (but if the error callback returns a long replacement string
6147 we'll have to allocate more space) */
Victor Stinner8f674cc2013-04-17 23:02:17 +02006148 _PyUnicodeWriter_Init(&writer);
Victor Stinner62ec3312016-09-06 17:04:34 -07006149 writer.min_length = size;
6150 if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
6151 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006152 }
6153
Guido van Rossumd57fd912000-03-10 22:53:23 +00006154 end = s + size;
6155 while (s < end) {
Victor Stinner62ec3312016-09-06 17:04:34 -07006156 unsigned char c = (unsigned char) *s++;
6157 Py_UCS4 ch;
6158 int count;
6159 Py_ssize_t startinpos;
6160 Py_ssize_t endinpos;
6161 const char *message;
6162
6163#define WRITE_ASCII_CHAR(ch) \
6164 do { \
6165 assert(ch <= 127); \
6166 assert(writer.pos < writer.size); \
6167 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6168 } while(0)
6169
6170#define WRITE_CHAR(ch) \
6171 do { \
6172 if (ch <= writer.maxchar) { \
6173 assert(writer.pos < writer.size); \
6174 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6175 } \
6176 else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
6177 goto onError; \
6178 } \
6179 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006180
6181 /* Non-escape characters are interpreted as Unicode ordinals */
Victor Stinner62ec3312016-09-06 17:04:34 -07006182 if (c != '\\') {
6183 WRITE_CHAR(c);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006184 continue;
6185 }
6186
Victor Stinner62ec3312016-09-06 17:04:34 -07006187 startinpos = s - starts - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006188 /* \ - Escapes */
Victor Stinner62ec3312016-09-06 17:04:34 -07006189 if (s >= end) {
6190 message = "\\ at end of string";
6191 goto error;
6192 }
6193 c = (unsigned char) *s++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006194
Victor Stinner62ec3312016-09-06 17:04:34 -07006195 assert(writer.pos < writer.size);
Guido van Rossum8ce8a782007-11-01 19:42:39 +00006196 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006197
Benjamin Peterson29060642009-01-31 22:14:21 +00006198 /* \x escapes */
Victor Stinner62ec3312016-09-06 17:04:34 -07006199 case '\n': continue;
6200 case '\\': WRITE_ASCII_CHAR('\\'); continue;
6201 case '\'': WRITE_ASCII_CHAR('\''); continue;
6202 case '\"': WRITE_ASCII_CHAR('\"'); continue;
6203 case 'b': WRITE_ASCII_CHAR('\b'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006204 /* FF */
Victor Stinner62ec3312016-09-06 17:04:34 -07006205 case 'f': WRITE_ASCII_CHAR('\014'); continue;
6206 case 't': WRITE_ASCII_CHAR('\t'); continue;
6207 case 'n': WRITE_ASCII_CHAR('\n'); continue;
6208 case 'r': WRITE_ASCII_CHAR('\r'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006209 /* VT */
Victor Stinner62ec3312016-09-06 17:04:34 -07006210 case 'v': WRITE_ASCII_CHAR('\013'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006211 /* BEL, not classic C */
Victor Stinner62ec3312016-09-06 17:04:34 -07006212 case 'a': WRITE_ASCII_CHAR('\007'); continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006213
Benjamin Peterson29060642009-01-31 22:14:21 +00006214 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006215 case '0': case '1': case '2': case '3':
6216 case '4': case '5': case '6': case '7':
Victor Stinner62ec3312016-09-06 17:04:34 -07006217 ch = c - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00006218 if (s < end && '0' <= *s && *s <= '7') {
Victor Stinner62ec3312016-09-06 17:04:34 -07006219 ch = (ch<<3) + *s++ - '0';
6220 if (s < end && '0' <= *s && *s <= '7') {
6221 ch = (ch<<3) + *s++ - '0';
6222 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006223 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006224 WRITE_CHAR(ch);
6225 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006226
Benjamin Peterson29060642009-01-31 22:14:21 +00006227 /* hex escapes */
6228 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006229 case 'x':
Victor Stinner62ec3312016-09-06 17:04:34 -07006230 count = 2;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006231 message = "truncated \\xXX escape";
6232 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006233
Benjamin Peterson29060642009-01-31 22:14:21 +00006234 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006235 case 'u':
Victor Stinner62ec3312016-09-06 17:04:34 -07006236 count = 4;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006237 message = "truncated \\uXXXX escape";
6238 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006239
Benjamin Peterson29060642009-01-31 22:14:21 +00006240 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00006241 case 'U':
Victor Stinner62ec3312016-09-06 17:04:34 -07006242 count = 8;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006243 message = "truncated \\UXXXXXXXX escape";
6244 hexescape:
Victor Stinner62ec3312016-09-06 17:04:34 -07006245 for (ch = 0; count && s < end; ++s, --count) {
Serhiy Storchakad6793772013-01-29 10:20:44 +02006246 c = (unsigned char)*s;
Victor Stinner62ec3312016-09-06 17:04:34 -07006247 ch <<= 4;
6248 if (c >= '0' && c <= '9') {
6249 ch += c - '0';
6250 }
6251 else if (c >= 'a' && c <= 'f') {
6252 ch += c - ('a' - 10);
6253 }
6254 else if (c >= 'A' && c <= 'F') {
6255 ch += c - ('A' - 10);
6256 }
6257 else {
6258 break;
6259 }
Fredrik Lundhdf846752000-09-03 11:29:49 +00006260 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006261 if (count) {
Serhiy Storchakad6793772013-01-29 10:20:44 +02006262 goto error;
Victor Stinner62ec3312016-09-06 17:04:34 -07006263 }
6264
6265 /* when we get here, ch is a 32-bit unicode character */
6266 if (ch > MAX_UNICODE) {
6267 message = "illegal Unicode character";
6268 goto error;
6269 }
6270
6271 WRITE_CHAR(ch);
6272 continue;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006273
Benjamin Peterson29060642009-01-31 22:14:21 +00006274 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00006275 case 'N':
Fredrik Lundhccc74732001-02-18 22:13:49 +00006276 if (ucnhash_CAPI == NULL) {
6277 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006278 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
6279 PyUnicodeData_CAPSULE_NAME, 1);
Victor Stinner62ec3312016-09-06 17:04:34 -07006280 if (ucnhash_CAPI == NULL) {
6281 PyErr_SetString(
6282 PyExc_UnicodeError,
6283 "\\N escapes not supported (can't load unicodedata module)"
6284 );
6285 goto onError;
6286 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00006287 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006288
6289 message = "malformed \\N character escape";
Gregory P. Smith746b2d32018-11-13 13:16:54 -08006290 if (s < end && *s == '{') {
Victor Stinner62ec3312016-09-06 17:04:34 -07006291 const char *start = ++s;
6292 size_t namelen;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006293 /* look for the closing brace */
Victor Stinner62ec3312016-09-06 17:04:34 -07006294 while (s < end && *s != '}')
Fredrik Lundhccc74732001-02-18 22:13:49 +00006295 s++;
Victor Stinner62ec3312016-09-06 17:04:34 -07006296 namelen = s - start;
6297 if (namelen && s < end) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00006298 /* found a name. look it up in the unicode database */
Fredrik Lundhccc74732001-02-18 22:13:49 +00006299 s++;
Victor Stinner62ec3312016-09-06 17:04:34 -07006300 ch = 0xffffffff; /* in case 'getcode' messes up */
6301 if (namelen <= INT_MAX &&
6302 ucnhash_CAPI->getcode(NULL, start, (int)namelen,
6303 &ch, 0)) {
6304 assert(ch <= MAX_UNICODE);
6305 WRITE_CHAR(ch);
6306 continue;
6307 }
6308 message = "unknown Unicode character name";
Fredrik Lundhccc74732001-02-18 22:13:49 +00006309 }
6310 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02006311 goto error;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006312
6313 default:
Eric V. Smith42454af2016-10-31 09:22:08 -04006314 if (*first_invalid_escape == NULL) {
6315 *first_invalid_escape = s-1; /* Back up one char, since we've
6316 already incremented s. */
6317 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006318 WRITE_ASCII_CHAR('\\');
6319 WRITE_CHAR(c);
6320 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006321 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02006322
6323 error:
6324 endinpos = s-starts;
Victor Stinner62ec3312016-09-06 17:04:34 -07006325 writer.min_length = end - s + writer.pos;
Serhiy Storchaka8fe5a9f2013-01-29 10:37:39 +02006326 if (unicode_decode_call_errorhandler_writer(
Serhiy Storchakad6793772013-01-29 10:20:44 +02006327 errors, &errorHandler,
6328 "unicodeescape", message,
6329 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinner62ec3312016-09-06 17:04:34 -07006330 &writer)) {
Serhiy Storchakad6793772013-01-29 10:20:44 +02006331 goto onError;
Victor Stinner62ec3312016-09-06 17:04:34 -07006332 }
Serhiy Storchakab7e2d672018-02-13 08:27:33 +02006333 assert(end - s <= writer.size - writer.pos);
Victor Stinner62ec3312016-09-06 17:04:34 -07006334
6335#undef WRITE_ASCII_CHAR
6336#undef WRITE_CHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00006337 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006338
Walter Dörwaldd4ade082003-08-15 15:00:26 +00006339 Py_XDECREF(errorHandler);
6340 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006341 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald8c077222002-03-25 11:16:18 +00006342
Benjamin Peterson29060642009-01-31 22:14:21 +00006343 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006344 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006345 Py_XDECREF(errorHandler);
6346 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006347 return NULL;
6348}
6349
Eric V. Smith42454af2016-10-31 09:22:08 -04006350PyObject *
6351PyUnicode_DecodeUnicodeEscape(const char *s,
6352 Py_ssize_t size,
6353 const char *errors)
6354{
6355 const char *first_invalid_escape;
6356 PyObject *result = _PyUnicode_DecodeUnicodeEscape(s, size, errors,
6357 &first_invalid_escape);
6358 if (result == NULL)
6359 return NULL;
6360 if (first_invalid_escape != NULL) {
6361 if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
6362 "invalid escape sequence '\\%c'",
Serhiy Storchaka56cb4652017-10-20 17:08:15 +03006363 (unsigned char)*first_invalid_escape) < 0) {
Eric V. Smith42454af2016-10-31 09:22:08 -04006364 Py_DECREF(result);
6365 return NULL;
6366 }
6367 }
6368 return result;
6369}
6370
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006371/* Return a Unicode-Escape string version of the Unicode object. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006372
Alexander Belopolsky40018472011-02-26 01:02:56 +00006373PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006374PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006375{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006376 Py_ssize_t i, len;
Victor Stinner62ec3312016-09-06 17:04:34 -07006377 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006378 char *p;
Victor Stinner62ec3312016-09-06 17:04:34 -07006379 enum PyUnicode_Kind kind;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006380 void *data;
Victor Stinner62ec3312016-09-06 17:04:34 -07006381 Py_ssize_t expandsize;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006382
Ezio Melottie7f90372012-10-05 03:33:31 +03006383 /* Initial allocation is based on the longest-possible character
Thomas Wouters89f507f2006-12-13 04:49:30 +00006384 escape.
6385
Ezio Melottie7f90372012-10-05 03:33:31 +03006386 For UCS1 strings it's '\xxx', 4 bytes per source character.
6387 For UCS2 strings it's '\uxxxx', 6 bytes per source character.
6388 For UCS4 strings it's '\U00xxxxxx', 10 bytes per source character.
Thomas Wouters89f507f2006-12-13 04:49:30 +00006389 */
6390
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006391 if (!PyUnicode_Check(unicode)) {
6392 PyErr_BadArgument();
6393 return NULL;
6394 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006395 if (PyUnicode_READY(unicode) == -1) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006396 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006397 }
Victor Stinner358af132015-10-12 22:36:57 +02006398
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006399 len = PyUnicode_GET_LENGTH(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006400 if (len == 0) {
6401 return PyBytes_FromStringAndSize(NULL, 0);
6402 }
6403
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006404 kind = PyUnicode_KIND(unicode);
6405 data = PyUnicode_DATA(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006406 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6407 bytes, and 1 byte characters 4. */
6408 expandsize = kind * 2 + 2;
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006409 if (len > PY_SSIZE_T_MAX / expandsize) {
Victor Stinner62ec3312016-09-06 17:04:34 -07006410 return PyErr_NoMemory();
6411 }
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006412 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
Victor Stinner62ec3312016-09-06 17:04:34 -07006413 if (repr == NULL) {
6414 return NULL;
6415 }
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006416
Victor Stinner62ec3312016-09-06 17:04:34 -07006417 p = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006418 for (i = 0; i < len; i++) {
Victor Stinner3326cb62011-11-10 20:15:25 +01006419 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006420
Victor Stinner62ec3312016-09-06 17:04:34 -07006421 /* U+0000-U+00ff range */
6422 if (ch < 0x100) {
6423 if (ch >= ' ' && ch < 127) {
6424 if (ch != '\\') {
6425 /* Copy printable US ASCII as-is */
6426 *p++ = (char) ch;
6427 }
6428 /* Escape backslashes */
6429 else {
6430 *p++ = '\\';
6431 *p++ = '\\';
6432 }
6433 }
Victor Stinner358af132015-10-12 22:36:57 +02006434
Victor Stinner62ec3312016-09-06 17:04:34 -07006435 /* Map special whitespace to '\t', \n', '\r' */
6436 else if (ch == '\t') {
6437 *p++ = '\\';
6438 *p++ = 't';
6439 }
6440 else if (ch == '\n') {
6441 *p++ = '\\';
6442 *p++ = 'n';
6443 }
6444 else if (ch == '\r') {
6445 *p++ = '\\';
6446 *p++ = 'r';
6447 }
6448
6449 /* Map non-printable US ASCII and 8-bit characters to '\xHH' */
6450 else {
6451 *p++ = '\\';
6452 *p++ = 'x';
6453 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6454 *p++ = Py_hexdigits[ch & 0x000F];
6455 }
Tim Petersced69f82003-09-16 20:30:58 +00006456 }
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006457 /* U+0100-U+ffff range: Map 16-bit characters to '\uHHHH' */
Victor Stinner62ec3312016-09-06 17:04:34 -07006458 else if (ch < 0x10000) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006459 *p++ = '\\';
6460 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006461 *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
6462 *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
6463 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6464 *p++ = Py_hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006465 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006466 /* U+010000-U+10ffff range: Map 21-bit characters to '\U00HHHHHH' */
6467 else {
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006468
Victor Stinner62ec3312016-09-06 17:04:34 -07006469 /* Make sure that the first two digits are zero */
6470 assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006471 *p++ = '\\';
Victor Stinner62ec3312016-09-06 17:04:34 -07006472 *p++ = 'U';
6473 *p++ = '0';
6474 *p++ = '0';
6475 *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
6476 *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
6477 *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
6478 *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
6479 *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
6480 *p++ = Py_hexdigits[ch & 0x0000000F];
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006481 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006482 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006483
Victor Stinner62ec3312016-09-06 17:04:34 -07006484 assert(p - PyBytes_AS_STRING(repr) > 0);
6485 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) {
6486 return NULL;
6487 }
6488 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006489}
6490
Alexander Belopolsky40018472011-02-26 01:02:56 +00006491PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006492PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
6493 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006494{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006495 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02006496 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Victor Stinner62ec3312016-09-06 17:04:34 -07006497 if (tmp == NULL) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006498 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006499 }
6500
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006501 result = PyUnicode_AsUnicodeEscapeString(tmp);
6502 Py_DECREF(tmp);
6503 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006504}
6505
6506/* --- Raw Unicode Escape Codec ------------------------------------------- */
6507
Alexander Belopolsky40018472011-02-26 01:02:56 +00006508PyObject *
6509PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006510 Py_ssize_t size,
6511 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006512{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006513 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006514 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006515 const char *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006516 PyObject *errorHandler = NULL;
6517 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006518
Victor Stinner62ec3312016-09-06 17:04:34 -07006519 if (size == 0) {
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02006520 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner62ec3312016-09-06 17:04:34 -07006521 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006522
Guido van Rossumd57fd912000-03-10 22:53:23 +00006523 /* Escaped strings will always be longer than the resulting
6524 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006525 length after conversion to the true value. (But decoding error
6526 handler might have to resize the string) */
Victor Stinner8f674cc2013-04-17 23:02:17 +02006527 _PyUnicodeWriter_Init(&writer);
Inada Naoki770847a2019-06-24 12:30:24 +09006528 writer.min_length = size;
Victor Stinner62ec3312016-09-06 17:04:34 -07006529 if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
6530 goto onError;
6531 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006532
Guido van Rossumd57fd912000-03-10 22:53:23 +00006533 end = s + size;
6534 while (s < end) {
Victor Stinner62ec3312016-09-06 17:04:34 -07006535 unsigned char c = (unsigned char) *s++;
6536 Py_UCS4 ch;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006537 int count;
Victor Stinner62ec3312016-09-06 17:04:34 -07006538 Py_ssize_t startinpos;
6539 Py_ssize_t endinpos;
6540 const char *message;
6541
6542#define WRITE_CHAR(ch) \
6543 do { \
6544 if (ch <= writer.maxchar) { \
6545 assert(writer.pos < writer.size); \
6546 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6547 } \
6548 else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
6549 goto onError; \
6550 } \
6551 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006552
Benjamin Peterson29060642009-01-31 22:14:21 +00006553 /* Non-escape characters are interpreted as Unicode ordinals */
Victor Stinner62ec3312016-09-06 17:04:34 -07006554 if (c != '\\' || s >= end) {
6555 WRITE_CHAR(c);
Benjamin Peterson29060642009-01-31 22:14:21 +00006556 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006557 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006558
Victor Stinner62ec3312016-09-06 17:04:34 -07006559 c = (unsigned char) *s++;
6560 if (c == 'u') {
6561 count = 4;
6562 message = "truncated \\uXXXX escape";
Benjamin Peterson29060642009-01-31 22:14:21 +00006563 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006564 else if (c == 'U') {
6565 count = 8;
6566 message = "truncated \\UXXXXXXXX escape";
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006567 }
6568 else {
Victor Stinner62ec3312016-09-06 17:04:34 -07006569 assert(writer.pos < writer.size);
6570 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, '\\');
6571 WRITE_CHAR(c);
6572 continue;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006573 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006574 startinpos = s - starts - 2;
6575
6576 /* \uHHHH with 4 hex digits, \U00HHHHHH with 8 */
6577 for (ch = 0; count && s < end; ++s, --count) {
6578 c = (unsigned char)*s;
6579 ch <<= 4;
6580 if (c >= '0' && c <= '9') {
6581 ch += c - '0';
6582 }
6583 else if (c >= 'a' && c <= 'f') {
6584 ch += c - ('a' - 10);
6585 }
6586 else if (c >= 'A' && c <= 'F') {
6587 ch += c - ('A' - 10);
6588 }
6589 else {
6590 break;
6591 }
6592 }
6593 if (!count) {
6594 if (ch <= MAX_UNICODE) {
6595 WRITE_CHAR(ch);
6596 continue;
6597 }
6598 message = "\\Uxxxxxxxx out of range";
6599 }
6600
6601 endinpos = s-starts;
6602 writer.min_length = end - s + writer.pos;
6603 if (unicode_decode_call_errorhandler_writer(
6604 errors, &errorHandler,
6605 "rawunicodeescape", message,
6606 &starts, &end, &startinpos, &endinpos, &exc, &s,
6607 &writer)) {
6608 goto onError;
6609 }
Serhiy Storchakab7e2d672018-02-13 08:27:33 +02006610 assert(end - s <= writer.size - writer.pos);
Victor Stinner62ec3312016-09-06 17:04:34 -07006611
6612#undef WRITE_CHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00006613 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006614 Py_XDECREF(errorHandler);
6615 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006616 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00006617
Benjamin Peterson29060642009-01-31 22:14:21 +00006618 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006619 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006620 Py_XDECREF(errorHandler);
6621 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006622 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006623
Guido van Rossumd57fd912000-03-10 22:53:23 +00006624}
6625
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006626
Alexander Belopolsky40018472011-02-26 01:02:56 +00006627PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006628PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006629{
Victor Stinner62ec3312016-09-06 17:04:34 -07006630 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006631 char *p;
Victor Stinner62ec3312016-09-06 17:04:34 -07006632 Py_ssize_t expandsize, pos;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006633 int kind;
6634 void *data;
6635 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006636
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006637 if (!PyUnicode_Check(unicode)) {
6638 PyErr_BadArgument();
6639 return NULL;
6640 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006641 if (PyUnicode_READY(unicode) == -1) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006642 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006643 }
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006644 kind = PyUnicode_KIND(unicode);
6645 data = PyUnicode_DATA(unicode);
6646 len = PyUnicode_GET_LENGTH(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006647 if (kind == PyUnicode_1BYTE_KIND) {
6648 return PyBytes_FromStringAndSize(data, len);
6649 }
Victor Stinner0e368262011-11-10 20:12:49 +01006650
Victor Stinner62ec3312016-09-06 17:04:34 -07006651 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6652 bytes, and 1 byte characters 4. */
6653 expandsize = kind * 2 + 2;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006654
Victor Stinner62ec3312016-09-06 17:04:34 -07006655 if (len > PY_SSIZE_T_MAX / expandsize) {
6656 return PyErr_NoMemory();
6657 }
6658 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
6659 if (repr == NULL) {
6660 return NULL;
6661 }
6662 if (len == 0) {
6663 return repr;
6664 }
6665
6666 p = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006667 for (pos = 0; pos < len; pos++) {
6668 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Victor Stinner358af132015-10-12 22:36:57 +02006669
Victor Stinner62ec3312016-09-06 17:04:34 -07006670 /* U+0000-U+00ff range: Copy 8-bit characters as-is */
6671 if (ch < 0x100) {
6672 *p++ = (char) ch;
Tim Petersced69f82003-09-16 20:30:58 +00006673 }
Xiang Zhang2b77a922018-02-13 18:33:32 +08006674 /* U+0100-U+ffff range: Map 16-bit characters to '\uHHHH' */
Victor Stinner62ec3312016-09-06 17:04:34 -07006675 else if (ch < 0x10000) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006676 *p++ = '\\';
6677 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006678 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6679 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6680 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6681 *p++ = Py_hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006682 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006683 /* U+010000-U+10ffff range: Map 32-bit characters to '\U00HHHHHH' */
6684 else {
6685 assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
6686 *p++ = '\\';
6687 *p++ = 'U';
6688 *p++ = '0';
6689 *p++ = '0';
6690 *p++ = Py_hexdigits[(ch >> 20) & 0xf];
6691 *p++ = Py_hexdigits[(ch >> 16) & 0xf];
6692 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6693 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6694 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6695 *p++ = Py_hexdigits[ch & 15];
6696 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006697 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006698
Victor Stinner62ec3312016-09-06 17:04:34 -07006699 assert(p > PyBytes_AS_STRING(repr));
6700 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) {
6701 return NULL;
6702 }
6703 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006704}
6705
Alexander Belopolsky40018472011-02-26 01:02:56 +00006706PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006707PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
6708 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006709{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006710 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02006711 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006712 if (tmp == NULL)
Walter Dörwald711005d2007-05-12 12:03:26 +00006713 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006714 result = PyUnicode_AsRawUnicodeEscapeString(tmp);
6715 Py_DECREF(tmp);
6716 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006717}
6718
6719/* --- Latin-1 Codec ------------------------------------------------------ */
6720
Alexander Belopolsky40018472011-02-26 01:02:56 +00006721PyObject *
6722PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006723 Py_ssize_t size,
6724 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006725{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006726 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Victor Stinnere57b1c02011-09-28 22:20:48 +02006727 return _PyUnicode_FromUCS1((unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006728}
6729
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006730/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006731static void
6732make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006733 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006734 PyObject *unicode,
6735 Py_ssize_t startpos, Py_ssize_t endpos,
6736 const char *reason)
6737{
6738 if (*exceptionObject == NULL) {
6739 *exceptionObject = PyObject_CallFunction(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006740 PyExc_UnicodeEncodeError, "sOnns",
Martin v. Löwis9e816682011-11-02 12:45:42 +01006741 encoding, unicode, startpos, endpos, reason);
6742 }
6743 else {
6744 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6745 goto onError;
6746 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6747 goto onError;
6748 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6749 goto onError;
6750 return;
6751 onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02006752 Py_CLEAR(*exceptionObject);
Martin v. Löwis9e816682011-11-02 12:45:42 +01006753 }
6754}
6755
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006756/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006757static void
6758raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006759 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006760 PyObject *unicode,
6761 Py_ssize_t startpos, Py_ssize_t endpos,
6762 const char *reason)
6763{
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006764 make_encode_exception(exceptionObject,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006765 encoding, unicode, startpos, endpos, reason);
6766 if (*exceptionObject != NULL)
6767 PyCodec_StrictErrors(*exceptionObject);
6768}
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006769
6770/* error handling callback helper:
6771 build arguments, call the callback and check the arguments,
6772 put the result into newpos and return the replacement string, which
6773 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006774static PyObject *
6775unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006776 PyObject **errorHandler,
6777 const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006778 PyObject *unicode, PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006779 Py_ssize_t startpos, Py_ssize_t endpos,
6780 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006781{
Serhiy Storchaka2d06e842015-12-25 19:53:18 +02006782 static const char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006783 Py_ssize_t len;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006784 PyObject *restuple;
6785 PyObject *resunicode;
6786
6787 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006788 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006789 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006790 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006791 }
6792
Benjamin Petersonbac79492012-01-14 13:34:47 -05006793 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006794 return NULL;
6795 len = PyUnicode_GET_LENGTH(unicode);
6796
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006797 make_encode_exception(exceptionObject,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006798 encoding, unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006799 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006800 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006801
Victor Stinnerde4ae3d2016-12-04 22:59:09 +01006802 restuple = PyObject_CallFunctionObjArgs(
6803 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006804 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006805 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006806 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006807 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00006808 Py_DECREF(restuple);
6809 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006810 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006811 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00006812 &resunicode, newpos)) {
6813 Py_DECREF(restuple);
6814 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006815 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006816 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6817 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6818 Py_DECREF(restuple);
6819 return NULL;
6820 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006821 if (*newpos<0)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006822 *newpos = len + *newpos;
6823 if (*newpos<0 || *newpos>len) {
Victor Stinnera33bce02014-07-04 22:47:46 +02006824 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00006825 Py_DECREF(restuple);
6826 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006827 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006828 Py_INCREF(resunicode);
6829 Py_DECREF(restuple);
6830 return resunicode;
6831}
6832
Alexander Belopolsky40018472011-02-26 01:02:56 +00006833static PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006834unicode_encode_ucs1(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006835 const char *errors,
Victor Stinner0030cd52015-09-24 14:45:00 +02006836 const Py_UCS4 limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006837{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006838 /* input state */
6839 Py_ssize_t pos=0, size;
6840 int kind;
6841 void *data;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006842 /* pointer into the output */
6843 char *str;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006844 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6845 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Victor Stinner50149202015-09-22 00:26:54 +02006846 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006847 PyObject *exc = NULL;
Victor Stinner50149202015-09-22 00:26:54 +02006848 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Victor Stinner6bd525b2015-10-09 13:10:05 +02006849 PyObject *rep = NULL;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006850 /* output object */
6851 _PyBytesWriter writer;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006852
Benjamin Petersonbac79492012-01-14 13:34:47 -05006853 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006854 return NULL;
6855 size = PyUnicode_GET_LENGTH(unicode);
6856 kind = PyUnicode_KIND(unicode);
6857 data = PyUnicode_DATA(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006858 /* allocate enough for a simple encoding without
6859 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00006860 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00006861 return PyBytes_FromStringAndSize(NULL, 0);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006862
6863 _PyBytesWriter_Init(&writer);
6864 str = _PyBytesWriter_Alloc(&writer, size);
6865 if (str == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00006866 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006867
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006868 while (pos < size) {
Victor Stinner0030cd52015-09-24 14:45:00 +02006869 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006870
Benjamin Peterson29060642009-01-31 22:14:21 +00006871 /* can we encode this? */
Victor Stinner0030cd52015-09-24 14:45:00 +02006872 if (ch < limit) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006873 /* no overflow check, because we know that the space is enough */
Victor Stinner0030cd52015-09-24 14:45:00 +02006874 *str++ = (char)ch;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006875 ++pos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006876 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006877 else {
Victor Stinner6bd525b2015-10-09 13:10:05 +02006878 Py_ssize_t newpos, i;
Benjamin Peterson29060642009-01-31 22:14:21 +00006879 /* startpos for collecting unencodable chars */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006880 Py_ssize_t collstart = pos;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006881 Py_ssize_t collend = collstart + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006882 /* find all unecodable characters */
Victor Stinner50149202015-09-22 00:26:54 +02006883
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006884 while ((collend < size) && (PyUnicode_READ(kind, data, collend) >= limit))
Benjamin Peterson29060642009-01-31 22:14:21 +00006885 ++collend;
Victor Stinner50149202015-09-22 00:26:54 +02006886
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006887 /* Only overallocate the buffer if it's not the last write */
6888 writer.overallocate = (collend < size);
6889
Benjamin Peterson29060642009-01-31 22:14:21 +00006890 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
Victor Stinner50149202015-09-22 00:26:54 +02006891 if (error_handler == _Py_ERROR_UNKNOWN)
Victor Stinner3d4226a2018-08-29 22:21:32 +02006892 error_handler = _Py_GetErrorHandler(errors);
Victor Stinner50149202015-09-22 00:26:54 +02006893
6894 switch (error_handler) {
6895 case _Py_ERROR_STRICT:
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006896 raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006897 goto onError;
Victor Stinner50149202015-09-22 00:26:54 +02006898
6899 case _Py_ERROR_REPLACE:
Victor Stinner01ada392015-10-01 21:54:51 +02006900 memset(str, '?', collend - collstart);
6901 str += (collend - collstart);
Stefan Krahf432a322017-08-21 13:09:59 +02006902 /* fall through */
Victor Stinner50149202015-09-22 00:26:54 +02006903 case _Py_ERROR_IGNORE:
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006904 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006905 break;
Victor Stinner50149202015-09-22 00:26:54 +02006906
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006907 case _Py_ERROR_BACKSLASHREPLACE:
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07006908 /* subtract preallocated bytes */
Victor Stinnerad771582015-10-09 12:38:53 +02006909 writer.min_size -= (collend - collstart);
6910 str = backslashreplace(&writer, str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006911 unicode, collstart, collend);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006912 if (str == NULL)
6913 goto onError;
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006914 pos = collend;
6915 break;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006916
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006917 case _Py_ERROR_XMLCHARREFREPLACE:
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07006918 /* subtract preallocated bytes */
Victor Stinnerad771582015-10-09 12:38:53 +02006919 writer.min_size -= (collend - collstart);
6920 str = xmlcharrefreplace(&writer, str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006921 unicode, collstart, collend);
6922 if (str == NULL)
6923 goto onError;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006924 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006925 break;
Victor Stinner50149202015-09-22 00:26:54 +02006926
Victor Stinnerc3713e92015-09-29 12:32:13 +02006927 case _Py_ERROR_SURROGATEESCAPE:
6928 for (i = collstart; i < collend; ++i) {
6929 ch = PyUnicode_READ(kind, data, i);
6930 if (ch < 0xdc80 || 0xdcff < ch) {
6931 /* Not a UTF-8b surrogate */
6932 break;
6933 }
6934 *str++ = (char)(ch - 0xdc00);
6935 ++pos;
6936 }
6937 if (i >= collend)
6938 break;
6939 collstart = pos;
6940 assert(collstart != collend);
Stefan Krahf432a322017-08-21 13:09:59 +02006941 /* fall through */
Victor Stinnerc3713e92015-09-29 12:32:13 +02006942
Benjamin Peterson29060642009-01-31 22:14:21 +00006943 default:
Victor Stinner6bd525b2015-10-09 13:10:05 +02006944 rep = unicode_encode_call_errorhandler(errors, &error_handler_obj,
6945 encoding, reason, unicode, &exc,
6946 collstart, collend, &newpos);
6947 if (rep == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006948 goto onError;
Victor Stinner0030cd52015-09-24 14:45:00 +02006949
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07006950 /* subtract preallocated bytes */
Xiang Zhangd04d8472016-11-23 19:34:01 +08006951 writer.min_size -= newpos - collstart;
Victor Stinnerad771582015-10-09 12:38:53 +02006952
Victor Stinner6bd525b2015-10-09 13:10:05 +02006953 if (PyBytes_Check(rep)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00006954 /* Directly copy bytes result to output. */
Victor Stinnerce179bf2015-10-09 12:57:22 +02006955 str = _PyBytesWriter_WriteBytes(&writer, str,
Victor Stinner6bd525b2015-10-09 13:10:05 +02006956 PyBytes_AS_STRING(rep),
6957 PyBytes_GET_SIZE(rep));
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006958 }
Victor Stinner6bd525b2015-10-09 13:10:05 +02006959 else {
6960 assert(PyUnicode_Check(rep));
Victor Stinner0030cd52015-09-24 14:45:00 +02006961
Victor Stinner6bd525b2015-10-09 13:10:05 +02006962 if (PyUnicode_READY(rep) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006963 goto onError;
Victor Stinner6bd525b2015-10-09 13:10:05 +02006964
Serhiy Storchaka99250d52016-11-23 15:13:00 +02006965 if (limit == 256 ?
6966 PyUnicode_KIND(rep) != PyUnicode_1BYTE_KIND :
6967 !PyUnicode_IS_ASCII(rep))
6968 {
6969 /* Not all characters are smaller than limit */
6970 raise_encode_exception(&exc, encoding, unicode,
6971 collstart, collend, reason);
6972 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00006973 }
Serhiy Storchaka99250d52016-11-23 15:13:00 +02006974 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
6975 str = _PyBytesWriter_WriteBytes(&writer, str,
6976 PyUnicode_DATA(rep),
6977 PyUnicode_GET_LENGTH(rep));
Benjamin Peterson29060642009-01-31 22:14:21 +00006978 }
Alexey Izbyshev74a307d2018-08-19 21:52:04 +03006979 if (str == NULL)
6980 goto onError;
6981
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006982 pos = newpos;
Victor Stinner6bd525b2015-10-09 13:10:05 +02006983 Py_CLEAR(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006984 }
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006985
6986 /* If overallocation was disabled, ensure that it was the last
6987 write. Otherwise, we missed an optimization */
6988 assert(writer.overallocate || pos == size);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006989 }
6990 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006991
Victor Stinner50149202015-09-22 00:26:54 +02006992 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006993 Py_XDECREF(exc);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006994 return _PyBytesWriter_Finish(&writer, str);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006995
6996 onError:
Victor Stinner6bd525b2015-10-09 13:10:05 +02006997 Py_XDECREF(rep);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006998 _PyBytesWriter_Dealloc(&writer);
Victor Stinner50149202015-09-22 00:26:54 +02006999 Py_XDECREF(error_handler_obj);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00007000 Py_XDECREF(exc);
7001 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007002}
7003
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007004/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007005PyObject *
7006PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03007007 Py_ssize_t size,
7008 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007009{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007010 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02007011 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007012 if (unicode == NULL)
7013 return NULL;
7014 result = unicode_encode_ucs1(unicode, errors, 256);
7015 Py_DECREF(unicode);
7016 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007017}
7018
Alexander Belopolsky40018472011-02-26 01:02:56 +00007019PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007020_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007021{
7022 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007023 PyErr_BadArgument();
7024 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007025 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007026 if (PyUnicode_READY(unicode) == -1)
7027 return NULL;
7028 /* Fast path: if it is a one-byte string, construct
7029 bytes object directly. */
7030 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
7031 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
7032 PyUnicode_GET_LENGTH(unicode));
7033 /* Non-Latin-1 characters present. Defer to above function to
7034 raise the exception. */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007035 return unicode_encode_ucs1(unicode, errors, 256);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007036}
7037
7038PyObject*
7039PyUnicode_AsLatin1String(PyObject *unicode)
7040{
7041 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007042}
7043
7044/* --- 7-bit ASCII Codec -------------------------------------------------- */
7045
Alexander Belopolsky40018472011-02-26 01:02:56 +00007046PyObject *
7047PyUnicode_DecodeASCII(const char *s,
7048 Py_ssize_t size,
7049 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007050{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007051 const char *starts = s;
Inada Naoki770847a2019-06-24 12:30:24 +09007052 const char *e = s + size;
Victor Stinnerf96418d2015-09-21 23:06:27 +02007053 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007054 PyObject *exc = NULL;
Victor Stinnerf96418d2015-09-21 23:06:27 +02007055 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Tim Petersced69f82003-09-16 20:30:58 +00007056
Guido van Rossumd57fd912000-03-10 22:53:23 +00007057 if (size == 0)
Serhiy Storchaka678db842013-01-26 12:16:36 +02007058 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01007059
Guido van Rossumd57fd912000-03-10 22:53:23 +00007060 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Victor Stinner702c7342011-10-05 13:50:52 +02007061 if (size == 1 && (unsigned char)s[0] < 128)
7062 return get_latin1_char((unsigned char)s[0]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007063
Inada Naoki770847a2019-06-24 12:30:24 +09007064 // Shortcut for simple case
7065 PyObject *u = PyUnicode_New(size, 127);
7066 if (u == NULL) {
Victor Stinner8f674cc2013-04-17 23:02:17 +02007067 return NULL;
Inada Naoki770847a2019-06-24 12:30:24 +09007068 }
7069 Py_ssize_t outpos = ascii_decode(s, e, PyUnicode_DATA(u));
7070 if (outpos == size) {
7071 return u;
7072 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02007073
Inada Naoki770847a2019-06-24 12:30:24 +09007074 _PyUnicodeWriter writer;
7075 _PyUnicodeWriter_InitWithBuffer(&writer, u);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007076 writer.pos = outpos;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02007077
Inada Naoki770847a2019-06-24 12:30:24 +09007078 s += outpos;
7079 int kind = writer.kind;
7080 void *data = writer.data;
7081 Py_ssize_t startinpos, endinpos;
7082
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007083 while (s < e) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02007084 unsigned char c = (unsigned char)*s;
Benjamin Peterson29060642009-01-31 22:14:21 +00007085 if (c < 128) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007086 PyUnicode_WRITE(kind, data, writer.pos, c);
7087 writer.pos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00007088 ++s;
Victor Stinnerf96418d2015-09-21 23:06:27 +02007089 continue;
Benjamin Peterson29060642009-01-31 22:14:21 +00007090 }
Victor Stinnerf96418d2015-09-21 23:06:27 +02007091
7092 /* byte outsize range 0x00..0x7f: call the error handler */
7093
7094 if (error_handler == _Py_ERROR_UNKNOWN)
Victor Stinner3d4226a2018-08-29 22:21:32 +02007095 error_handler = _Py_GetErrorHandler(errors);
Victor Stinnerf96418d2015-09-21 23:06:27 +02007096
7097 switch (error_handler)
7098 {
7099 case _Py_ERROR_REPLACE:
7100 case _Py_ERROR_SURROGATEESCAPE:
7101 /* Fast-path: the error handler only writes one character,
Victor Stinnerca9381e2015-09-22 00:58:32 +02007102 but we may switch to UCS2 at the first write */
7103 if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
7104 goto onError;
7105 kind = writer.kind;
7106 data = writer.data;
Victor Stinnerf96418d2015-09-21 23:06:27 +02007107
7108 if (error_handler == _Py_ERROR_REPLACE)
7109 PyUnicode_WRITE(kind, data, writer.pos, 0xfffd);
7110 else
7111 PyUnicode_WRITE(kind, data, writer.pos, c + 0xdc00);
7112 writer.pos++;
7113 ++s;
7114 break;
7115
7116 case _Py_ERROR_IGNORE:
7117 ++s;
7118 break;
7119
7120 default:
Benjamin Peterson29060642009-01-31 22:14:21 +00007121 startinpos = s-starts;
7122 endinpos = startinpos + 1;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007123 if (unicode_decode_call_errorhandler_writer(
Victor Stinnerf96418d2015-09-21 23:06:27 +02007124 errors, &error_handler_obj,
Benjamin Peterson29060642009-01-31 22:14:21 +00007125 "ascii", "ordinal not in range(128)",
7126 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007127 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00007128 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007129 kind = writer.kind;
7130 data = writer.data;
Benjamin Peterson29060642009-01-31 22:14:21 +00007131 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007132 }
Victor Stinnerf96418d2015-09-21 23:06:27 +02007133 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007134 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007135 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00007136
Benjamin Peterson29060642009-01-31 22:14:21 +00007137 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007138 _PyUnicodeWriter_Dealloc(&writer);
Victor Stinnerf96418d2015-09-21 23:06:27 +02007139 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007140 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007141 return NULL;
7142}
7143
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007144/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007145PyObject *
7146PyUnicode_EncodeASCII(const Py_UNICODE *p,
7147 Py_ssize_t size,
7148 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007149{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007150 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02007151 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007152 if (unicode == NULL)
7153 return NULL;
7154 result = unicode_encode_ucs1(unicode, errors, 128);
7155 Py_DECREF(unicode);
7156 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007157}
7158
Alexander Belopolsky40018472011-02-26 01:02:56 +00007159PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007160_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007161{
7162 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007163 PyErr_BadArgument();
7164 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007165 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007166 if (PyUnicode_READY(unicode) == -1)
7167 return NULL;
7168 /* Fast path: if it is an ASCII-only string, construct bytes object
7169 directly. Else defer to above function to raise the exception. */
Victor Stinneraf037572013-04-14 18:44:10 +02007170 if (PyUnicode_IS_ASCII(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007171 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
7172 PyUnicode_GET_LENGTH(unicode));
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007173 return unicode_encode_ucs1(unicode, errors, 128);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007174}
7175
7176PyObject *
7177PyUnicode_AsASCIIString(PyObject *unicode)
7178{
7179 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007180}
7181
Steve Dowercc16be82016-09-08 10:35:16 -07007182#ifdef MS_WINDOWS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007183
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007184/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007185
/* When an int is narrower than size_t, a buffer length may not fit in the
   int-sized chunk handed to the code page helpers; NEED_RETRY enables the
   chunked loop in decode_code_page_stateful(). */
#if SIZEOF_SIZE_T > SIZEOF_INT
#define NEED_RETRY
#endif

/* Fallback definition for headers that do not provide the flag. */
#if !defined(WC_ERR_INVALID_CHARS)
#  define WC_ERR_INVALID_CHARS 0x0080
#endif
7193
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007194static const char*
Victor Stinner3a50e702011-10-18 21:21:00 +02007195code_page_name(UINT code_page, PyObject **obj)
7196{
7197 *obj = NULL;
7198 if (code_page == CP_ACP)
7199 return "mbcs";
7200 if (code_page == CP_UTF7)
7201 return "CP_UTF7";
7202 if (code_page == CP_UTF8)
7203 return "CP_UTF8";
7204
7205 *obj = PyBytes_FromFormat("cp%u", code_page);
7206 if (*obj == NULL)
7207 return NULL;
7208 return PyBytes_AS_STRING(*obj);
7209}
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007210
Victor Stinner3a50e702011-10-18 21:21:00 +02007211static DWORD
7212decode_code_page_flags(UINT code_page)
7213{
7214 if (code_page == CP_UTF7) {
7215 /* The CP_UTF7 decoder only supports flags=0 */
7216 return 0;
7217 }
7218 else
7219 return MB_ERR_INVALID_CHARS;
7220}
7221
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007222/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007223 * Decode a byte string from a Windows code page into unicode object in strict
7224 * mode.
7225 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007226 * Returns consumed size if succeed, returns -2 on decode error, or raise an
7227 * OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007228 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007229static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007230decode_code_page_strict(UINT code_page,
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007231 wchar_t **buf,
7232 Py_ssize_t *bufsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02007233 const char *in,
7234 int insize)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007235{
Serhiy Storchakac1e2c282019-03-20 21:45:18 +02007236 DWORD flags = MB_ERR_INVALID_CHARS;
Victor Stinner24729f32011-11-10 20:31:37 +01007237 wchar_t *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007238 DWORD outsize;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007239
7240 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007241 assert(insize > 0);
Serhiy Storchakac1e2c282019-03-20 21:45:18 +02007242 while ((outsize = MultiByteToWideChar(code_page, flags,
7243 in, insize, NULL, 0)) <= 0)
7244 {
7245 if (!flags || GetLastError() != ERROR_INVALID_FLAGS) {
7246 goto error;
7247 }
7248 /* For some code pages (e.g. UTF-7) flags must be set to 0. */
7249 flags = 0;
7250 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007251
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007252 /* Extend a wchar_t* buffer */
7253 Py_ssize_t n = *bufsize; /* Get the current length */
7254 if (widechar_resize(buf, bufsize, n + outsize) < 0) {
7255 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007256 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007257 out = *buf + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007258
7259 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007260 outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
7261 if (outsize <= 0)
7262 goto error;
7263 return insize;
Victor Stinner554f3f02010-06-16 23:33:54 +00007264
Victor Stinner3a50e702011-10-18 21:21:00 +02007265error:
7266 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7267 return -2;
7268 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007269 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007270}
7271
Victor Stinner3a50e702011-10-18 21:21:00 +02007272/*
7273 * Decode a byte string from a code page into unicode object with an error
7274 * handler.
7275 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007276 * Returns consumed size if succeed, or raise an OSError or
Victor Stinner3a50e702011-10-18 21:21:00 +02007277 * UnicodeDecodeError exception and returns -1 on error.
7278 */
7279static int
7280decode_code_page_errors(UINT code_page,
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007281 wchar_t **buf,
7282 Py_ssize_t *bufsize,
Victor Stinner76a31a62011-11-04 00:05:13 +01007283 const char *in, const int size,
Victor Stinner7d00cc12014-03-17 23:08:06 +01007284 const char *errors, int final)
Victor Stinner3a50e702011-10-18 21:21:00 +02007285{
7286 const char *startin = in;
7287 const char *endin = in + size;
Serhiy Storchakac1e2c282019-03-20 21:45:18 +02007288 DWORD flags = MB_ERR_INVALID_CHARS;
Victor Stinner3a50e702011-10-18 21:21:00 +02007289 /* Ideally, we should get reason from FormatMessage. This is the Windows
7290 2000 English version of the message. */
7291 const char *reason = "No mapping for the Unicode character exists "
7292 "in the target code page.";
7293 /* each step cannot decode more than 1 character, but a character can be
7294 represented as a surrogate pair */
Serhiy Storchaka4013c172018-12-03 10:36:45 +02007295 wchar_t buffer[2], *out;
Victor Stinner9f067f42013-06-05 00:21:31 +02007296 int insize;
7297 Py_ssize_t outsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007298 PyObject *errorHandler = NULL;
7299 PyObject *exc = NULL;
7300 PyObject *encoding_obj = NULL;
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007301 const char *encoding;
Victor Stinner3a50e702011-10-18 21:21:00 +02007302 DWORD err;
7303 int ret = -1;
7304
7305 assert(size > 0);
7306
7307 encoding = code_page_name(code_page, &encoding_obj);
7308 if (encoding == NULL)
7309 return -1;
7310
Victor Stinner7d00cc12014-03-17 23:08:06 +01007311 if ((errors == NULL || strcmp(errors, "strict") == 0) && final) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007312 /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
7313 UnicodeDecodeError. */
7314 make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
7315 if (exc != NULL) {
7316 PyCodec_StrictErrors(exc);
7317 Py_CLEAR(exc);
7318 }
7319 goto error;
7320 }
7321
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007322 /* Extend a wchar_t* buffer */
7323 Py_ssize_t n = *bufsize; /* Get the current length */
7324 if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7325 PyErr_NoMemory();
7326 goto error;
Victor Stinner3a50e702011-10-18 21:21:00 +02007327 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007328 if (widechar_resize(buf, bufsize, n + size * Py_ARRAY_LENGTH(buffer)) < 0) {
7329 goto error;
Victor Stinner3a50e702011-10-18 21:21:00 +02007330 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007331 out = *buf + n;
Victor Stinner3a50e702011-10-18 21:21:00 +02007332
7333 /* Decode the byte string character per character */
Victor Stinner3a50e702011-10-18 21:21:00 +02007334 while (in < endin)
7335 {
7336 /* Decode a character */
7337 insize = 1;
7338 do
7339 {
7340 outsize = MultiByteToWideChar(code_page, flags,
7341 in, insize,
7342 buffer, Py_ARRAY_LENGTH(buffer));
7343 if (outsize > 0)
7344 break;
7345 err = GetLastError();
Serhiy Storchakac1e2c282019-03-20 21:45:18 +02007346 if (err == ERROR_INVALID_FLAGS && flags) {
7347 /* For some code pages (e.g. UTF-7) flags must be set to 0. */
7348 flags = 0;
7349 continue;
7350 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007351 if (err != ERROR_NO_UNICODE_TRANSLATION
7352 && err != ERROR_INSUFFICIENT_BUFFER)
7353 {
7354 PyErr_SetFromWindowsErr(0);
7355 goto error;
7356 }
7357 insize++;
7358 }
7359 /* 4=maximum length of a UTF-8 sequence */
7360 while (insize <= 4 && (in + insize) <= endin);
7361
7362 if (outsize <= 0) {
7363 Py_ssize_t startinpos, endinpos, outpos;
7364
Victor Stinner7d00cc12014-03-17 23:08:06 +01007365 /* last character in partial decode? */
7366 if (in + insize >= endin && !final)
7367 break;
7368
Victor Stinner3a50e702011-10-18 21:21:00 +02007369 startinpos = in - startin;
7370 endinpos = startinpos + 1;
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007371 outpos = out - *buf;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007372 if (unicode_decode_call_errorhandler_wchar(
Victor Stinner3a50e702011-10-18 21:21:00 +02007373 errors, &errorHandler,
7374 encoding, reason,
7375 &startin, &endin, &startinpos, &endinpos, &exc, &in,
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007376 buf, bufsize, &outpos))
Victor Stinner3a50e702011-10-18 21:21:00 +02007377 {
7378 goto error;
7379 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007380 out = *buf + outpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007381 }
7382 else {
7383 in += insize;
7384 memcpy(out, buffer, outsize * sizeof(wchar_t));
7385 out += outsize;
7386 }
7387 }
7388
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007389 /* Shrink the buffer */
7390 assert(out - *buf <= *bufsize);
7391 *bufsize = out - *buf;
Victor Stinnere1f17c62014-07-25 14:03:03 +02007392 /* (in - startin) <= size and size is an int */
7393 ret = Py_SAFE_DOWNCAST(in - startin, Py_ssize_t, int);
Victor Stinner3a50e702011-10-18 21:21:00 +02007394
7395error:
7396 Py_XDECREF(encoding_obj);
7397 Py_XDECREF(errorHandler);
7398 Py_XDECREF(exc);
7399 return ret;
7400}
7401
Victor Stinner3a50e702011-10-18 21:21:00 +02007402static PyObject *
7403decode_code_page_stateful(int code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007404 const char *s, Py_ssize_t size,
7405 const char *errors, Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007406{
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007407 wchar_t *buf = NULL;
7408 Py_ssize_t bufsize = 0;
Victor Stinner76a31a62011-11-04 00:05:13 +01007409 int chunk_size, final, converted, done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007410
Victor Stinner3a50e702011-10-18 21:21:00 +02007411 if (code_page < 0) {
7412 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7413 return NULL;
7414 }
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03007415 if (size < 0) {
7416 PyErr_BadInternalCall();
7417 return NULL;
7418 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007419
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007420 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00007421 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007422
Victor Stinner76a31a62011-11-04 00:05:13 +01007423 do
7424 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007425#ifdef NEED_RETRY
Victor Stinner76a31a62011-11-04 00:05:13 +01007426 if (size > INT_MAX) {
7427 chunk_size = INT_MAX;
7428 final = 0;
7429 done = 0;
7430 }
7431 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007432#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007433 {
7434 chunk_size = (int)size;
7435 final = (consumed == NULL);
7436 done = 1;
7437 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007438
Victor Stinner76a31a62011-11-04 00:05:13 +01007439 if (chunk_size == 0 && done) {
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007440 if (buf != NULL)
Victor Stinner76a31a62011-11-04 00:05:13 +01007441 break;
Serhiy Storchaka678db842013-01-26 12:16:36 +02007442 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner76a31a62011-11-04 00:05:13 +01007443 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007444
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007445 converted = decode_code_page_strict(code_page, &buf, &bufsize,
Victor Stinner76a31a62011-11-04 00:05:13 +01007446 s, chunk_size);
7447 if (converted == -2)
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007448 converted = decode_code_page_errors(code_page, &buf, &bufsize,
Victor Stinner76a31a62011-11-04 00:05:13 +01007449 s, chunk_size,
Victor Stinner7d00cc12014-03-17 23:08:06 +01007450 errors, final);
7451 assert(converted != 0 || done);
Victor Stinner76a31a62011-11-04 00:05:13 +01007452
7453 if (converted < 0) {
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007454 PyMem_Free(buf);
Victor Stinner76a31a62011-11-04 00:05:13 +01007455 return NULL;
7456 }
7457
7458 if (consumed)
7459 *consumed += converted;
7460
7461 s += converted;
7462 size -= converted;
7463 } while (!done);
Victor Stinner3a50e702011-10-18 21:21:00 +02007464
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007465 PyObject *v = PyUnicode_FromWideChar(buf, bufsize);
7466 PyMem_Free(buf);
7467 return v;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007468}
7469
Alexander Belopolsky40018472011-02-26 01:02:56 +00007470PyObject *
Victor Stinner3a50e702011-10-18 21:21:00 +02007471PyUnicode_DecodeCodePageStateful(int code_page,
7472 const char *s,
7473 Py_ssize_t size,
7474 const char *errors,
7475 Py_ssize_t *consumed)
7476{
7477 return decode_code_page_stateful(code_page, s, size, errors, consumed);
7478}
7479
7480PyObject *
7481PyUnicode_DecodeMBCSStateful(const char *s,
7482 Py_ssize_t size,
7483 const char *errors,
7484 Py_ssize_t *consumed)
7485{
7486 return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
7487}
7488
7489PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00007490PyUnicode_DecodeMBCS(const char *s,
7491 Py_ssize_t size,
7492 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007493{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007494 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
7495}
7496
Victor Stinner3a50e702011-10-18 21:21:00 +02007497static DWORD
7498encode_code_page_flags(UINT code_page, const char *errors)
7499{
7500 if (code_page == CP_UTF8) {
Steve Dower3e96f322015-03-02 08:01:10 -08007501 return WC_ERR_INVALID_CHARS;
Victor Stinner3a50e702011-10-18 21:21:00 +02007502 }
7503 else if (code_page == CP_UTF7) {
7504 /* CP_UTF7 only supports flags=0 */
7505 return 0;
7506 }
7507 else {
7508 if (errors != NULL && strcmp(errors, "replace") == 0)
7509 return 0;
7510 else
7511 return WC_NO_BEST_FIT_CHARS;
7512 }
7513}
7514
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007515/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007516 * Encode a Unicode string to a Windows code page into a byte string in strict
7517 * mode.
7518 *
7519 * Returns consumed characters if succeed, returns -2 on encode error, or raise
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007520 * an OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007521 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007522static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007523encode_code_page_strict(UINT code_page, PyObject **outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007524 PyObject *unicode, Py_ssize_t offset, int len,
Victor Stinner3a50e702011-10-18 21:21:00 +02007525 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007526{
Victor Stinner554f3f02010-06-16 23:33:54 +00007527 BOOL usedDefaultChar = FALSE;
Victor Stinner3a50e702011-10-18 21:21:00 +02007528 BOOL *pusedDefaultChar = &usedDefaultChar;
7529 int outsize;
Victor Stinner24729f32011-11-10 20:31:37 +01007530 wchar_t *p;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007531 Py_ssize_t size;
Victor Stinner3a50e702011-10-18 21:21:00 +02007532 const DWORD flags = encode_code_page_flags(code_page, NULL);
7533 char *out;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007534 /* Create a substring so that we can get the UTF-16 representation
7535 of just the slice under consideration. */
7536 PyObject *substring;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007537
Martin v. Löwis3d325192011-11-04 18:23:06 +01007538 assert(len > 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007539
Victor Stinner3a50e702011-10-18 21:21:00 +02007540 if (code_page != CP_UTF8 && code_page != CP_UTF7)
Victor Stinner554f3f02010-06-16 23:33:54 +00007541 pusedDefaultChar = &usedDefaultChar;
Victor Stinner3a50e702011-10-18 21:21:00 +02007542 else
Victor Stinner554f3f02010-06-16 23:33:54 +00007543 pusedDefaultChar = NULL;
Victor Stinner554f3f02010-06-16 23:33:54 +00007544
Victor Stinner2fc507f2011-11-04 20:06:39 +01007545 substring = PyUnicode_Substring(unicode, offset, offset+len);
7546 if (substring == NULL)
7547 return -1;
7548 p = PyUnicode_AsUnicodeAndSize(substring, &size);
7549 if (p == NULL) {
7550 Py_DECREF(substring);
7551 return -1;
7552 }
Victor Stinner9f067f42013-06-05 00:21:31 +02007553 assert(size <= INT_MAX);
Martin v. Löwis3d325192011-11-04 18:23:06 +01007554
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007555 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007556 outsize = WideCharToMultiByte(code_page, flags,
Victor Stinner9f067f42013-06-05 00:21:31 +02007557 p, (int)size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007558 NULL, 0,
7559 NULL, pusedDefaultChar);
7560 if (outsize <= 0)
7561 goto error;
7562 /* If we used a default char, then we failed! */
Victor Stinner2fc507f2011-11-04 20:06:39 +01007563 if (pusedDefaultChar && *pusedDefaultChar) {
7564 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007565 return -2;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007566 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007567
Victor Stinner3a50e702011-10-18 21:21:00 +02007568 if (*outbytes == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007569 /* Create string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007570 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007571 if (*outbytes == NULL) {
7572 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007573 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007574 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007575 out = PyBytes_AS_STRING(*outbytes);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007576 }
7577 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007578 /* Extend string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007579 const Py_ssize_t n = PyBytes_Size(*outbytes);
7580 if (outsize > PY_SSIZE_T_MAX - n) {
7581 PyErr_NoMemory();
Victor Stinner2fc507f2011-11-04 20:06:39 +01007582 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007583 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007584 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007585 if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
7586 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007587 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007588 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007589 out = PyBytes_AS_STRING(*outbytes) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007590 }
7591
7592 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007593 outsize = WideCharToMultiByte(code_page, flags,
Victor Stinner9f067f42013-06-05 00:21:31 +02007594 p, (int)size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007595 out, outsize,
7596 NULL, pusedDefaultChar);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007597 Py_CLEAR(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007598 if (outsize <= 0)
7599 goto error;
7600 if (pusedDefaultChar && *pusedDefaultChar)
7601 return -2;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007602 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00007603
Victor Stinner3a50e702011-10-18 21:21:00 +02007604error:
Victor Stinner2fc507f2011-11-04 20:06:39 +01007605 Py_XDECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007606 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7607 return -2;
7608 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007609 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007610}
7611
Victor Stinner3a50e702011-10-18 21:21:00 +02007612/*
Serhiy Storchakad65c9492015-11-02 14:10:23 +02007613 * Encode a Unicode string to a Windows code page into a byte string using an
Victor Stinner3a50e702011-10-18 21:21:00 +02007614 * error handler.
7615 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007616 * Returns consumed characters if succeed, or raise an OSError and returns
Victor Stinner3a50e702011-10-18 21:21:00 +02007617 * -1 on other error.
7618 */
7619static int
7620encode_code_page_errors(UINT code_page, PyObject **outbytes,
Victor Stinner7581cef2011-11-03 22:32:33 +01007621 PyObject *unicode, Py_ssize_t unicode_offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007622 Py_ssize_t insize, const char* errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007623{
Victor Stinner3a50e702011-10-18 21:21:00 +02007624 const DWORD flags = encode_code_page_flags(code_page, errors);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007625 Py_ssize_t pos = unicode_offset;
7626 Py_ssize_t endin = unicode_offset + insize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007627 /* Ideally, we should get reason from FormatMessage. This is the Windows
7628 2000 English version of the message. */
7629 const char *reason = "invalid character";
7630 /* 4=maximum length of a UTF-8 sequence */
7631 char buffer[4];
7632 BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
7633 Py_ssize_t outsize;
7634 char *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007635 PyObject *errorHandler = NULL;
7636 PyObject *exc = NULL;
7637 PyObject *encoding_obj = NULL;
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007638 const char *encoding;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007639 Py_ssize_t newpos, newoutsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007640 PyObject *rep;
7641 int ret = -1;
7642
7643 assert(insize > 0);
7644
7645 encoding = code_page_name(code_page, &encoding_obj);
7646 if (encoding == NULL)
7647 return -1;
7648
7649 if (errors == NULL || strcmp(errors, "strict") == 0) {
7650 /* The last error was ERROR_NO_UNICODE_TRANSLATION,
7651 then we raise a UnicodeEncodeError. */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007652 make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
Victor Stinner3a50e702011-10-18 21:21:00 +02007653 if (exc != NULL) {
7654 PyCodec_StrictErrors(exc);
7655 Py_DECREF(exc);
7656 }
7657 Py_XDECREF(encoding_obj);
7658 return -1;
7659 }
7660
7661 if (code_page != CP_UTF8 && code_page != CP_UTF7)
7662 pusedDefaultChar = &usedDefaultChar;
7663 else
7664 pusedDefaultChar = NULL;
7665
7666 if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
7667 PyErr_NoMemory();
7668 goto error;
7669 }
7670 outsize = insize * Py_ARRAY_LENGTH(buffer);
7671
7672 if (*outbytes == NULL) {
7673 /* Create string object */
7674 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7675 if (*outbytes == NULL)
7676 goto error;
7677 out = PyBytes_AS_STRING(*outbytes);
7678 }
7679 else {
7680 /* Extend string object */
7681 Py_ssize_t n = PyBytes_Size(*outbytes);
7682 if (n > PY_SSIZE_T_MAX - outsize) {
7683 PyErr_NoMemory();
7684 goto error;
7685 }
7686 if (_PyBytes_Resize(outbytes, n + outsize) < 0)
7687 goto error;
7688 out = PyBytes_AS_STRING(*outbytes) + n;
7689 }
7690
7691 /* Encode the string character per character */
Martin v. Löwis3d325192011-11-04 18:23:06 +01007692 while (pos < endin)
Victor Stinner3a50e702011-10-18 21:21:00 +02007693 {
Victor Stinner2fc507f2011-11-04 20:06:39 +01007694 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
7695 wchar_t chars[2];
7696 int charsize;
7697 if (ch < 0x10000) {
7698 chars[0] = (wchar_t)ch;
7699 charsize = 1;
7700 }
7701 else {
Victor Stinner76df43d2012-10-30 01:42:39 +01007702 chars[0] = Py_UNICODE_HIGH_SURROGATE(ch);
7703 chars[1] = Py_UNICODE_LOW_SURROGATE(ch);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007704 charsize = 2;
7705 }
7706
Victor Stinner3a50e702011-10-18 21:21:00 +02007707 outsize = WideCharToMultiByte(code_page, flags,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007708 chars, charsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02007709 buffer, Py_ARRAY_LENGTH(buffer),
7710 NULL, pusedDefaultChar);
7711 if (outsize > 0) {
7712 if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
7713 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007714 pos++;
Victor Stinner3a50e702011-10-18 21:21:00 +02007715 memcpy(out, buffer, outsize);
7716 out += outsize;
7717 continue;
7718 }
7719 }
7720 else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
7721 PyErr_SetFromWindowsErr(0);
7722 goto error;
7723 }
7724
Victor Stinner3a50e702011-10-18 21:21:00 +02007725 rep = unicode_encode_call_errorhandler(
7726 errors, &errorHandler, encoding, reason,
Victor Stinner7581cef2011-11-03 22:32:33 +01007727 unicode, &exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007728 pos, pos + 1, &newpos);
Victor Stinner3a50e702011-10-18 21:21:00 +02007729 if (rep == NULL)
7730 goto error;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007731 pos = newpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007732
7733 if (PyBytes_Check(rep)) {
7734 outsize = PyBytes_GET_SIZE(rep);
7735 if (outsize != 1) {
7736 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7737 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7738 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7739 Py_DECREF(rep);
7740 goto error;
7741 }
7742 out = PyBytes_AS_STRING(*outbytes) + offset;
7743 }
7744 memcpy(out, PyBytes_AS_STRING(rep), outsize);
7745 out += outsize;
7746 }
7747 else {
7748 Py_ssize_t i;
7749 enum PyUnicode_Kind kind;
7750 void *data;
7751
Benjamin Petersonbac79492012-01-14 13:34:47 -05007752 if (PyUnicode_READY(rep) == -1) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007753 Py_DECREF(rep);
7754 goto error;
7755 }
7756
7757 outsize = PyUnicode_GET_LENGTH(rep);
7758 if (outsize != 1) {
7759 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7760 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7761 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7762 Py_DECREF(rep);
7763 goto error;
7764 }
7765 out = PyBytes_AS_STRING(*outbytes) + offset;
7766 }
7767 kind = PyUnicode_KIND(rep);
7768 data = PyUnicode_DATA(rep);
7769 for (i=0; i < outsize; i++) {
7770 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7771 if (ch > 127) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007772 raise_encode_exception(&exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007773 encoding, unicode,
7774 pos, pos + 1,
Victor Stinner3a50e702011-10-18 21:21:00 +02007775 "unable to encode error handler result to ASCII");
7776 Py_DECREF(rep);
7777 goto error;
7778 }
7779 *out = (unsigned char)ch;
7780 out++;
7781 }
7782 }
7783 Py_DECREF(rep);
7784 }
7785 /* write a NUL byte */
7786 *out = 0;
7787 outsize = out - PyBytes_AS_STRING(*outbytes);
7788 assert(outsize <= PyBytes_GET_SIZE(*outbytes));
7789 if (_PyBytes_Resize(outbytes, outsize) < 0)
7790 goto error;
7791 ret = 0;
7792
7793error:
7794 Py_XDECREF(encoding_obj);
7795 Py_XDECREF(errorHandler);
7796 Py_XDECREF(exc);
7797 return ret;
7798}
7799
Victor Stinner3a50e702011-10-18 21:21:00 +02007800static PyObject *
7801encode_code_page(int code_page,
Victor Stinner7581cef2011-11-03 22:32:33 +01007802 PyObject *unicode,
Victor Stinner3a50e702011-10-18 21:21:00 +02007803 const char *errors)
7804{
Martin v. Löwis3d325192011-11-04 18:23:06 +01007805 Py_ssize_t len;
Victor Stinner3a50e702011-10-18 21:21:00 +02007806 PyObject *outbytes = NULL;
Victor Stinner7581cef2011-11-03 22:32:33 +01007807 Py_ssize_t offset;
Victor Stinner76a31a62011-11-04 00:05:13 +01007808 int chunk_len, ret, done;
Victor Stinner7581cef2011-11-03 22:32:33 +01007809
Victor Stinner29dacf22015-01-26 16:41:32 +01007810 if (!PyUnicode_Check(unicode)) {
7811 PyErr_BadArgument();
7812 return NULL;
7813 }
7814
Benjamin Petersonbac79492012-01-14 13:34:47 -05007815 if (PyUnicode_READY(unicode) == -1)
Victor Stinner2fc507f2011-11-04 20:06:39 +01007816 return NULL;
7817 len = PyUnicode_GET_LENGTH(unicode);
Guido van Rossum03e29f12000-05-04 15:52:20 +00007818
Victor Stinner3a50e702011-10-18 21:21:00 +02007819 if (code_page < 0) {
7820 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7821 return NULL;
7822 }
7823
Martin v. Löwis3d325192011-11-04 18:23:06 +01007824 if (len == 0)
Victor Stinner76a31a62011-11-04 00:05:13 +01007825 return PyBytes_FromStringAndSize(NULL, 0);
7826
Victor Stinner7581cef2011-11-03 22:32:33 +01007827 offset = 0;
7828 do
7829 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007830#ifdef NEED_RETRY
Victor Stinner2fc507f2011-11-04 20:06:39 +01007831 /* UTF-16 encoding may double the size, so use only INT_MAX/2
Martin v. Löwis3d325192011-11-04 18:23:06 +01007832 chunks. */
7833 if (len > INT_MAX/2) {
7834 chunk_len = INT_MAX/2;
Victor Stinner76a31a62011-11-04 00:05:13 +01007835 done = 0;
7836 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007837 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007838#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007839 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007840 chunk_len = (int)len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007841 done = 1;
7842 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007843
Victor Stinner76a31a62011-11-04 00:05:13 +01007844 ret = encode_code_page_strict(code_page, &outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007845 unicode, offset, chunk_len,
Victor Stinner76a31a62011-11-04 00:05:13 +01007846 errors);
7847 if (ret == -2)
7848 ret = encode_code_page_errors(code_page, &outbytes,
7849 unicode, offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007850 chunk_len, errors);
Victor Stinner7581cef2011-11-03 22:32:33 +01007851 if (ret < 0) {
7852 Py_XDECREF(outbytes);
7853 return NULL;
7854 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007855
Victor Stinner7581cef2011-11-03 22:32:33 +01007856 offset += chunk_len;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007857 len -= chunk_len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007858 } while (!done);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007859
Victor Stinner3a50e702011-10-18 21:21:00 +02007860 return outbytes;
7861}
7862
7863PyObject *
7864PyUnicode_EncodeMBCS(const Py_UNICODE *p,
7865 Py_ssize_t size,
7866 const char *errors)
7867{
Victor Stinner7581cef2011-11-03 22:32:33 +01007868 PyObject *unicode, *res;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02007869 unicode = PyUnicode_FromWideChar(p, size);
Victor Stinner7581cef2011-11-03 22:32:33 +01007870 if (unicode == NULL)
7871 return NULL;
7872 res = encode_code_page(CP_ACP, unicode, errors);
7873 Py_DECREF(unicode);
7874 return res;
Victor Stinner3a50e702011-10-18 21:21:00 +02007875}
7876
7877PyObject *
7878PyUnicode_EncodeCodePage(int code_page,
7879 PyObject *unicode,
7880 const char *errors)
7881{
Victor Stinner7581cef2011-11-03 22:32:33 +01007882 return encode_code_page(code_page, unicode, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007883}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007884
Alexander Belopolsky40018472011-02-26 01:02:56 +00007885PyObject *
7886PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007887{
Victor Stinner7581cef2011-11-03 22:32:33 +01007888 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007889}
7890
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007891#undef NEED_RETRY
7892
Steve Dowercc16be82016-09-08 10:35:16 -07007893#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007894
Guido van Rossumd57fd912000-03-10 22:53:23 +00007895/* --- Character Mapping Codec -------------------------------------------- */
7896
Victor Stinnerfb161b12013-04-18 01:44:27 +02007897static int
7898charmap_decode_string(const char *s,
7899 Py_ssize_t size,
7900 PyObject *mapping,
7901 const char *errors,
7902 _PyUnicodeWriter *writer)
7903{
7904 const char *starts = s;
7905 const char *e;
7906 Py_ssize_t startinpos, endinpos;
7907 PyObject *errorHandler = NULL, *exc = NULL;
7908 Py_ssize_t maplen;
7909 enum PyUnicode_Kind mapkind;
7910 void *mapdata;
7911 Py_UCS4 x;
7912 unsigned char ch;
7913
7914 if (PyUnicode_READY(mapping) == -1)
7915 return -1;
7916
7917 maplen = PyUnicode_GET_LENGTH(mapping);
7918 mapdata = PyUnicode_DATA(mapping);
7919 mapkind = PyUnicode_KIND(mapping);
7920
7921 e = s + size;
7922
7923 if (mapkind == PyUnicode_1BYTE_KIND && maplen >= 256) {
7924 /* fast-path for cp037, cp500 and iso8859_1 encodings. iso8859_1
7925 * is disabled in encoding aliases, latin1 is preferred because
7926 * its implementation is faster. */
7927 Py_UCS1 *mapdata_ucs1 = (Py_UCS1 *)mapdata;
7928 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
7929 Py_UCS4 maxchar = writer->maxchar;
7930
7931 assert (writer->kind == PyUnicode_1BYTE_KIND);
7932 while (s < e) {
7933 ch = *s;
7934 x = mapdata_ucs1[ch];
7935 if (x > maxchar) {
7936 if (_PyUnicodeWriter_Prepare(writer, 1, 0xff) == -1)
7937 goto onError;
7938 maxchar = writer->maxchar;
7939 outdata = (Py_UCS1 *)writer->data;
7940 }
7941 outdata[writer->pos] = x;
7942 writer->pos++;
7943 ++s;
7944 }
7945 return 0;
7946 }
7947
7948 while (s < e) {
7949 if (mapkind == PyUnicode_2BYTE_KIND && maplen >= 256) {
7950 enum PyUnicode_Kind outkind = writer->kind;
7951 Py_UCS2 *mapdata_ucs2 = (Py_UCS2 *)mapdata;
7952 if (outkind == PyUnicode_1BYTE_KIND) {
7953 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
7954 Py_UCS4 maxchar = writer->maxchar;
7955 while (s < e) {
7956 ch = *s;
7957 x = mapdata_ucs2[ch];
7958 if (x > maxchar)
7959 goto Error;
7960 outdata[writer->pos] = x;
7961 writer->pos++;
7962 ++s;
7963 }
7964 break;
7965 }
7966 else if (outkind == PyUnicode_2BYTE_KIND) {
7967 Py_UCS2 *outdata = (Py_UCS2 *)writer->data;
7968 while (s < e) {
7969 ch = *s;
7970 x = mapdata_ucs2[ch];
7971 if (x == 0xFFFE)
7972 goto Error;
7973 outdata[writer->pos] = x;
7974 writer->pos++;
7975 ++s;
7976 }
7977 break;
7978 }
7979 }
7980 ch = *s;
7981
7982 if (ch < maplen)
7983 x = PyUnicode_READ(mapkind, mapdata, ch);
7984 else
7985 x = 0xfffe; /* invalid value */
7986Error:
7987 if (x == 0xfffe)
7988 {
7989 /* undefined mapping */
7990 startinpos = s-starts;
7991 endinpos = startinpos+1;
7992 if (unicode_decode_call_errorhandler_writer(
7993 errors, &errorHandler,
7994 "charmap", "character maps to <undefined>",
7995 &starts, &e, &startinpos, &endinpos, &exc, &s,
7996 writer)) {
7997 goto onError;
7998 }
7999 continue;
8000 }
8001
8002 if (_PyUnicodeWriter_WriteCharInline(writer, x) < 0)
8003 goto onError;
8004 ++s;
8005 }
8006 Py_XDECREF(errorHandler);
8007 Py_XDECREF(exc);
8008 return 0;
8009
8010onError:
8011 Py_XDECREF(errorHandler);
8012 Py_XDECREF(exc);
8013 return -1;
8014}
8015
8016static int
8017charmap_decode_mapping(const char *s,
8018 Py_ssize_t size,
8019 PyObject *mapping,
8020 const char *errors,
8021 _PyUnicodeWriter *writer)
8022{
8023 const char *starts = s;
8024 const char *e;
8025 Py_ssize_t startinpos, endinpos;
8026 PyObject *errorHandler = NULL, *exc = NULL;
8027 unsigned char ch;
Victor Stinnerf4f24242013-05-07 01:01:31 +02008028 PyObject *key, *item = NULL;
Victor Stinnerfb161b12013-04-18 01:44:27 +02008029
8030 e = s + size;
8031
8032 while (s < e) {
8033 ch = *s;
8034
8035 /* Get mapping (char ordinal -> integer, Unicode char or None) */
8036 key = PyLong_FromLong((long)ch);
8037 if (key == NULL)
8038 goto onError;
8039
8040 item = PyObject_GetItem(mapping, key);
8041 Py_DECREF(key);
8042 if (item == NULL) {
8043 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8044 /* No mapping found means: mapping is undefined. */
8045 PyErr_Clear();
8046 goto Undefined;
8047 } else
8048 goto onError;
8049 }
8050
8051 /* Apply mapping */
8052 if (item == Py_None)
8053 goto Undefined;
8054 if (PyLong_Check(item)) {
8055 long value = PyLong_AS_LONG(item);
8056 if (value == 0xFFFE)
8057 goto Undefined;
8058 if (value < 0 || value > MAX_UNICODE) {
8059 PyErr_Format(PyExc_TypeError,
8060 "character mapping must be in range(0x%lx)",
8061 (unsigned long)MAX_UNICODE + 1);
8062 goto onError;
8063 }
8064
8065 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
8066 goto onError;
8067 }
8068 else if (PyUnicode_Check(item)) {
8069 if (PyUnicode_READY(item) == -1)
8070 goto onError;
8071 if (PyUnicode_GET_LENGTH(item) == 1) {
8072 Py_UCS4 value = PyUnicode_READ_CHAR(item, 0);
8073 if (value == 0xFFFE)
8074 goto Undefined;
8075 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
8076 goto onError;
8077 }
8078 else {
8079 writer->overallocate = 1;
8080 if (_PyUnicodeWriter_WriteStr(writer, item) == -1)
8081 goto onError;
8082 }
8083 }
8084 else {
8085 /* wrong return value */
8086 PyErr_SetString(PyExc_TypeError,
8087 "character mapping must return integer, None or str");
8088 goto onError;
8089 }
8090 Py_CLEAR(item);
8091 ++s;
8092 continue;
8093
8094Undefined:
8095 /* undefined mapping */
8096 Py_CLEAR(item);
8097 startinpos = s-starts;
8098 endinpos = startinpos+1;
8099 if (unicode_decode_call_errorhandler_writer(
8100 errors, &errorHandler,
8101 "charmap", "character maps to <undefined>",
8102 &starts, &e, &startinpos, &endinpos, &exc, &s,
8103 writer)) {
8104 goto onError;
8105 }
8106 }
8107 Py_XDECREF(errorHandler);
8108 Py_XDECREF(exc);
8109 return 0;
8110
8111onError:
8112 Py_XDECREF(item);
8113 Py_XDECREF(errorHandler);
8114 Py_XDECREF(exc);
8115 return -1;
8116}
8117
Alexander Belopolsky40018472011-02-26 01:02:56 +00008118PyObject *
8119PyUnicode_DecodeCharmap(const char *s,
8120 Py_ssize_t size,
8121 PyObject *mapping,
8122 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008123{
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008124 _PyUnicodeWriter writer;
Tim Petersced69f82003-09-16 20:30:58 +00008125
Guido van Rossumd57fd912000-03-10 22:53:23 +00008126 /* Default to Latin-1 */
8127 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008128 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008129
Guido van Rossumd57fd912000-03-10 22:53:23 +00008130 if (size == 0)
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02008131 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner8f674cc2013-04-17 23:02:17 +02008132 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02008133 writer.min_length = size;
8134 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008135 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008136
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00008137 if (PyUnicode_CheckExact(mapping)) {
Victor Stinnerfb161b12013-04-18 01:44:27 +02008138 if (charmap_decode_string(s, size, mapping, errors, &writer) < 0)
8139 goto onError;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00008140 }
8141 else {
Victor Stinnerfb161b12013-04-18 01:44:27 +02008142 if (charmap_decode_mapping(s, size, mapping, errors, &writer) < 0)
8143 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008144 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008145 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00008146
Benjamin Peterson29060642009-01-31 22:14:21 +00008147 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008148 _PyUnicodeWriter_Dealloc(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008149 return NULL;
8150}
8151
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008152/* Charmap encoding: the lookup table */
8153
Alexander Belopolsky40018472011-02-26 01:02:56 +00008154struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00008155 PyObject_HEAD
8156 unsigned char level1[32];
8157 int count2, count3;
8158 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008159};
8160
8161static PyObject*
8162encoding_map_size(PyObject *obj, PyObject* args)
8163{
8164 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008165 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00008166 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008167}
8168
8169static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008170 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00008171 PyDoc_STR("Return the size (in bytes) of this object") },
8172 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008173};
8174
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008175static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008176 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008177 "EncodingMap", /*tp_name*/
8178 sizeof(struct encoding_map), /*tp_basicsize*/
8179 0, /*tp_itemsize*/
8180 /* methods */
Inada Naoki7d408692019-05-29 17:23:27 +09008181 0, /*tp_dealloc*/
Jeroen Demeyer530f5062019-05-31 04:13:39 +02008182 0, /*tp_vectorcall_offset*/
Benjamin Peterson29060642009-01-31 22:14:21 +00008183 0, /*tp_getattr*/
8184 0, /*tp_setattr*/
Jeroen Demeyer530f5062019-05-31 04:13:39 +02008185 0, /*tp_as_async*/
Benjamin Peterson29060642009-01-31 22:14:21 +00008186 0, /*tp_repr*/
8187 0, /*tp_as_number*/
8188 0, /*tp_as_sequence*/
8189 0, /*tp_as_mapping*/
8190 0, /*tp_hash*/
8191 0, /*tp_call*/
8192 0, /*tp_str*/
8193 0, /*tp_getattro*/
8194 0, /*tp_setattro*/
8195 0, /*tp_as_buffer*/
8196 Py_TPFLAGS_DEFAULT, /*tp_flags*/
8197 0, /*tp_doc*/
8198 0, /*tp_traverse*/
8199 0, /*tp_clear*/
8200 0, /*tp_richcompare*/
8201 0, /*tp_weaklistoffset*/
8202 0, /*tp_iter*/
8203 0, /*tp_iternext*/
8204 encoding_map_methods, /*tp_methods*/
8205 0, /*tp_members*/
8206 0, /*tp_getset*/
8207 0, /*tp_base*/
8208 0, /*tp_dict*/
8209 0, /*tp_descr_get*/
8210 0, /*tp_descr_set*/
8211 0, /*tp_dictoffset*/
8212 0, /*tp_init*/
8213 0, /*tp_alloc*/
8214 0, /*tp_new*/
8215 0, /*tp_free*/
8216 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008217};
8218
8219PyObject*
8220PyUnicode_BuildEncodingMap(PyObject* string)
8221{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008222 PyObject *result;
8223 struct encoding_map *mresult;
8224 int i;
8225 int need_dict = 0;
8226 unsigned char level1[32];
8227 unsigned char level2[512];
8228 unsigned char *mlevel1, *mlevel2, *mlevel3;
8229 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008230 int kind;
8231 void *data;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008232 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008233 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008234
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008235 if (!PyUnicode_Check(string) || !PyUnicode_GET_LENGTH(string)) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008236 PyErr_BadArgument();
8237 return NULL;
8238 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008239 kind = PyUnicode_KIND(string);
8240 data = PyUnicode_DATA(string);
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008241 length = PyUnicode_GET_LENGTH(string);
8242 length = Py_MIN(length, 256);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008243 memset(level1, 0xFF, sizeof level1);
8244 memset(level2, 0xFF, sizeof level2);
8245
8246 /* If there isn't a one-to-one mapping of NULL to \0,
8247 or if there are non-BMP characters, we need to use
8248 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008249 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008250 need_dict = 1;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008251 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008252 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008253 ch = PyUnicode_READ(kind, data, i);
8254 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008255 need_dict = 1;
8256 break;
8257 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008258 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008259 /* unmapped character */
8260 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008261 l1 = ch >> 11;
8262 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008263 if (level1[l1] == 0xFF)
8264 level1[l1] = count2++;
8265 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00008266 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008267 }
8268
8269 if (count2 >= 0xFF || count3 >= 0xFF)
8270 need_dict = 1;
8271
8272 if (need_dict) {
8273 PyObject *result = PyDict_New();
8274 PyObject *key, *value;
8275 if (!result)
8276 return NULL;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008277 for (i = 0; i < length; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008278 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00008279 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008280 if (!key || !value)
8281 goto failed1;
8282 if (PyDict_SetItem(result, key, value) == -1)
8283 goto failed1;
8284 Py_DECREF(key);
8285 Py_DECREF(value);
8286 }
8287 return result;
8288 failed1:
8289 Py_XDECREF(key);
8290 Py_XDECREF(value);
8291 Py_DECREF(result);
8292 return NULL;
8293 }
8294
8295 /* Create a three-level trie */
8296 result = PyObject_MALLOC(sizeof(struct encoding_map) +
8297 16*count2 + 128*count3 - 1);
8298 if (!result)
8299 return PyErr_NoMemory();
8300 PyObject_Init(result, &EncodingMapType);
8301 mresult = (struct encoding_map*)result;
8302 mresult->count2 = count2;
8303 mresult->count3 = count3;
8304 mlevel1 = mresult->level1;
8305 mlevel2 = mresult->level23;
8306 mlevel3 = mresult->level23 + 16*count2;
8307 memcpy(mlevel1, level1, 32);
8308 memset(mlevel2, 0xFF, 16*count2);
8309 memset(mlevel3, 0, 128*count3);
8310 count3 = 0;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008311 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008312 int o1, o2, o3, i2, i3;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008313 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8314 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008315 /* unmapped character */
8316 continue;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008317 o1 = ch>>11;
8318 o2 = (ch>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008319 i2 = 16*mlevel1[o1] + o2;
8320 if (mlevel2[i2] == 0xFF)
8321 mlevel2[i2] = count3++;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008322 o3 = ch & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008323 i3 = 128*mlevel2[i2] + o3;
8324 mlevel3[i3] = i;
8325 }
8326 return result;
8327}
8328
8329static int
Victor Stinner22168992011-11-20 17:09:18 +01008330encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008331{
8332 struct encoding_map *map = (struct encoding_map*)mapping;
8333 int l1 = c>>11;
8334 int l2 = (c>>7) & 0xF;
8335 int l3 = c & 0x7F;
8336 int i;
8337
Victor Stinner22168992011-11-20 17:09:18 +01008338 if (c > 0xFFFF)
Benjamin Peterson29060642009-01-31 22:14:21 +00008339 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008340 if (c == 0)
8341 return 0;
8342 /* level 1*/
8343 i = map->level1[l1];
8344 if (i == 0xFF) {
8345 return -1;
8346 }
8347 /* level 2*/
8348 i = map->level23[16*i+l2];
8349 if (i == 0xFF) {
8350 return -1;
8351 }
8352 /* level 3 */
8353 i = map->level23[16*map->count2 + 128*i + l3];
8354 if (i == 0) {
8355 return -1;
8356 }
8357 return i;
8358}
8359
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008360/* Lookup the character ch in the mapping. If the character
8361 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00008362 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008363static PyObject *
Victor Stinner22168992011-11-20 17:09:18 +01008364charmapencode_lookup(Py_UCS4 c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008365{
Christian Heimes217cfd12007-12-02 14:31:20 +00008366 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008367 PyObject *x;
8368
8369 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008370 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008371 x = PyObject_GetItem(mapping, w);
8372 Py_DECREF(w);
8373 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008374 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8375 /* No mapping found means: mapping is undefined. */
8376 PyErr_Clear();
Serhiy Storchaka228b12e2017-01-23 09:47:21 +02008377 Py_RETURN_NONE;
Benjamin Peterson29060642009-01-31 22:14:21 +00008378 } else
8379 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008380 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00008381 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008382 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00008383 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008384 long value = PyLong_AS_LONG(x);
8385 if (value < 0 || value > 255) {
8386 PyErr_SetString(PyExc_TypeError,
8387 "character mapping must be in range(256)");
8388 Py_DECREF(x);
8389 return NULL;
8390 }
8391 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008392 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008393 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008394 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008395 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008396 /* wrong return value */
8397 PyErr_Format(PyExc_TypeError,
8398 "character mapping must return integer, bytes or None, not %.400s",
8399 x->ob_type->tp_name);
8400 Py_DECREF(x);
8401 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008402 }
8403}
8404
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008405static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00008406charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008407{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008408 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
8409 /* exponentially overallocate to minimize reallocations */
8410 if (requiredsize < 2*outsize)
8411 requiredsize = 2*outsize;
8412 if (_PyBytes_Resize(outobj, requiredsize))
8413 return -1;
8414 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008415}
8416
/* Outcome of trying to encode one character through a charmap. */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was written to the output */
    enc_FAILED,     /* the mapping has no entry for the character */
    enc_EXCEPTION   /* an exception was raised */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008420/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00008421 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008422 space is available. Return a new reference to the object that
8423 was put in the output buffer, or Py_None, if the mapping was undefined
8424 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00008425 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008426static charmapencode_result
Victor Stinner22168992011-11-20 17:09:18 +01008427charmapencode_output(Py_UCS4 c, PyObject *mapping,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008428 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008429{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008430 PyObject *rep;
8431 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00008432 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008433
Christian Heimes90aa7642007-12-19 02:45:37 +00008434 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008435 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008436 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008437 if (res == -1)
8438 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00008439 if (outsize<requiredsize)
8440 if (charmapencode_resize(outobj, outpos, requiredsize))
8441 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00008442 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008443 outstart[(*outpos)++] = (char)res;
8444 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008445 }
8446
8447 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008448 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008449 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008450 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008451 Py_DECREF(rep);
8452 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008453 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008454 if (PyLong_Check(rep)) {
8455 Py_ssize_t requiredsize = *outpos+1;
8456 if (outsize<requiredsize)
8457 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8458 Py_DECREF(rep);
8459 return enc_EXCEPTION;
8460 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008461 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008462 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008463 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008464 else {
8465 const char *repchars = PyBytes_AS_STRING(rep);
8466 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
8467 Py_ssize_t requiredsize = *outpos+repsize;
8468 if (outsize<requiredsize)
8469 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8470 Py_DECREF(rep);
8471 return enc_EXCEPTION;
8472 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008473 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008474 memcpy(outstart + *outpos, repchars, repsize);
8475 *outpos += repsize;
8476 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008477 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008478 Py_DECREF(rep);
8479 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008480}
8481
8482/* handle an error in PyUnicode_EncodeCharmap
8483 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008484static int
8485charmap_encoding_error(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008486 PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008487 PyObject **exceptionObject,
Victor Stinner50149202015-09-22 00:26:54 +02008488 _Py_error_handler *error_handler, PyObject **error_handler_obj, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00008489 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008490{
8491 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008492 Py_ssize_t size, repsize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008493 Py_ssize_t newpos;
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008494 enum PyUnicode_Kind kind;
8495 void *data;
8496 Py_ssize_t index;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008497 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008498 Py_ssize_t collstartpos = *inpos;
8499 Py_ssize_t collendpos = *inpos+1;
8500 Py_ssize_t collpos;
Serhiy Storchakae2f92de2017-11-11 13:06:26 +02008501 const char *encoding = "charmap";
8502 const char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008503 charmapencode_result x;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008504 Py_UCS4 ch;
Brian Curtin2787ea42011-11-02 15:09:37 -05008505 int val;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008506
Benjamin Petersonbac79492012-01-14 13:34:47 -05008507 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008508 return -1;
8509 size = PyUnicode_GET_LENGTH(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008510 /* find all unencodable characters */
8511 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008512 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00008513 if (Py_TYPE(mapping) == &EncodingMapType) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008514 ch = PyUnicode_READ_CHAR(unicode, collendpos);
Brian Curtin2787ea42011-11-02 15:09:37 -05008515 val = encoding_map_lookup(ch, mapping);
8516 if (val != -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008517 break;
8518 ++collendpos;
8519 continue;
8520 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008521
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008522 ch = PyUnicode_READ_CHAR(unicode, collendpos);
8523 rep = charmapencode_lookup(ch, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008524 if (rep==NULL)
8525 return -1;
8526 else if (rep!=Py_None) {
8527 Py_DECREF(rep);
8528 break;
8529 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008530 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00008531 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008532 }
8533 /* cache callback name lookup
8534 * (if not done yet, i.e. it's the first error) */
Victor Stinner50149202015-09-22 00:26:54 +02008535 if (*error_handler == _Py_ERROR_UNKNOWN)
Victor Stinner3d4226a2018-08-29 22:21:32 +02008536 *error_handler = _Py_GetErrorHandler(errors);
Victor Stinner50149202015-09-22 00:26:54 +02008537
8538 switch (*error_handler) {
8539 case _Py_ERROR_STRICT:
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008540 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008541 return -1;
Victor Stinner50149202015-09-22 00:26:54 +02008542
8543 case _Py_ERROR_REPLACE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008544 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008545 x = charmapencode_output('?', mapping, res, respos);
8546 if (x==enc_EXCEPTION) {
8547 return -1;
8548 }
8549 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008550 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008551 return -1;
8552 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008553 }
8554 /* fall through */
Victor Stinner50149202015-09-22 00:26:54 +02008555 case _Py_ERROR_IGNORE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008556 *inpos = collendpos;
8557 break;
Victor Stinner50149202015-09-22 00:26:54 +02008558
8559 case _Py_ERROR_XMLCHARREFREPLACE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008560 /* generate replacement (temporarily (mis)uses p) */
8561 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008562 char buffer[2+29+1+1];
8563 char *cp;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008564 sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
Benjamin Peterson29060642009-01-31 22:14:21 +00008565 for (cp = buffer; *cp; ++cp) {
8566 x = charmapencode_output(*cp, mapping, res, respos);
8567 if (x==enc_EXCEPTION)
8568 return -1;
8569 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008570 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008571 return -1;
8572 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008573 }
8574 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008575 *inpos = collendpos;
8576 break;
Victor Stinner50149202015-09-22 00:26:54 +02008577
Benjamin Peterson14339b62009-01-31 16:36:08 +00008578 default:
Victor Stinner50149202015-09-22 00:26:54 +02008579 repunicode = unicode_encode_call_errorhandler(errors, error_handler_obj,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008580 encoding, reason, unicode, exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00008581 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008582 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008583 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00008584 if (PyBytes_Check(repunicode)) {
8585 /* Directly copy bytes result to output. */
8586 Py_ssize_t outsize = PyBytes_Size(*res);
8587 Py_ssize_t requiredsize;
8588 repsize = PyBytes_Size(repunicode);
8589 requiredsize = *respos + repsize;
8590 if (requiredsize > outsize)
8591 /* Make room for all additional bytes. */
8592 if (charmapencode_resize(res, respos, requiredsize)) {
8593 Py_DECREF(repunicode);
8594 return -1;
8595 }
8596 memcpy(PyBytes_AsString(*res) + *respos,
8597 PyBytes_AsString(repunicode), repsize);
8598 *respos += repsize;
8599 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008600 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00008601 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008602 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008603 /* generate replacement */
Benjamin Petersonbac79492012-01-14 13:34:47 -05008604 if (PyUnicode_READY(repunicode) == -1) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008605 Py_DECREF(repunicode);
8606 return -1;
8607 }
Victor Stinner9e30aa52011-11-21 02:49:52 +01008608 repsize = PyUnicode_GET_LENGTH(repunicode);
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008609 data = PyUnicode_DATA(repunicode);
8610 kind = PyUnicode_KIND(repunicode);
8611 for (index = 0; index < repsize; index++) {
8612 Py_UCS4 repch = PyUnicode_READ(kind, data, index);
8613 x = charmapencode_output(repch, mapping, res, respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008614 if (x==enc_EXCEPTION) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008615 Py_DECREF(repunicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00008616 return -1;
8617 }
8618 else if (x==enc_FAILED) {
8619 Py_DECREF(repunicode);
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008620 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008621 return -1;
8622 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008623 }
8624 *inpos = newpos;
8625 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008626 }
8627 return 0;
8628}
8629
Alexander Belopolsky40018472011-02-26 01:02:56 +00008630PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008631_PyUnicode_EncodeCharmap(PyObject *unicode,
8632 PyObject *mapping,
8633 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008634{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008635 /* output object */
8636 PyObject *res = NULL;
8637 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008638 Py_ssize_t inpos = 0;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008639 Py_ssize_t size;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008640 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008641 Py_ssize_t respos = 0;
Victor Stinner50149202015-09-22 00:26:54 +02008642 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008643 PyObject *exc = NULL;
Victor Stinner50149202015-09-22 00:26:54 +02008644 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Victor Stinner69ed0f42013-04-09 21:48:24 +02008645 void *data;
8646 int kind;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008647
Benjamin Petersonbac79492012-01-14 13:34:47 -05008648 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008649 return NULL;
8650 size = PyUnicode_GET_LENGTH(unicode);
Victor Stinner69ed0f42013-04-09 21:48:24 +02008651 data = PyUnicode_DATA(unicode);
8652 kind = PyUnicode_KIND(unicode);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008653
Guido van Rossumd57fd912000-03-10 22:53:23 +00008654 /* Default to Latin-1 */
8655 if (mapping == NULL)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008656 return unicode_encode_ucs1(unicode, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008657
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008658 /* allocate enough for a simple encoding without
8659 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00008660 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008661 if (res == NULL)
8662 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00008663 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008664 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008665
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008666 while (inpos<size) {
Victor Stinner69ed0f42013-04-09 21:48:24 +02008667 Py_UCS4 ch = PyUnicode_READ(kind, data, inpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008668 /* try to encode it */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008669 charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008670 if (x==enc_EXCEPTION) /* error */
8671 goto onError;
8672 if (x==enc_FAILED) { /* unencodable character */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008673 if (charmap_encoding_error(unicode, &inpos, mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00008674 &exc,
Victor Stinner50149202015-09-22 00:26:54 +02008675 &error_handler, &error_handler_obj, errors,
Benjamin Peterson29060642009-01-31 22:14:21 +00008676 &res, &respos)) {
8677 goto onError;
8678 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008679 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008680 else
8681 /* done with this character => adjust input position */
8682 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008683 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008684
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008685 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00008686 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00008687 if (_PyBytes_Resize(&res, respos) < 0)
8688 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00008689
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008690 Py_XDECREF(exc);
Victor Stinner50149202015-09-22 00:26:54 +02008691 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008692 return res;
8693
Benjamin Peterson29060642009-01-31 22:14:21 +00008694 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008695 Py_XDECREF(res);
8696 Py_XDECREF(exc);
Victor Stinner50149202015-09-22 00:26:54 +02008697 Py_XDECREF(error_handler_obj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008698 return NULL;
8699}
8700
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008701/* Deprecated */
8702PyObject *
8703PyUnicode_EncodeCharmap(const Py_UNICODE *p,
8704 Py_ssize_t size,
8705 PyObject *mapping,
8706 const char *errors)
8707{
8708 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02008709 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008710 if (unicode == NULL)
8711 return NULL;
8712 result = _PyUnicode_EncodeCharmap(unicode, mapping, errors);
8713 Py_DECREF(unicode);
Victor Stinnerfc026c92011-11-04 00:24:51 +01008714 return result;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008715}
8716
Alexander Belopolsky40018472011-02-26 01:02:56 +00008717PyObject *
8718PyUnicode_AsCharmapString(PyObject *unicode,
8719 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008720{
8721 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008722 PyErr_BadArgument();
8723 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008724 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008725 return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008726}
8727
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008728/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008729static void
8730make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008731 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008732 Py_ssize_t startpos, Py_ssize_t endpos,
8733 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008734{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008735 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008736 *exceptionObject = _PyUnicodeTranslateError_Create(
8737 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008738 }
8739 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008740 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
8741 goto onError;
8742 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
8743 goto onError;
8744 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
8745 goto onError;
8746 return;
8747 onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02008748 Py_CLEAR(*exceptionObject);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008749 }
8750}
8751
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008752/* error handling callback helper:
8753 build arguments, call the callback and check the arguments,
8754 put the result into newpos and return the replacement string, which
8755 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008756static PyObject *
8757unicode_translate_call_errorhandler(const char *errors,
8758 PyObject **errorHandler,
8759 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008760 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008761 Py_ssize_t startpos, Py_ssize_t endpos,
8762 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008763{
Serhiy Storchakaf8d7d412016-10-23 15:12:25 +03008764 static const char *argparse = "Un;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008765
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008766 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008767 PyObject *restuple;
8768 PyObject *resunicode;
8769
8770 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008771 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008772 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008773 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008774 }
8775
8776 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008777 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008778 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008779 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008780
Victor Stinnerde4ae3d2016-12-04 22:59:09 +01008781 restuple = PyObject_CallFunctionObjArgs(
8782 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008783 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008784 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008785 if (!PyTuple_Check(restuple)) {
Serhiy Storchakaf8d7d412016-10-23 15:12:25 +03008786 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00008787 Py_DECREF(restuple);
8788 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008789 }
Serhiy Storchakaf8d7d412016-10-23 15:12:25 +03008790 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00008791 &resunicode, &i_newpos)) {
8792 Py_DECREF(restuple);
8793 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008794 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00008795 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008796 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008797 else
8798 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008799 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnera33bce02014-07-04 22:47:46 +02008800 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008801 Py_DECREF(restuple);
8802 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00008803 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008804 Py_INCREF(resunicode);
8805 Py_DECREF(restuple);
8806 return resunicode;
8807}
8808
8809/* Lookup the character ch in the mapping and put the result in result,
8810 which must be decrefed by the caller.
8811 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008812static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008813charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008814{
Christian Heimes217cfd12007-12-02 14:31:20 +00008815 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008816 PyObject *x;
8817
8818 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008819 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008820 x = PyObject_GetItem(mapping, w);
8821 Py_DECREF(w);
8822 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008823 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8824 /* No mapping found means: use 1:1 mapping. */
8825 PyErr_Clear();
8826 *result = NULL;
8827 return 0;
8828 } else
8829 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008830 }
8831 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008832 *result = x;
8833 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008834 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008835 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008836 long value = PyLong_AS_LONG(x);
Victor Stinner4ff33af2014-04-05 11:56:37 +02008837 if (value < 0 || value > MAX_UNICODE) {
8838 PyErr_Format(PyExc_ValueError,
8839 "character mapping must be in range(0x%x)",
8840 MAX_UNICODE+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00008841 Py_DECREF(x);
8842 return -1;
8843 }
8844 *result = x;
8845 return 0;
8846 }
8847 else if (PyUnicode_Check(x)) {
8848 *result = x;
8849 return 0;
8850 }
8851 else {
8852 /* wrong return value */
8853 PyErr_SetString(PyExc_TypeError,
8854 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00008855 Py_DECREF(x);
8856 return -1;
8857 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008858}
Victor Stinner1194ea02014-04-04 19:37:40 +02008859
8860/* lookup the character, write the result into the writer.
8861 Return 1 if the result was written into the writer, return 0 if the mapping
8862 was undefined, raise an exception return -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008863static int
Victor Stinner1194ea02014-04-04 19:37:40 +02008864charmaptranslate_output(Py_UCS4 ch, PyObject *mapping,
8865 _PyUnicodeWriter *writer)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008866{
Victor Stinner1194ea02014-04-04 19:37:40 +02008867 PyObject *item;
8868
8869 if (charmaptranslate_lookup(ch, mapping, &item))
Benjamin Peterson29060642009-01-31 22:14:21 +00008870 return -1;
Victor Stinner1194ea02014-04-04 19:37:40 +02008871
8872 if (item == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008873 /* not found => default to 1:1 mapping */
Victor Stinner1194ea02014-04-04 19:37:40 +02008874 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008875 return -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00008876 }
Victor Stinner1194ea02014-04-04 19:37:40 +02008877 return 1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008878 }
Victor Stinner1194ea02014-04-04 19:37:40 +02008879
8880 if (item == Py_None) {
8881 Py_DECREF(item);
8882 return 0;
8883 }
8884
8885 if (PyLong_Check(item)) {
Victor Stinner4ff33af2014-04-05 11:56:37 +02008886 long ch = (Py_UCS4)PyLong_AS_LONG(item);
8887 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
8888 used it */
Victor Stinner1194ea02014-04-04 19:37:40 +02008889 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
8890 Py_DECREF(item);
8891 return -1;
8892 }
8893 Py_DECREF(item);
8894 return 1;
8895 }
8896
8897 if (!PyUnicode_Check(item)) {
8898 Py_DECREF(item);
Benjamin Peterson29060642009-01-31 22:14:21 +00008899 return -1;
Victor Stinner1194ea02014-04-04 19:37:40 +02008900 }
8901
8902 if (_PyUnicodeWriter_WriteStr(writer, item) < 0) {
8903 Py_DECREF(item);
8904 return -1;
8905 }
8906
8907 Py_DECREF(item);
8908 return 1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008909}
8910
Victor Stinner89a76ab2014-04-05 11:44:04 +02008911static int
8912unicode_fast_translate_lookup(PyObject *mapping, Py_UCS1 ch,
8913 Py_UCS1 *translate)
8914{
Benjamin Peterson1365de72014-04-07 20:15:41 -04008915 PyObject *item = NULL;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008916 int ret = 0;
8917
Victor Stinner89a76ab2014-04-05 11:44:04 +02008918 if (charmaptranslate_lookup(ch, mapping, &item)) {
8919 return -1;
8920 }
8921
8922 if (item == Py_None) {
Benjamin Peterson1365de72014-04-07 20:15:41 -04008923 /* deletion */
Victor Stinner872b2912014-04-05 14:27:07 +02008924 translate[ch] = 0xfe;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008925 }
Benjamin Peterson1365de72014-04-07 20:15:41 -04008926 else if (item == NULL) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02008927 /* not found => default to 1:1 mapping */
8928 translate[ch] = ch;
8929 return 1;
8930 }
Benjamin Peterson1365de72014-04-07 20:15:41 -04008931 else if (PyLong_Check(item)) {
Victor Stinner4dd25252014-04-08 09:14:21 +02008932 long replace = PyLong_AS_LONG(item);
Victor Stinner4ff33af2014-04-05 11:56:37 +02008933 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
8934 used it */
8935 if (127 < replace) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02008936 /* invalid character or character outside ASCII:
8937 skip the fast translate */
8938 goto exit;
8939 }
8940 translate[ch] = (Py_UCS1)replace;
8941 }
8942 else if (PyUnicode_Check(item)) {
8943 Py_UCS4 replace;
8944
8945 if (PyUnicode_READY(item) == -1) {
8946 Py_DECREF(item);
8947 return -1;
8948 }
8949 if (PyUnicode_GET_LENGTH(item) != 1)
8950 goto exit;
8951
8952 replace = PyUnicode_READ_CHAR(item, 0);
8953 if (replace > 127)
8954 goto exit;
8955 translate[ch] = (Py_UCS1)replace;
8956 }
8957 else {
Benjamin Peterson1365de72014-04-07 20:15:41 -04008958 /* not None, NULL, long or unicode */
Victor Stinner89a76ab2014-04-05 11:44:04 +02008959 goto exit;
8960 }
Victor Stinner89a76ab2014-04-05 11:44:04 +02008961 ret = 1;
8962
Benjamin Peterson1365de72014-04-07 20:15:41 -04008963 exit:
8964 Py_DECREF(item);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008965 return ret;
8966}
8967
8968/* Fast path for ascii => ascii translation. Return 1 if the whole string
8969 was translated into writer, return 0 if the input string was partially
8970 translated into writer, raise an exception and return -1 on error. */
8971static int
8972unicode_fast_translate(PyObject *input, PyObject *mapping,
Victor Stinner6c9aa8f2016-03-01 21:30:30 +01008973 _PyUnicodeWriter *writer, int ignore,
8974 Py_ssize_t *input_pos)
Victor Stinner89a76ab2014-04-05 11:44:04 +02008975{
Victor Stinner872b2912014-04-05 14:27:07 +02008976 Py_UCS1 ascii_table[128], ch, ch2;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008977 Py_ssize_t len;
8978 Py_UCS1 *in, *end, *out;
Victor Stinner872b2912014-04-05 14:27:07 +02008979 int res = 0;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008980
Victor Stinner89a76ab2014-04-05 11:44:04 +02008981 len = PyUnicode_GET_LENGTH(input);
8982
Victor Stinner872b2912014-04-05 14:27:07 +02008983 memset(ascii_table, 0xff, 128);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008984
8985 in = PyUnicode_1BYTE_DATA(input);
8986 end = in + len;
8987
8988 assert(PyUnicode_IS_ASCII(writer->buffer));
8989 assert(PyUnicode_GET_LENGTH(writer->buffer) == len);
8990 out = PyUnicode_1BYTE_DATA(writer->buffer);
8991
Victor Stinner872b2912014-04-05 14:27:07 +02008992 for (; in < end; in++) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02008993 ch = *in;
Victor Stinner872b2912014-04-05 14:27:07 +02008994 ch2 = ascii_table[ch];
Victor Stinner89a76ab2014-04-05 11:44:04 +02008995 if (ch2 == 0xff) {
Victor Stinner872b2912014-04-05 14:27:07 +02008996 int translate = unicode_fast_translate_lookup(mapping, ch,
8997 ascii_table);
8998 if (translate < 0)
Victor Stinner89a76ab2014-04-05 11:44:04 +02008999 return -1;
Victor Stinner872b2912014-04-05 14:27:07 +02009000 if (translate == 0)
9001 goto exit;
9002 ch2 = ascii_table[ch];
Victor Stinner89a76ab2014-04-05 11:44:04 +02009003 }
Victor Stinner872b2912014-04-05 14:27:07 +02009004 if (ch2 == 0xfe) {
9005 if (ignore)
9006 continue;
9007 goto exit;
9008 }
9009 assert(ch2 < 128);
Victor Stinner89a76ab2014-04-05 11:44:04 +02009010 *out = ch2;
Victor Stinner872b2912014-04-05 14:27:07 +02009011 out++;
Victor Stinner89a76ab2014-04-05 11:44:04 +02009012 }
Victor Stinner872b2912014-04-05 14:27:07 +02009013 res = 1;
9014
9015exit:
9016 writer->pos = out - PyUnicode_1BYTE_DATA(writer->buffer);
Victor Stinner6c9aa8f2016-03-01 21:30:30 +01009017 *input_pos = in - PyUnicode_1BYTE_DATA(input);
Victor Stinner872b2912014-04-05 14:27:07 +02009018 return res;
Victor Stinner89a76ab2014-04-05 11:44:04 +02009019}
9020
Victor Stinner3222da22015-10-01 22:07:32 +02009021static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009022_PyUnicode_TranslateCharmap(PyObject *input,
9023 PyObject *mapping,
9024 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009025{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009026 /* input object */
Victor Stinner1194ea02014-04-04 19:37:40 +02009027 char *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009028 Py_ssize_t size, i;
9029 int kind;
9030 /* output buffer */
Victor Stinner1194ea02014-04-04 19:37:40 +02009031 _PyUnicodeWriter writer;
9032 /* error handler */
Serhiy Storchakae2f92de2017-11-11 13:06:26 +02009033 const char *reason = "character maps to <undefined>";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009034 PyObject *errorHandler = NULL;
9035 PyObject *exc = NULL;
Victor Stinner1194ea02014-04-04 19:37:40 +02009036 int ignore;
Victor Stinner89a76ab2014-04-05 11:44:04 +02009037 int res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009038
Guido van Rossumd57fd912000-03-10 22:53:23 +00009039 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009040 PyErr_BadArgument();
9041 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009042 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009043
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009044 if (PyUnicode_READY(input) == -1)
9045 return NULL;
Victor Stinner1194ea02014-04-04 19:37:40 +02009046 data = (char*)PyUnicode_DATA(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009047 kind = PyUnicode_KIND(input);
9048 size = PyUnicode_GET_LENGTH(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009049
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009050 if (size == 0)
9051 return PyUnicode_FromObject(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009052
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009053 /* allocate enough for a simple 1:1 translation without
9054 replacements, if we need more, we'll resize */
Victor Stinner1194ea02014-04-04 19:37:40 +02009055 _PyUnicodeWriter_Init(&writer);
9056 if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00009057 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009058
Victor Stinner872b2912014-04-05 14:27:07 +02009059 ignore = (errors != NULL && strcmp(errors, "ignore") == 0);
9060
Victor Stinner33798672016-03-01 21:59:58 +01009061 if (PyUnicode_READY(input) == -1)
Victor Stinner89a76ab2014-04-05 11:44:04 +02009062 return NULL;
Victor Stinner33798672016-03-01 21:59:58 +01009063 if (PyUnicode_IS_ASCII(input)) {
9064 res = unicode_fast_translate(input, mapping, &writer, ignore, &i);
9065 if (res < 0) {
9066 _PyUnicodeWriter_Dealloc(&writer);
9067 return NULL;
9068 }
9069 if (res == 1)
9070 return _PyUnicodeWriter_Finish(&writer);
Victor Stinner89a76ab2014-04-05 11:44:04 +02009071 }
Victor Stinner33798672016-03-01 21:59:58 +01009072 else {
9073 i = 0;
9074 }
Victor Stinner89a76ab2014-04-05 11:44:04 +02009075
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009076 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009077 /* try to encode it */
Victor Stinner1194ea02014-04-04 19:37:40 +02009078 int translate;
9079 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
9080 Py_ssize_t newpos;
9081 /* startpos for collecting untranslatable chars */
9082 Py_ssize_t collstart;
9083 Py_ssize_t collend;
Victor Stinner1194ea02014-04-04 19:37:40 +02009084 Py_UCS4 ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009085
Victor Stinner1194ea02014-04-04 19:37:40 +02009086 ch = PyUnicode_READ(kind, data, i);
9087 translate = charmaptranslate_output(ch, mapping, &writer);
9088 if (translate < 0)
9089 goto onError;
9090
9091 if (translate != 0) {
9092 /* it worked => adjust input pointer */
9093 ++i;
9094 continue;
9095 }
9096
9097 /* untranslatable character */
9098 collstart = i;
9099 collend = i+1;
9100
9101 /* find all untranslatable characters */
9102 while (collend < size) {
9103 PyObject *x;
9104 ch = PyUnicode_READ(kind, data, collend);
9105 if (charmaptranslate_lookup(ch, mapping, &x))
Benjamin Peterson14339b62009-01-31 16:36:08 +00009106 goto onError;
Victor Stinner1194ea02014-04-04 19:37:40 +02009107 Py_XDECREF(x);
9108 if (x != Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00009109 break;
Victor Stinner1194ea02014-04-04 19:37:40 +02009110 ++collend;
9111 }
9112
9113 if (ignore) {
9114 i = collend;
9115 }
9116 else {
9117 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
9118 reason, input, &exc,
9119 collstart, collend, &newpos);
9120 if (repunicode == NULL)
9121 goto onError;
9122 if (_PyUnicodeWriter_WriteStr(&writer, repunicode) < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009123 Py_DECREF(repunicode);
Victor Stinner1194ea02014-04-04 19:37:40 +02009124 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009125 }
Victor Stinner1194ea02014-04-04 19:37:40 +02009126 Py_DECREF(repunicode);
9127 i = newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009128 }
9129 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009130 Py_XDECREF(exc);
9131 Py_XDECREF(errorHandler);
Victor Stinner1194ea02014-04-04 19:37:40 +02009132 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009133
Benjamin Peterson29060642009-01-31 22:14:21 +00009134 onError:
Victor Stinner1194ea02014-04-04 19:37:40 +02009135 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009136 Py_XDECREF(exc);
9137 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009138 return NULL;
9139}
9140
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009141/* Deprecated. Use PyUnicode_Translate instead. */
9142PyObject *
9143PyUnicode_TranslateCharmap(const Py_UNICODE *p,
9144 Py_ssize_t size,
9145 PyObject *mapping,
9146 const char *errors)
9147{
Christian Heimes5f520f42012-09-11 14:03:25 +02009148 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02009149 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009150 if (!unicode)
9151 return NULL;
Christian Heimes5f520f42012-09-11 14:03:25 +02009152 result = _PyUnicode_TranslateCharmap(unicode, mapping, errors);
9153 Py_DECREF(unicode);
9154 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009155}
9156
Alexander Belopolsky40018472011-02-26 01:02:56 +00009157PyObject *
9158PyUnicode_Translate(PyObject *str,
9159 PyObject *mapping,
9160 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009161{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009162 if (ensure_unicode(str) < 0)
Christian Heimes5f520f42012-09-11 14:03:25 +02009163 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009164 return _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009165}
Tim Petersced69f82003-09-16 20:30:58 +00009166
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009167PyObject *
9168_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
9169{
9170 if (!PyUnicode_Check(unicode)) {
9171 PyErr_BadInternalCall();
9172 return NULL;
9173 }
9174 if (PyUnicode_READY(unicode) == -1)
9175 return NULL;
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009176 if (PyUnicode_IS_ASCII(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009177 /* If the string is already ASCII, just return the same string */
9178 Py_INCREF(unicode);
9179 return unicode;
9180 }
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009181
9182 Py_ssize_t len = PyUnicode_GET_LENGTH(unicode);
9183 PyObject *result = PyUnicode_New(len, 127);
9184 if (result == NULL) {
9185 return NULL;
9186 }
9187
9188 Py_UCS1 *out = PyUnicode_1BYTE_DATA(result);
9189 int kind = PyUnicode_KIND(unicode);
9190 const void *data = PyUnicode_DATA(unicode);
9191 Py_ssize_t i;
9192 for (i = 0; i < len; ++i) {
9193 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9194 if (ch < 127) {
9195 out[i] = ch;
9196 }
9197 else if (Py_UNICODE_ISSPACE(ch)) {
9198 out[i] = ' ';
9199 }
9200 else {
9201 int decimal = Py_UNICODE_TODECIMAL(ch);
9202 if (decimal < 0) {
9203 out[i] = '?';
INADA Naoki16dfca42018-07-14 12:06:43 +09009204 out[i+1] = '\0';
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009205 _PyUnicode_LENGTH(result) = i + 1;
9206 break;
9207 }
9208 out[i] = '0' + decimal;
9209 }
9210 }
9211
INADA Naoki16dfca42018-07-14 12:06:43 +09009212 assert(_PyUnicode_CheckConsistency(result, 1));
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009213 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009214}
9215
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009216PyObject *
9217PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
9218 Py_ssize_t length)
9219{
Victor Stinnerf0124502011-11-21 23:12:56 +01009220 PyObject *decimal;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009221 Py_ssize_t i;
Victor Stinnerf0124502011-11-21 23:12:56 +01009222 Py_UCS4 maxchar;
9223 enum PyUnicode_Kind kind;
9224 void *data;
9225
Victor Stinner99d7ad02012-02-22 13:37:39 +01009226 maxchar = 127;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009227 for (i = 0; i < length; i++) {
Victor Stinner12174a52014-08-15 23:17:38 +02009228 Py_UCS4 ch = s[i];
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009229 if (ch > 127) {
9230 int decimal = Py_UNICODE_TODECIMAL(ch);
9231 if (decimal >= 0)
Victor Stinnerf0124502011-11-21 23:12:56 +01009232 ch = '0' + decimal;
Benjamin Peterson7e303732013-06-10 09:19:46 -07009233 maxchar = Py_MAX(maxchar, ch);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009234 }
9235 }
Victor Stinnerf0124502011-11-21 23:12:56 +01009236
9237 /* Copy to a new string */
9238 decimal = PyUnicode_New(length, maxchar);
9239 if (decimal == NULL)
9240 return decimal;
9241 kind = PyUnicode_KIND(decimal);
9242 data = PyUnicode_DATA(decimal);
9243 /* Iterate over code points */
9244 for (i = 0; i < length; i++) {
Victor Stinner12174a52014-08-15 23:17:38 +02009245 Py_UCS4 ch = s[i];
Victor Stinnerf0124502011-11-21 23:12:56 +01009246 if (ch > 127) {
9247 int decimal = Py_UNICODE_TODECIMAL(ch);
9248 if (decimal >= 0)
9249 ch = '0' + decimal;
9250 }
9251 PyUnicode_WRITE(kind, data, i, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009252 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01009253 return unicode_result(decimal);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009254}
Guido van Rossum9e896b32000-04-05 20:11:21 +00009255/* --- Decimal Encoder ---------------------------------------------------- */
9256
Alexander Belopolsky40018472011-02-26 01:02:56 +00009257int
9258PyUnicode_EncodeDecimal(Py_UNICODE *s,
9259 Py_ssize_t length,
9260 char *output,
9261 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00009262{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01009263 PyObject *unicode;
Victor Stinner6345be92011-11-25 20:09:01 +01009264 Py_ssize_t i;
Victor Stinner42bf7752011-11-21 22:52:58 +01009265 enum PyUnicode_Kind kind;
9266 void *data;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009267
9268 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009269 PyErr_BadArgument();
9270 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009271 }
9272
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02009273 unicode = PyUnicode_FromWideChar(s, length);
Victor Stinner42bf7752011-11-21 22:52:58 +01009274 if (unicode == NULL)
9275 return -1;
9276
Victor Stinner42bf7752011-11-21 22:52:58 +01009277 kind = PyUnicode_KIND(unicode);
9278 data = PyUnicode_DATA(unicode);
9279
Victor Stinnerb84d7232011-11-22 01:50:07 +01009280 for (i=0; i < length; ) {
Victor Stinner6345be92011-11-25 20:09:01 +01009281 PyObject *exc;
9282 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00009283 int decimal;
Victor Stinner6345be92011-11-25 20:09:01 +01009284 Py_ssize_t startpos;
9285
9286 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +00009287
Benjamin Peterson29060642009-01-31 22:14:21 +00009288 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009289 *output++ = ' ';
Victor Stinnerb84d7232011-11-22 01:50:07 +01009290 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009291 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009292 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009293 decimal = Py_UNICODE_TODECIMAL(ch);
9294 if (decimal >= 0) {
9295 *output++ = '0' + decimal;
Victor Stinnerb84d7232011-11-22 01:50:07 +01009296 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009297 continue;
9298 }
9299 if (0 < ch && ch < 256) {
9300 *output++ = (char)ch;
Victor Stinnerb84d7232011-11-22 01:50:07 +01009301 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009302 continue;
9303 }
Victor Stinner6345be92011-11-25 20:09:01 +01009304
Victor Stinner42bf7752011-11-21 22:52:58 +01009305 startpos = i;
Victor Stinner6345be92011-11-25 20:09:01 +01009306 exc = NULL;
9307 raise_encode_exception(&exc, "decimal", unicode,
9308 startpos, startpos+1,
9309 "invalid decimal Unicode string");
9310 Py_XDECREF(exc);
9311 Py_DECREF(unicode);
9312 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009313 }
9314 /* 0-terminate the output string */
9315 *output++ = '\0';
Victor Stinner42bf7752011-11-21 22:52:58 +01009316 Py_DECREF(unicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00009317 return 0;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009318}
9319
Guido van Rossumd57fd912000-03-10 22:53:23 +00009320/* --- Helpers ------------------------------------------------------------ */
9321
/* Helper macro to fix up start/end slice values against a length.

   `end` is clipped to `len`; a negative `end` or `start` is taken relative
   to `len` and clipped to 0 if it is still negative.  `start` and `end`
   must be modifiable lvalues and are updated in place.

   The body is wrapped in do { } while (0) so that the macro expands to a
   single statement and can be used safely in unbraced if/else branches.
   Note that the arguments are evaluated more than once. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len))                      \
            (end) = (len);                      \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
9336
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009337static Py_ssize_t
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009338any_find_slice(PyObject* s1, PyObject* s2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009339 Py_ssize_t start,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009340 Py_ssize_t end,
9341 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009342{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009343 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009344 void *buf1, *buf2;
9345 Py_ssize_t len1, len2, result;
9346
9347 kind1 = PyUnicode_KIND(s1);
9348 kind2 = PyUnicode_KIND(s2);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009349 if (kind1 < kind2)
9350 return -1;
9351
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009352 len1 = PyUnicode_GET_LENGTH(s1);
9353 len2 = PyUnicode_GET_LENGTH(s2);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009354 ADJUST_INDICES(start, end, len1);
9355 if (end - start < len2)
9356 return -1;
9357
9358 buf1 = PyUnicode_DATA(s1);
9359 buf2 = PyUnicode_DATA(s2);
9360 if (len2 == 1) {
9361 Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
9362 result = findchar((const char *)buf1 + kind1*start,
9363 kind1, end - start, ch, direction);
9364 if (result == -1)
9365 return -1;
9366 else
9367 return start + result;
9368 }
9369
9370 if (kind2 != kind1) {
9371 buf2 = _PyUnicode_AsKind(s2, kind1);
9372 if (!buf2)
9373 return -2;
9374 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009375
Victor Stinner794d5672011-10-10 03:21:36 +02009376 if (direction > 0) {
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009377 switch (kind1) {
Victor Stinner794d5672011-10-10 03:21:36 +02009378 case PyUnicode_1BYTE_KIND:
9379 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9380 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
9381 else
9382 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
9383 break;
9384 case PyUnicode_2BYTE_KIND:
9385 result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
9386 break;
9387 case PyUnicode_4BYTE_KIND:
9388 result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
9389 break;
9390 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07009391 Py_UNREACHABLE();
Victor Stinner794d5672011-10-10 03:21:36 +02009392 }
9393 }
9394 else {
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009395 switch (kind1) {
Victor Stinner794d5672011-10-10 03:21:36 +02009396 case PyUnicode_1BYTE_KIND:
9397 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9398 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
9399 else
9400 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9401 break;
9402 case PyUnicode_2BYTE_KIND:
9403 result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9404 break;
9405 case PyUnicode_4BYTE_KIND:
9406 result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9407 break;
9408 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07009409 Py_UNREACHABLE();
Victor Stinner794d5672011-10-10 03:21:36 +02009410 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009411 }
9412
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009413 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009414 PyMem_Free(buf2);
9415
9416 return result;
9417}
9418
Victor Stinner59423e32018-11-26 13:40:01 +01009419/* _PyUnicode_InsertThousandsGrouping() helper functions */
9420#include "stringlib/localeutil.h"
9421
9422/**
9423 * InsertThousandsGrouping:
9424 * @writer: Unicode writer.
9425 * @n_buffer: Number of characters in @buffer.
9426 * @digits: Digits we're reading from. If count is non-NULL, this is unused.
9427 * @d_pos: Start of digits string.
9428 * @n_digits: The number of digits in the string, in which we want
9429 * to put the grouping chars.
9430 * @min_width: The minimum width of the digits in the output string.
9431 * Output will be zero-padded on the left to fill.
9432 * @grouping: see definition in localeconv().
9433 * @thousands_sep: see definition in localeconv().
9434 *
9435 * There are 2 modes: counting and filling. If @writer is NULL,
9436 * we are in counting mode, else filling mode.
9437 * If counting, the required buffer size is returned.
9438 * If filling, we know the buffer will be large enough, so we don't
9439 * need to pass in the buffer size.
9440 * Inserts thousand grouping characters (as defined by grouping and
9441 * thousands_sep) into @writer.
9442 *
9443 * Return value: -1 on error, number of characters otherwise.
9444 **/
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009445Py_ssize_t
Victor Stinner41a863c2012-02-24 00:37:51 +01009446_PyUnicode_InsertThousandsGrouping(
Victor Stinner59423e32018-11-26 13:40:01 +01009447 _PyUnicodeWriter *writer,
Victor Stinner41a863c2012-02-24 00:37:51 +01009448 Py_ssize_t n_buffer,
Victor Stinner59423e32018-11-26 13:40:01 +01009449 PyObject *digits,
9450 Py_ssize_t d_pos,
9451 Py_ssize_t n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009452 Py_ssize_t min_width,
Victor Stinner59423e32018-11-26 13:40:01 +01009453 const char *grouping,
9454 PyObject *thousands_sep,
Victor Stinner41a863c2012-02-24 00:37:51 +01009455 Py_UCS4 *maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009456{
Xtreak3f7983a2019-01-07 20:39:14 +05309457 min_width = Py_MAX(0, min_width);
Victor Stinner59423e32018-11-26 13:40:01 +01009458 if (writer) {
9459 assert(digits != NULL);
9460 assert(maxchar == NULL);
Victor Stinner41a863c2012-02-24 00:37:51 +01009461 }
9462 else {
Victor Stinner59423e32018-11-26 13:40:01 +01009463 assert(digits == NULL);
9464 assert(maxchar != NULL);
Victor Stinner41a863c2012-02-24 00:37:51 +01009465 }
Victor Stinner59423e32018-11-26 13:40:01 +01009466 assert(0 <= d_pos);
9467 assert(0 <= n_digits);
Victor Stinner59423e32018-11-26 13:40:01 +01009468 assert(grouping != NULL);
9469
9470 if (digits != NULL) {
9471 if (PyUnicode_READY(digits) == -1) {
9472 return -1;
Victor Stinner90f50d42012-02-24 01:44:47 +01009473 }
Victor Stinner59423e32018-11-26 13:40:01 +01009474 }
9475 if (PyUnicode_READY(thousands_sep) == -1) {
9476 return -1;
Victor Stinner41a863c2012-02-24 00:37:51 +01009477 }
9478
Victor Stinner59423e32018-11-26 13:40:01 +01009479 Py_ssize_t count = 0;
9480 Py_ssize_t n_zeros;
9481 int loop_broken = 0;
9482 int use_separator = 0; /* First time through, don't append the
9483 separator. They only go between
9484 groups. */
9485 Py_ssize_t buffer_pos;
9486 Py_ssize_t digits_pos;
9487 Py_ssize_t len;
9488 Py_ssize_t n_chars;
9489 Py_ssize_t remaining = n_digits; /* Number of chars remaining to
9490 be looked at */
9491 /* A generator that returns all of the grouping widths, until it
9492 returns 0. */
9493 GroupGenerator groupgen;
9494 GroupGenerator_init(&groupgen, grouping);
9495 const Py_ssize_t thousands_sep_len = PyUnicode_GET_LENGTH(thousands_sep);
9496
9497 /* if digits are not grouped, thousands separator
9498 should be an empty string */
9499 assert(!(grouping[0] == CHAR_MAX && thousands_sep_len != 0));
9500
9501 digits_pos = d_pos + n_digits;
9502 if (writer) {
9503 buffer_pos = writer->pos + n_buffer;
9504 assert(buffer_pos <= PyUnicode_GET_LENGTH(writer->buffer));
9505 assert(digits_pos <= PyUnicode_GET_LENGTH(digits));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009506 }
Victor Stinner59423e32018-11-26 13:40:01 +01009507 else {
9508 buffer_pos = n_buffer;
Victor Stinner90f50d42012-02-24 01:44:47 +01009509 }
Victor Stinner59423e32018-11-26 13:40:01 +01009510
9511 if (!writer) {
Victor Stinner41a863c2012-02-24 00:37:51 +01009512 *maxchar = 127;
Victor Stinner41a863c2012-02-24 00:37:51 +01009513 }
Victor Stinner59423e32018-11-26 13:40:01 +01009514
9515 while ((len = GroupGenerator_next(&groupgen)) > 0) {
9516 len = Py_MIN(len, Py_MAX(Py_MAX(remaining, min_width), 1));
9517 n_zeros = Py_MAX(0, len - remaining);
9518 n_chars = Py_MAX(0, Py_MIN(remaining, len));
9519
9520 /* Use n_zero zero's and n_chars chars */
9521
9522 /* Count only, don't do anything. */
9523 count += (use_separator ? thousands_sep_len : 0) + n_zeros + n_chars;
9524
9525 /* Copy into the writer. */
9526 InsertThousandsGrouping_fill(writer, &buffer_pos,
9527 digits, &digits_pos,
9528 n_chars, n_zeros,
9529 use_separator ? thousands_sep : NULL,
9530 thousands_sep_len, maxchar);
9531
9532 /* Use a separator next time. */
9533 use_separator = 1;
9534
9535 remaining -= n_chars;
9536 min_width -= len;
9537
9538 if (remaining <= 0 && min_width <= 0) {
9539 loop_broken = 1;
9540 break;
9541 }
9542 min_width -= thousands_sep_len;
9543 }
9544 if (!loop_broken) {
9545 /* We left the loop without using a break statement. */
9546
9547 len = Py_MAX(Py_MAX(remaining, min_width), 1);
9548 n_zeros = Py_MAX(0, len - remaining);
9549 n_chars = Py_MAX(0, Py_MIN(remaining, len));
9550
9551 /* Use n_zero zero's and n_chars chars */
9552 count += (use_separator ? thousands_sep_len : 0) + n_zeros + n_chars;
9553
9554 /* Copy into the writer. */
9555 InsertThousandsGrouping_fill(writer, &buffer_pos,
9556 digits, &digits_pos,
9557 n_chars, n_zeros,
9558 use_separator ? thousands_sep : NULL,
9559 thousands_sep_len, maxchar);
9560 }
9561 return count;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009562}
9563
9564
Alexander Belopolsky40018472011-02-26 01:02:56 +00009565Py_ssize_t
9566PyUnicode_Count(PyObject *str,
9567 PyObject *substr,
9568 Py_ssize_t start,
9569 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009570{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009571 Py_ssize_t result;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009572 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009573 void *buf1 = NULL, *buf2 = NULL;
9574 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00009575
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009576 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009577 return -1;
Tim Petersced69f82003-09-16 20:30:58 +00009578
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009579 kind1 = PyUnicode_KIND(str);
9580 kind2 = PyUnicode_KIND(substr);
9581 if (kind1 < kind2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009582 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009583
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009584 len1 = PyUnicode_GET_LENGTH(str);
9585 len2 = PyUnicode_GET_LENGTH(substr);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009586 ADJUST_INDICES(start, end, len1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009587 if (end - start < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009588 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009589
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009590 buf1 = PyUnicode_DATA(str);
9591 buf2 = PyUnicode_DATA(substr);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009592 if (kind2 != kind1) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009593 buf2 = _PyUnicode_AsKind(substr, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009594 if (!buf2)
9595 goto onError;
9596 }
9597
9598 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009599 case PyUnicode_1BYTE_KIND:
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009600 if (PyUnicode_IS_ASCII(str) && PyUnicode_IS_ASCII(substr))
Victor Stinnerc3cec782011-10-05 21:24:08 +02009601 result = asciilib_count(
9602 ((Py_UCS1*)buf1) + start, end - start,
9603 buf2, len2, PY_SSIZE_T_MAX
9604 );
9605 else
9606 result = ucs1lib_count(
9607 ((Py_UCS1*)buf1) + start, end - start,
9608 buf2, len2, PY_SSIZE_T_MAX
9609 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009610 break;
9611 case PyUnicode_2BYTE_KIND:
9612 result = ucs2lib_count(
9613 ((Py_UCS2*)buf1) + start, end - start,
9614 buf2, len2, PY_SSIZE_T_MAX
9615 );
9616 break;
9617 case PyUnicode_4BYTE_KIND:
9618 result = ucs4lib_count(
9619 ((Py_UCS4*)buf1) + start, end - start,
9620 buf2, len2, PY_SSIZE_T_MAX
9621 );
9622 break;
9623 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07009624 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009625 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009626
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009627 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009628 PyMem_Free(buf2);
9629
Guido van Rossumd57fd912000-03-10 22:53:23 +00009630 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009631 onError:
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009632 if (kind2 != kind1 && buf2)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009633 PyMem_Free(buf2);
9634 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009635}
9636
Alexander Belopolsky40018472011-02-26 01:02:56 +00009637Py_ssize_t
9638PyUnicode_Find(PyObject *str,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009639 PyObject *substr,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009640 Py_ssize_t start,
9641 Py_ssize_t end,
9642 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009643{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009644 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009645 return -2;
Tim Petersced69f82003-09-16 20:30:58 +00009646
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009647 return any_find_slice(str, substr, start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009648}
9649
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009650Py_ssize_t
9651PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
9652 Py_ssize_t start, Py_ssize_t end,
9653 int direction)
9654{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009655 int kind;
Xiang Zhangb2110682016-12-20 22:52:33 +08009656 Py_ssize_t len, result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009657 if (PyUnicode_READY(str) == -1)
9658 return -2;
Xiang Zhangb2110682016-12-20 22:52:33 +08009659 len = PyUnicode_GET_LENGTH(str);
9660 ADJUST_INDICES(start, end, len);
9661 if (end - start < 1)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009662 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009663 kind = PyUnicode_KIND(str);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009664 result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
9665 kind, end-start, ch, direction);
9666 if (result == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009667 return -1;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009668 else
9669 return start + result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009670}
9671
Alexander Belopolsky40018472011-02-26 01:02:56 +00009672static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009673tailmatch(PyObject *self,
9674 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009675 Py_ssize_t start,
9676 Py_ssize_t end,
9677 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009678{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009679 int kind_self;
9680 int kind_sub;
9681 void *data_self;
9682 void *data_sub;
9683 Py_ssize_t offset;
9684 Py_ssize_t i;
9685 Py_ssize_t end_sub;
9686
9687 if (PyUnicode_READY(self) == -1 ||
9688 PyUnicode_READY(substring) == -1)
Victor Stinner18aa4472013-01-03 03:18:09 +01009689 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009690
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009691 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
9692 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009693 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00009694 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009695
Serhiy Storchakad4ea03c2015-05-31 09:15:51 +03009696 if (PyUnicode_GET_LENGTH(substring) == 0)
9697 return 1;
9698
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009699 kind_self = PyUnicode_KIND(self);
9700 data_self = PyUnicode_DATA(self);
9701 kind_sub = PyUnicode_KIND(substring);
9702 data_sub = PyUnicode_DATA(substring);
9703 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
9704
9705 if (direction > 0)
9706 offset = end;
9707 else
9708 offset = start;
9709
9710 if (PyUnicode_READ(kind_self, data_self, offset) ==
9711 PyUnicode_READ(kind_sub, data_sub, 0) &&
9712 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
9713 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
9714 /* If both are of the same kind, memcmp is sufficient */
9715 if (kind_self == kind_sub) {
9716 return ! memcmp((char *)data_self +
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009717 (offset * PyUnicode_KIND(substring)),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009718 data_sub,
9719 PyUnicode_GET_LENGTH(substring) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009720 PyUnicode_KIND(substring));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009721 }
Martin Pantere26da7c2016-06-02 10:07:09 +00009722 /* otherwise we have to compare each character by first accessing it */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009723 else {
9724 /* We do not need to compare 0 and len(substring)-1 because
9725 the if statement above ensured already that they are equal
9726 when we end up here. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009727 for (i = 1; i < end_sub; ++i) {
9728 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
9729 PyUnicode_READ(kind_sub, data_sub, i))
9730 return 0;
9731 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009732 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009733 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009734 }
9735
9736 return 0;
9737}
9738
Alexander Belopolsky40018472011-02-26 01:02:56 +00009739Py_ssize_t
9740PyUnicode_Tailmatch(PyObject *str,
9741 PyObject *substr,
9742 Py_ssize_t start,
9743 Py_ssize_t end,
9744 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009745{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009746 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009747 return -1;
Tim Petersced69f82003-09-16 20:30:58 +00009748
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009749 return tailmatch(str, substr, start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009750}
9751
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009752static PyObject *
9753ascii_upper_or_lower(PyObject *self, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009754{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009755 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9756 char *resdata, *data = PyUnicode_DATA(self);
9757 PyObject *res;
Tim Petersced69f82003-09-16 20:30:58 +00009758
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009759 res = PyUnicode_New(len, 127);
9760 if (res == NULL)
9761 return NULL;
9762 resdata = PyUnicode_DATA(res);
9763 if (lower)
9764 _Py_bytes_lower(resdata, data, len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009765 else
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009766 _Py_bytes_upper(resdata, data, len);
9767 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009768}
9769
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009770static Py_UCS4
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009771handle_capital_sigma(int kind, void *data, Py_ssize_t length, Py_ssize_t i)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009772{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009773 Py_ssize_t j;
9774 int final_sigma;
Victor Stinner0c39b1b2015-03-18 15:02:06 +01009775 Py_UCS4 c = 0; /* initialize to prevent gcc warning */
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009776 /* U+03A3 is in the Final_Sigma context when, it is found like this:
Tim Petersced69f82003-09-16 20:30:58 +00009777
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009778 \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased})
9779
9780 where ! is a negation and \p{xxx} is a character with property xxx.
9781 */
9782 for (j = i - 1; j >= 0; j--) {
9783 c = PyUnicode_READ(kind, data, j);
9784 if (!_PyUnicode_IsCaseIgnorable(c))
9785 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009786 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009787 final_sigma = j >= 0 && _PyUnicode_IsCased(c);
9788 if (final_sigma) {
9789 for (j = i + 1; j < length; j++) {
9790 c = PyUnicode_READ(kind, data, j);
9791 if (!_PyUnicode_IsCaseIgnorable(c))
9792 break;
9793 }
9794 final_sigma = j == length || !_PyUnicode_IsCased(c);
9795 }
9796 return (final_sigma) ? 0x3C2 : 0x3C3;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009797}
9798
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009799static int
9800lower_ucs4(int kind, void *data, Py_ssize_t length, Py_ssize_t i,
9801 Py_UCS4 c, Py_UCS4 *mapped)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009802{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009803 /* Obscure special case. */
9804 if (c == 0x3A3) {
9805 mapped[0] = handle_capital_sigma(kind, data, length, i);
9806 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009807 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009808 return _PyUnicode_ToLowerFull(c, mapped);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009809}
9810
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009811static Py_ssize_t
9812do_capitalize(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009813{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009814 Py_ssize_t i, k = 0;
9815 int n_res, j;
9816 Py_UCS4 c, mapped[3];
Tim Petersced69f82003-09-16 20:30:58 +00009817
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009818 c = PyUnicode_READ(kind, data, 0);
Kingsley Mb015fc82019-04-12 16:35:39 +01009819 n_res = _PyUnicode_ToTitleFull(c, mapped);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009820 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009821 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009822 res[k++] = mapped[j];
Guido van Rossumd57fd912000-03-10 22:53:23 +00009823 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009824 for (i = 1; i < length; i++) {
9825 c = PyUnicode_READ(kind, data, i);
9826 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9827 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009828 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009829 res[k++] = mapped[j];
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009830 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009831 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009832 return k;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009833}
9834
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009835static Py_ssize_t
9836do_swapcase(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) {
9837 Py_ssize_t i, k = 0;
9838
9839 for (i = 0; i < length; i++) {
9840 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9841 int n_res, j;
9842 if (Py_UNICODE_ISUPPER(c)) {
9843 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9844 }
9845 else if (Py_UNICODE_ISLOWER(c)) {
9846 n_res = _PyUnicode_ToUpperFull(c, mapped);
9847 }
9848 else {
9849 n_res = 1;
9850 mapped[0] = c;
9851 }
9852 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009853 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009854 res[k++] = mapped[j];
9855 }
9856 }
9857 return k;
9858}
9859
9860static Py_ssize_t
9861do_upper_or_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res,
9862 Py_UCS4 *maxchar, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009863{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009864 Py_ssize_t i, k = 0;
9865
9866 for (i = 0; i < length; i++) {
9867 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9868 int n_res, j;
9869 if (lower)
9870 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9871 else
9872 n_res = _PyUnicode_ToUpperFull(c, mapped);
9873 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009874 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009875 res[k++] = mapped[j];
9876 }
9877 }
9878 return k;
9879}
9880
9881static Py_ssize_t
9882do_upper(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9883{
9884 return do_upper_or_lower(kind, data, length, res, maxchar, 0);
9885}
9886
9887static Py_ssize_t
9888do_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9889{
9890 return do_upper_or_lower(kind, data, length, res, maxchar, 1);
9891}
9892
Benjamin Petersone51757f2012-01-12 21:10:29 -05009893static Py_ssize_t
Benjamin Petersond5890c82012-01-14 13:23:30 -05009894do_casefold(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9895{
9896 Py_ssize_t i, k = 0;
9897
9898 for (i = 0; i < length; i++) {
9899 Py_UCS4 c = PyUnicode_READ(kind, data, i);
9900 Py_UCS4 mapped[3];
9901 int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
9902 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009903 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersond5890c82012-01-14 13:23:30 -05009904 res[k++] = mapped[j];
9905 }
9906 }
9907 return k;
9908}
9909
9910static Py_ssize_t
Benjamin Petersone51757f2012-01-12 21:10:29 -05009911do_title(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9912{
9913 Py_ssize_t i, k = 0;
9914 int previous_is_cased;
9915
9916 previous_is_cased = 0;
9917 for (i = 0; i < length; i++) {
9918 const Py_UCS4 c = PyUnicode_READ(kind, data, i);
9919 Py_UCS4 mapped[3];
9920 int n_res, j;
9921
9922 if (previous_is_cased)
9923 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9924 else
9925 n_res = _PyUnicode_ToTitleFull(c, mapped);
9926
9927 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009928 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersone51757f2012-01-12 21:10:29 -05009929 res[k++] = mapped[j];
9930 }
9931
9932 previous_is_cased = _PyUnicode_IsCased(c);
9933 }
9934 return k;
9935}
9936
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009937static PyObject *
9938case_operation(PyObject *self,
9939 Py_ssize_t (*perform)(int, void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *))
9940{
9941 PyObject *res = NULL;
9942 Py_ssize_t length, newlength = 0;
9943 int kind, outkind;
9944 void *data, *outdata;
9945 Py_UCS4 maxchar = 0, *tmp, *tmpend;
9946
Benjamin Petersoneea48462012-01-16 14:28:50 -05009947 assert(PyUnicode_IS_READY(self));
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009948
9949 kind = PyUnicode_KIND(self);
9950 data = PyUnicode_DATA(self);
9951 length = PyUnicode_GET_LENGTH(self);
Antoine Pitrou4e334242014-10-15 23:14:53 +02009952 if ((size_t) length > PY_SSIZE_T_MAX / (3 * sizeof(Py_UCS4))) {
Benjamin Petersone1bd38c2014-10-15 11:47:36 -04009953 PyErr_SetString(PyExc_OverflowError, "string is too long");
9954 return NULL;
9955 }
Benjamin Peterson1e211ff2014-10-15 12:17:21 -04009956 tmp = PyMem_MALLOC(sizeof(Py_UCS4) * 3 * length);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009957 if (tmp == NULL)
9958 return PyErr_NoMemory();
9959 newlength = perform(kind, data, length, tmp, &maxchar);
9960 res = PyUnicode_New(newlength, maxchar);
9961 if (res == NULL)
9962 goto leave;
9963 tmpend = tmp + newlength;
9964 outdata = PyUnicode_DATA(res);
9965 outkind = PyUnicode_KIND(res);
9966 switch (outkind) {
9967 case PyUnicode_1BYTE_KIND:
9968 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata);
9969 break;
9970 case PyUnicode_2BYTE_KIND:
9971 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata);
9972 break;
9973 case PyUnicode_4BYTE_KIND:
9974 memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength);
9975 break;
9976 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07009977 Py_UNREACHABLE();
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009978 }
9979 leave:
9980 PyMem_FREE(tmp);
9981 return res;
9982}
9983
Tim Peters8ce9f162004-08-27 01:49:32 +00009984PyObject *
9985PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009986{
Serhiy Storchakaea525a22016-09-06 22:07:53 +03009987 PyObject *res;
9988 PyObject *fseq;
9989 Py_ssize_t seqlen;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009990 PyObject **items;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009991
Benjamin Peterson9743b2c2014-02-15 13:02:52 -05009992 fseq = PySequence_Fast(seq, "can only join an iterable");
Tim Peters05eba1f2004-08-27 21:32:02 +00009993 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009994 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00009995 }
9996
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009997 /* NOTE: the following code can't call back into Python code,
9998 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00009999 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010000
Serhiy Storchakaea525a22016-09-06 22:07:53 +030010001 items = PySequence_Fast_ITEMS(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +000010002 seqlen = PySequence_Fast_GET_SIZE(fseq);
Serhiy Storchakaea525a22016-09-06 22:07:53 +030010003 res = _PyUnicode_JoinArray(separator, items, seqlen);
10004 Py_DECREF(fseq);
10005 return res;
10006}
10007
10008PyObject *
Serhiy Storchakaa5552f02017-12-15 13:11:11 +020010009_PyUnicode_JoinArray(PyObject *separator, PyObject *const *items, Py_ssize_t seqlen)
Serhiy Storchakaea525a22016-09-06 22:07:53 +030010010{
10011 PyObject *res = NULL; /* the result */
10012 PyObject *sep = NULL;
10013 Py_ssize_t seplen;
10014 PyObject *item;
10015 Py_ssize_t sz, i, res_offset;
10016 Py_UCS4 maxchar;
10017 Py_UCS4 item_maxchar;
10018 int use_memcpy;
10019 unsigned char *res_data = NULL, *sep_data = NULL;
10020 PyObject *last_obj;
10021 unsigned int kind = 0;
10022
Tim Peters05eba1f2004-08-27 21:32:02 +000010023 /* If empty sequence, return u"". */
10024 if (seqlen == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020010025 _Py_RETURN_UNICODE_EMPTY();
Tim Peters05eba1f2004-08-27 21:32:02 +000010026 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020010027
Tim Peters05eba1f2004-08-27 21:32:02 +000010028 /* If singleton sequence with an exact Unicode, return that. */
Victor Stinnerdd077322011-10-07 17:02:31 +020010029 last_obj = NULL;
Victor Stinneracf47b82011-10-06 12:32:37 +020010030 if (seqlen == 1) {
10031 if (PyUnicode_CheckExact(items[0])) {
10032 res = items[0];
10033 Py_INCREF(res);
Victor Stinneracf47b82011-10-06 12:32:37 +020010034 return res;
10035 }
Victor Stinnerdd077322011-10-07 17:02:31 +020010036 seplen = 0;
Victor Stinnerc6f0df72011-10-06 15:58:54 +020010037 maxchar = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +000010038 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010039 else {
Victor Stinneracf47b82011-10-06 12:32:37 +020010040 /* Set up sep and seplen */
10041 if (separator == NULL) {
10042 /* fall back to a blank space separator */
10043 sep = PyUnicode_FromOrdinal(' ');
10044 if (!sep)
10045 goto onError;
Victor Stinnerdd077322011-10-07 17:02:31 +020010046 seplen = 1;
Victor Stinneracf47b82011-10-06 12:32:37 +020010047 maxchar = 32;
Tim Peters05eba1f2004-08-27 21:32:02 +000010048 }
Victor Stinneracf47b82011-10-06 12:32:37 +020010049 else {
10050 if (!PyUnicode_Check(separator)) {
10051 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020010052 "separator: expected str instance,"
10053 " %.80s found",
10054 Py_TYPE(separator)->tp_name);
Victor Stinneracf47b82011-10-06 12:32:37 +020010055 goto onError;
10056 }
10057 if (PyUnicode_READY(separator))
10058 goto onError;
10059 sep = separator;
10060 seplen = PyUnicode_GET_LENGTH(separator);
10061 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
10062 /* inc refcount to keep this code path symmetric with the
10063 above case of a blank separator */
10064 Py_INCREF(sep);
10065 }
Victor Stinnerdd077322011-10-07 17:02:31 +020010066 last_obj = sep;
Tim Peters05eba1f2004-08-27 21:32:02 +000010067 }
10068
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010069 /* There are at least two things to join, or else we have a subclass
10070 * of str in the sequence.
10071 * Do a pre-pass to figure out the total amount of space we'll
10072 * need (sz), and see whether all argument are strings.
10073 */
10074 sz = 0;
Victor Stinnerdd077322011-10-07 17:02:31 +020010075#ifdef Py_DEBUG
10076 use_memcpy = 0;
10077#else
10078 use_memcpy = 1;
10079#endif
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010080 for (i = 0; i < seqlen; i++) {
Xiang Zhangb0541f42017-01-10 10:52:00 +080010081 size_t add_sz;
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010082 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +000010083 if (!PyUnicode_Check(item)) {
10084 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020010085 "sequence item %zd: expected str instance,"
10086 " %.80s found",
10087 i, Py_TYPE(item)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000010088 goto onError;
10089 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010090 if (PyUnicode_READY(item) == -1)
10091 goto onError;
Xiang Zhangb0541f42017-01-10 10:52:00 +080010092 add_sz = PyUnicode_GET_LENGTH(item);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010093 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010094 maxchar = Py_MAX(maxchar, item_maxchar);
Xiang Zhangb0541f42017-01-10 10:52:00 +080010095 if (i != 0) {
10096 add_sz += seplen;
10097 }
10098 if (add_sz > (size_t)(PY_SSIZE_T_MAX - sz)) {
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010099 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010100 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010101 goto onError;
10102 }
Xiang Zhangb0541f42017-01-10 10:52:00 +080010103 sz += add_sz;
Victor Stinnerdd077322011-10-07 17:02:31 +020010104 if (use_memcpy && last_obj != NULL) {
10105 if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
10106 use_memcpy = 0;
10107 }
10108 last_obj = item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010109 }
Tim Petersced69f82003-09-16 20:30:58 +000010110
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010111 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010112 if (res == NULL)
10113 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +000010114
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010115 /* Catenate everything. */
Victor Stinnerdd077322011-10-07 17:02:31 +020010116#ifdef Py_DEBUG
10117 use_memcpy = 0;
10118#else
10119 if (use_memcpy) {
10120 res_data = PyUnicode_1BYTE_DATA(res);
10121 kind = PyUnicode_KIND(res);
10122 if (seplen != 0)
10123 sep_data = PyUnicode_1BYTE_DATA(sep);
10124 }
10125#endif
Victor Stinner4560f9c2013-04-14 18:56:46 +020010126 if (use_memcpy) {
10127 for (i = 0; i < seqlen; ++i) {
10128 Py_ssize_t itemlen;
10129 item = items[i];
10130
10131 /* Copy item, and maybe the separator. */
10132 if (i && seplen != 0) {
Christian Heimesf051e432016-09-13 20:22:02 +020010133 memcpy(res_data,
Victor Stinnerdd077322011-10-07 17:02:31 +020010134 sep_data,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010135 kind * seplen);
10136 res_data += kind * seplen;
Victor Stinnerdd077322011-10-07 17:02:31 +020010137 }
Victor Stinner4560f9c2013-04-14 18:56:46 +020010138
10139 itemlen = PyUnicode_GET_LENGTH(item);
10140 if (itemlen != 0) {
Christian Heimesf051e432016-09-13 20:22:02 +020010141 memcpy(res_data,
Victor Stinnerdd077322011-10-07 17:02:31 +020010142 PyUnicode_DATA(item),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010143 kind * itemlen);
10144 res_data += kind * itemlen;
Victor Stinnerdd077322011-10-07 17:02:31 +020010145 }
Victor Stinner4560f9c2013-04-14 18:56:46 +020010146 }
10147 assert(res_data == PyUnicode_1BYTE_DATA(res)
10148 + kind * PyUnicode_GET_LENGTH(res));
10149 }
10150 else {
10151 for (i = 0, res_offset = 0; i < seqlen; ++i) {
10152 Py_ssize_t itemlen;
10153 item = items[i];
10154
10155 /* Copy item, and maybe the separator. */
10156 if (i && seplen != 0) {
10157 _PyUnicode_FastCopyCharacters(res, res_offset, sep, 0, seplen);
10158 res_offset += seplen;
10159 }
10160
10161 itemlen = PyUnicode_GET_LENGTH(item);
10162 if (itemlen != 0) {
Victor Stinnerd3f08822012-05-29 12:57:52 +020010163 _PyUnicode_FastCopyCharacters(res, res_offset, item, 0, itemlen);
Victor Stinnerdd077322011-10-07 17:02:31 +020010164 res_offset += itemlen;
10165 }
Victor Stinner9ce5a832011-10-03 23:36:02 +020010166 }
Victor Stinnerdd077322011-10-07 17:02:31 +020010167 assert(res_offset == PyUnicode_GET_LENGTH(res));
Victor Stinner4560f9c2013-04-14 18:56:46 +020010168 }
Tim Peters8ce9f162004-08-27 01:49:32 +000010169
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010170 Py_XDECREF(sep);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010171 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010172 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010173
Benjamin Peterson29060642009-01-31 22:14:21 +000010174 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010175 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +000010176 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010177 return NULL;
10178}
10179
Victor Stinnerd3f08822012-05-29 12:57:52 +020010180void
10181_PyUnicode_FastFill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10182 Py_UCS4 fill_char)
10183{
10184 const enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
Victor Stinner163403a2018-11-27 12:41:17 +010010185 void *data = PyUnicode_DATA(unicode);
Victor Stinnerd3f08822012-05-29 12:57:52 +020010186 assert(PyUnicode_IS_READY(unicode));
10187 assert(unicode_modifiable(unicode));
10188 assert(fill_char <= PyUnicode_MAX_CHAR_VALUE(unicode));
10189 assert(start >= 0);
10190 assert(start + length <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner59423e32018-11-26 13:40:01 +010010191 unicode_fill(kind, data, fill_char, start, length);
Victor Stinnerd3f08822012-05-29 12:57:52 +020010192}
10193
Victor Stinner3fe55312012-01-04 00:33:50 +010010194Py_ssize_t
10195PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10196 Py_UCS4 fill_char)
10197{
10198 Py_ssize_t maxlen;
Victor Stinner3fe55312012-01-04 00:33:50 +010010199
10200 if (!PyUnicode_Check(unicode)) {
10201 PyErr_BadInternalCall();
10202 return -1;
10203 }
10204 if (PyUnicode_READY(unicode) == -1)
10205 return -1;
10206 if (unicode_check_modifiable(unicode))
10207 return -1;
10208
Victor Stinnerd3f08822012-05-29 12:57:52 +020010209 if (start < 0) {
10210 PyErr_SetString(PyExc_IndexError, "string index out of range");
10211 return -1;
10212 }
Victor Stinner3fe55312012-01-04 00:33:50 +010010213 if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) {
10214 PyErr_SetString(PyExc_ValueError,
10215 "fill character is bigger than "
10216 "the string maximum character");
10217 return -1;
10218 }
10219
10220 maxlen = PyUnicode_GET_LENGTH(unicode) - start;
10221 length = Py_MIN(maxlen, length);
10222 if (length <= 0)
10223 return 0;
10224
Victor Stinnerd3f08822012-05-29 12:57:52 +020010225 _PyUnicode_FastFill(unicode, start, length, fill_char);
Victor Stinner3fe55312012-01-04 00:33:50 +010010226 return length;
10227}
10228
Victor Stinner9310abb2011-10-05 00:59:23 +020010229static PyObject *
10230pad(PyObject *self,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010231 Py_ssize_t left,
10232 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010233 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010234{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010235 PyObject *u;
10236 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010237 int kind;
10238 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010239
10240 if (left < 0)
10241 left = 0;
10242 if (right < 0)
10243 right = 0;
10244
Victor Stinnerc4b49542011-12-11 22:44:26 +010010245 if (left == 0 && right == 0)
10246 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010247
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010248 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
10249 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +000010250 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
10251 return NULL;
10252 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010253 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010254 maxchar = Py_MAX(maxchar, fill);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010255 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010256 if (!u)
10257 return NULL;
10258
10259 kind = PyUnicode_KIND(u);
10260 data = PyUnicode_DATA(u);
10261 if (left)
Victor Stinner59423e32018-11-26 13:40:01 +010010262 unicode_fill(kind, data, fill, 0, left);
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010263 if (right)
Victor Stinner59423e32018-11-26 13:40:01 +010010264 unicode_fill(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinnerd3f08822012-05-29 12:57:52 +020010265 _PyUnicode_FastCopyCharacters(u, left, self, 0, _PyUnicode_LENGTH(self));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010266 assert(_PyUnicode_CheckConsistency(u, 1));
10267 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010268}
10269
Alexander Belopolsky40018472011-02-26 01:02:56 +000010270PyObject *
10271PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010272{
Guido van Rossumd57fd912000-03-10 22:53:23 +000010273 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010274
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010275 if (ensure_unicode(string) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010276 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010277
Benjamin Petersonead6b532011-12-20 17:23:42 -060010278 switch (PyUnicode_KIND(string)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010279 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010280 if (PyUnicode_IS_ASCII(string))
10281 list = asciilib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010282 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010283 PyUnicode_GET_LENGTH(string), keepends);
10284 else
10285 list = ucs1lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010286 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010287 PyUnicode_GET_LENGTH(string), keepends);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010288 break;
10289 case PyUnicode_2BYTE_KIND:
10290 list = ucs2lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010291 string, PyUnicode_2BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010292 PyUnicode_GET_LENGTH(string), keepends);
10293 break;
10294 case PyUnicode_4BYTE_KIND:
10295 list = ucs4lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010296 string, PyUnicode_4BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010297 PyUnicode_GET_LENGTH(string), keepends);
10298 break;
10299 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010300 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010301 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010302 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010303}
10304
Alexander Belopolsky40018472011-02-26 01:02:56 +000010305static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010306split(PyObject *self,
10307 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010308 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010309{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010310 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010311 void *buf1, *buf2;
10312 Py_ssize_t len1, len2;
10313 PyObject* out;
10314
Guido van Rossumd57fd912000-03-10 22:53:23 +000010315 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010316 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010317
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010318 if (PyUnicode_READY(self) == -1)
10319 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010320
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010321 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010322 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010323 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010324 if (PyUnicode_IS_ASCII(self))
10325 return asciilib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010326 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010327 PyUnicode_GET_LENGTH(self), maxcount
10328 );
10329 else
10330 return ucs1lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010331 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010332 PyUnicode_GET_LENGTH(self), maxcount
10333 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010334 case PyUnicode_2BYTE_KIND:
10335 return ucs2lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010336 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010337 PyUnicode_GET_LENGTH(self), maxcount
10338 );
10339 case PyUnicode_4BYTE_KIND:
10340 return ucs4lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010341 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010342 PyUnicode_GET_LENGTH(self), maxcount
10343 );
10344 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010345 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010346 }
10347
10348 if (PyUnicode_READY(substring) == -1)
10349 return NULL;
10350
10351 kind1 = PyUnicode_KIND(self);
10352 kind2 = PyUnicode_KIND(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010353 len1 = PyUnicode_GET_LENGTH(self);
10354 len2 = PyUnicode_GET_LENGTH(substring);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010355 if (kind1 < kind2 || len1 < len2) {
10356 out = PyList_New(1);
10357 if (out == NULL)
10358 return NULL;
10359 Py_INCREF(self);
10360 PyList_SET_ITEM(out, 0, self);
10361 return out;
10362 }
10363 buf1 = PyUnicode_DATA(self);
10364 buf2 = PyUnicode_DATA(substring);
10365 if (kind2 != kind1) {
10366 buf2 = _PyUnicode_AsKind(substring, kind1);
10367 if (!buf2)
10368 return NULL;
10369 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010370
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010371 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010372 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010373 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10374 out = asciilib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010375 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010376 else
10377 out = ucs1lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010378 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010379 break;
10380 case PyUnicode_2BYTE_KIND:
10381 out = ucs2lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010382 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010383 break;
10384 case PyUnicode_4BYTE_KIND:
10385 out = ucs4lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010386 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010387 break;
10388 default:
10389 out = NULL;
10390 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010391 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010392 PyMem_Free(buf2);
10393 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010394}
10395
Alexander Belopolsky40018472011-02-26 01:02:56 +000010396static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010397rsplit(PyObject *self,
10398 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010399 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010400{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010401 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010402 void *buf1, *buf2;
10403 Py_ssize_t len1, len2;
10404 PyObject* out;
10405
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010406 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010407 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010408
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010409 if (PyUnicode_READY(self) == -1)
10410 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010411
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010412 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010413 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010414 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010415 if (PyUnicode_IS_ASCII(self))
10416 return asciilib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010417 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010418 PyUnicode_GET_LENGTH(self), maxcount
10419 );
10420 else
10421 return ucs1lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010422 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010423 PyUnicode_GET_LENGTH(self), maxcount
10424 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010425 case PyUnicode_2BYTE_KIND:
10426 return ucs2lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010427 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010428 PyUnicode_GET_LENGTH(self), maxcount
10429 );
10430 case PyUnicode_4BYTE_KIND:
10431 return ucs4lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010432 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010433 PyUnicode_GET_LENGTH(self), maxcount
10434 );
10435 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010436 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010437 }
10438
10439 if (PyUnicode_READY(substring) == -1)
10440 return NULL;
10441
10442 kind1 = PyUnicode_KIND(self);
10443 kind2 = PyUnicode_KIND(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010444 len1 = PyUnicode_GET_LENGTH(self);
10445 len2 = PyUnicode_GET_LENGTH(substring);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010446 if (kind1 < kind2 || len1 < len2) {
10447 out = PyList_New(1);
10448 if (out == NULL)
10449 return NULL;
10450 Py_INCREF(self);
10451 PyList_SET_ITEM(out, 0, self);
10452 return out;
10453 }
10454 buf1 = PyUnicode_DATA(self);
10455 buf2 = PyUnicode_DATA(substring);
10456 if (kind2 != kind1) {
10457 buf2 = _PyUnicode_AsKind(substring, kind1);
10458 if (!buf2)
10459 return NULL;
10460 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010461
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010462 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010463 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010464 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10465 out = asciilib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010466 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010467 else
10468 out = ucs1lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010469 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010470 break;
10471 case PyUnicode_2BYTE_KIND:
10472 out = ucs2lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010473 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010474 break;
10475 case PyUnicode_4BYTE_KIND:
10476 out = ucs4lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010477 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010478 break;
10479 default:
10480 out = NULL;
10481 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010482 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010483 PyMem_Free(buf2);
10484 return out;
10485}
10486
10487static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010488anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1,
10489 PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010490{
Benjamin Petersonead6b532011-12-20 17:23:42 -060010491 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010492 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010493 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
10494 return asciilib_find(buf1, len1, buf2, len2, offset);
10495 else
10496 return ucs1lib_find(buf1, len1, buf2, len2, offset);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010497 case PyUnicode_2BYTE_KIND:
10498 return ucs2lib_find(buf1, len1, buf2, len2, offset);
10499 case PyUnicode_4BYTE_KIND:
10500 return ucs4lib_find(buf1, len1, buf2, len2, offset);
10501 }
Barry Warsawb2e57942017-09-14 18:13:16 -070010502 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010503}
10504
10505static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010506anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen,
10507 PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010508{
Benjamin Petersonc0b95d12011-12-20 17:24:05 -060010509 switch (kind) {
10510 case PyUnicode_1BYTE_KIND:
10511 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
10512 return asciilib_count(sbuf, slen, buf1, len1, maxcount);
10513 else
10514 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
10515 case PyUnicode_2BYTE_KIND:
10516 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
10517 case PyUnicode_4BYTE_KIND:
10518 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
10519 }
Barry Warsawb2e57942017-09-14 18:13:16 -070010520 Py_UNREACHABLE();
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010521}
10522
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010523static void
10524replace_1char_inplace(PyObject *u, Py_ssize_t pos,
10525 Py_UCS4 u1, Py_UCS4 u2, Py_ssize_t maxcount)
10526{
10527 int kind = PyUnicode_KIND(u);
10528 void *data = PyUnicode_DATA(u);
10529 Py_ssize_t len = PyUnicode_GET_LENGTH(u);
10530 if (kind == PyUnicode_1BYTE_KIND) {
10531 ucs1lib_replace_1char_inplace((Py_UCS1 *)data + pos,
10532 (Py_UCS1 *)data + len,
10533 u1, u2, maxcount);
10534 }
10535 else if (kind == PyUnicode_2BYTE_KIND) {
10536 ucs2lib_replace_1char_inplace((Py_UCS2 *)data + pos,
10537 (Py_UCS2 *)data + len,
10538 u1, u2, maxcount);
10539 }
10540 else {
10541 assert(kind == PyUnicode_4BYTE_KIND);
10542 ucs4lib_replace_1char_inplace((Py_UCS4 *)data + pos,
10543 (Py_UCS4 *)data + len,
10544 u1, u2, maxcount);
10545 }
10546}
10547
Alexander Belopolsky40018472011-02-26 01:02:56 +000010548static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010549replace(PyObject *self, PyObject *str1,
10550 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010551{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010552 PyObject *u;
10553 char *sbuf = PyUnicode_DATA(self);
10554 char *buf1 = PyUnicode_DATA(str1);
10555 char *buf2 = PyUnicode_DATA(str2);
10556 int srelease = 0, release1 = 0, release2 = 0;
10557 int skind = PyUnicode_KIND(self);
10558 int kind1 = PyUnicode_KIND(str1);
10559 int kind2 = PyUnicode_KIND(str2);
10560 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
10561 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
10562 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010563 int mayshrink;
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010564 Py_UCS4 maxchar, maxchar_str1, maxchar_str2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010565
10566 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010567 maxcount = PY_SSIZE_T_MAX;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010568 else if (maxcount == 0 || slen == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010569 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010570
Victor Stinner59de0ee2011-10-07 10:01:28 +020010571 if (str1 == str2)
10572 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010573
Victor Stinner49a0a212011-10-12 23:46:10 +020010574 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010575 maxchar_str1 = PyUnicode_MAX_CHAR_VALUE(str1);
10576 if (maxchar < maxchar_str1)
10577 /* substring too wide to be present */
10578 goto nothing;
Victor Stinner49a0a212011-10-12 23:46:10 +020010579 maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
10580 /* Replacing str1 with str2 may cause a maxchar reduction in the
10581 result string. */
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010582 mayshrink = (maxchar_str2 < maxchar_str1) && (maxchar == maxchar_str1);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010583 maxchar = Py_MAX(maxchar, maxchar_str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010584
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010585 if (len1 == len2) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010586 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010587 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010588 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010589 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010590 /* replace characters */
Victor Stinner49a0a212011-10-12 23:46:10 +020010591 Py_UCS4 u1, u2;
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010592 Py_ssize_t pos;
Victor Stinnerf6441102011-12-18 02:43:08 +010010593
Victor Stinner69ed0f42013-04-09 21:48:24 +020010594 u1 = PyUnicode_READ(kind1, buf1, 0);
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010595 pos = findchar(sbuf, skind, slen, u1, 1);
Victor Stinnerf6441102011-12-18 02:43:08 +010010596 if (pos < 0)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010597 goto nothing;
Victor Stinner69ed0f42013-04-09 21:48:24 +020010598 u2 = PyUnicode_READ(kind2, buf2, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010599 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010600 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010601 goto error;
Victor Stinnerf6441102011-12-18 02:43:08 +010010602
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010603 _PyUnicode_FastCopyCharacters(u, 0, self, 0, slen);
10604 replace_1char_inplace(u, pos, u1, u2, maxcount);
Victor Stinner49a0a212011-10-12 23:46:10 +020010605 }
10606 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010607 int rkind = skind;
10608 char *res;
Victor Stinnerf6441102011-12-18 02:43:08 +010010609 Py_ssize_t i;
Victor Stinner25a4b292011-10-06 12:31:55 +020010610
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010611 if (kind1 < rkind) {
10612 /* widen substring */
10613 buf1 = _PyUnicode_AsKind(str1, rkind);
10614 if (!buf1) goto error;
10615 release1 = 1;
10616 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010617 i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010618 if (i < 0)
10619 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010620 if (rkind > kind2) {
10621 /* widen replacement */
10622 buf2 = _PyUnicode_AsKind(str2, rkind);
10623 if (!buf2) goto error;
10624 release2 = 1;
10625 }
10626 else if (rkind < kind2) {
10627 /* widen self and buf1 */
10628 rkind = kind2;
10629 if (release1) PyMem_Free(buf1);
Antoine Pitrou6d5ad222012-11-17 23:28:17 +010010630 release1 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010631 sbuf = _PyUnicode_AsKind(self, rkind);
10632 if (!sbuf) goto error;
10633 srelease = 1;
10634 buf1 = _PyUnicode_AsKind(str1, rkind);
10635 if (!buf1) goto error;
10636 release1 = 1;
10637 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010638 u = PyUnicode_New(slen, maxchar);
10639 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010640 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010641 assert(PyUnicode_KIND(u) == rkind);
10642 res = PyUnicode_DATA(u);
Victor Stinner25a4b292011-10-06 12:31:55 +020010643
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010644 memcpy(res, sbuf, rkind * slen);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010645 /* change everything in-place, starting with this one */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010646 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010647 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010648 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010649 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010650
10651 while ( --maxcount > 0) {
Victor Stinnerc3cec782011-10-05 21:24:08 +020010652 i = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010653 sbuf+rkind*i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010654 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010655 if (i == -1)
10656 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010657 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010658 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010659 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010660 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010661 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010662 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010663 }
10664 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010665 Py_ssize_t n, i, j, ires;
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010666 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010667 int rkind = skind;
10668 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010669
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010670 if (kind1 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010671 /* widen substring */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010672 buf1 = _PyUnicode_AsKind(str1, rkind);
10673 if (!buf1) goto error;
10674 release1 = 1;
10675 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010676 n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010677 if (n == 0)
10678 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010679 if (kind2 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010680 /* widen replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010681 buf2 = _PyUnicode_AsKind(str2, rkind);
10682 if (!buf2) goto error;
10683 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010684 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010685 else if (kind2 > rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010686 /* widen self and buf1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010687 rkind = kind2;
10688 sbuf = _PyUnicode_AsKind(self, rkind);
10689 if (!sbuf) goto error;
10690 srelease = 1;
10691 if (release1) PyMem_Free(buf1);
Antoine Pitrou6d5ad222012-11-17 23:28:17 +010010692 release1 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010693 buf1 = _PyUnicode_AsKind(str1, rkind);
10694 if (!buf1) goto error;
10695 release1 = 1;
10696 }
10697 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10698 PyUnicode_GET_LENGTH(str1))); */
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010699 if (len1 < len2 && len2 - len1 > (PY_SSIZE_T_MAX - slen) / n) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010700 PyErr_SetString(PyExc_OverflowError,
10701 "replace string is too long");
10702 goto error;
10703 }
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010704 new_size = slen + n * (len2 - len1);
Victor Stinner49a0a212011-10-12 23:46:10 +020010705 if (new_size == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020010706 _Py_INCREF_UNICODE_EMPTY();
10707 if (!unicode_empty)
10708 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010709 u = unicode_empty;
10710 goto done;
10711 }
Xiang Zhangb0541f42017-01-10 10:52:00 +080010712 if (new_size > (PY_SSIZE_T_MAX / rkind)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010713 PyErr_SetString(PyExc_OverflowError,
10714 "replace string is too long");
10715 goto error;
10716 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010717 u = PyUnicode_New(new_size, maxchar);
10718 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010719 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010720 assert(PyUnicode_KIND(u) == rkind);
10721 res = PyUnicode_DATA(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010722 ires = i = 0;
10723 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010724 while (n-- > 0) {
10725 /* look for next match */
Victor Stinnerc3cec782011-10-05 21:24:08 +020010726 j = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010727 sbuf + rkind * i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010728 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010729 if (j == -1)
10730 break;
10731 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010732 /* copy unchanged part [i:j] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010733 memcpy(res + rkind * ires,
10734 sbuf + rkind * i,
10735 rkind * (j-i));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010736 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010737 }
10738 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010739 if (len2 > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010740 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010741 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010742 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010743 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010744 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010745 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010746 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010747 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010748 /* copy tail [i:] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010749 memcpy(res + rkind * ires,
10750 sbuf + rkind * i,
10751 rkind * (slen-i));
Victor Stinner49a0a212011-10-12 23:46:10 +020010752 }
10753 else {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010754 /* interleave */
10755 while (n > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010756 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010757 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010758 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010759 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010760 if (--n <= 0)
10761 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010762 memcpy(res + rkind * ires,
10763 sbuf + rkind * i,
10764 rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010765 ires++;
10766 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010767 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010768 memcpy(res + rkind * ires,
10769 sbuf + rkind * i,
10770 rkind * (slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +000010771 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010772 }
10773
10774 if (mayshrink) {
Victor Stinner25a4b292011-10-06 12:31:55 +020010775 unicode_adjust_maxchar(&u);
10776 if (u == NULL)
10777 goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010778 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010779
10780 done:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010781 if (srelease)
10782 PyMem_FREE(sbuf);
10783 if (release1)
10784 PyMem_FREE(buf1);
10785 if (release2)
10786 PyMem_FREE(buf2);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010787 assert(_PyUnicode_CheckConsistency(u, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010788 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010789
Benjamin Peterson29060642009-01-31 22:14:21 +000010790 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +000010791 /* nothing to replace; return original string (when possible) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010792 if (srelease)
10793 PyMem_FREE(sbuf);
10794 if (release1)
10795 PyMem_FREE(buf1);
10796 if (release2)
10797 PyMem_FREE(buf2);
Victor Stinnerc4b49542011-12-11 22:44:26 +010010798 return unicode_result_unchanged(self);
10799
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010800 error:
10801 if (srelease && sbuf)
10802 PyMem_FREE(sbuf);
10803 if (release1 && buf1)
10804 PyMem_FREE(buf1);
10805 if (release2 && buf2)
10806 PyMem_FREE(buf2);
10807 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010808}
10809
10810/* --- Unicode Object Methods --------------------------------------------- */
10811
INADA Naoki3ae20562017-01-16 20:41:20 +090010812/*[clinic input]
10813str.title as unicode_title
Guido van Rossumd57fd912000-03-10 22:53:23 +000010814
INADA Naoki3ae20562017-01-16 20:41:20 +090010815Return a version of the string where each word is titlecased.
10816
10817More specifically, words start with uppercased characters and all remaining
10818cased characters have lower case.
10819[clinic start generated code]*/
10820
10821static PyObject *
10822unicode_title_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090010823/*[clinic end generated code: output=c75ae03809574902 input=fa945d669b26e683]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000010824{
Benjamin Petersoneea48462012-01-16 14:28:50 -050010825 if (PyUnicode_READY(self) == -1)
10826 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010827 return case_operation(self, do_title);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010828}
10829
INADA Naoki3ae20562017-01-16 20:41:20 +090010830/*[clinic input]
10831str.capitalize as unicode_capitalize
Guido van Rossumd57fd912000-03-10 22:53:23 +000010832
INADA Naoki3ae20562017-01-16 20:41:20 +090010833Return a capitalized version of the string.
10834
10835More specifically, make the first character have upper case and the rest lower
10836case.
10837[clinic start generated code]*/
10838
10839static PyObject *
10840unicode_capitalize_impl(PyObject *self)
10841/*[clinic end generated code: output=e49a4c333cdb7667 input=f4cbf1016938da6d]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000010842{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010843 if (PyUnicode_READY(self) == -1)
10844 return NULL;
10845 if (PyUnicode_GET_LENGTH(self) == 0)
10846 return unicode_result_unchanged(self);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010847 return case_operation(self, do_capitalize);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010848}
10849
INADA Naoki3ae20562017-01-16 20:41:20 +090010850/*[clinic input]
10851str.casefold as unicode_casefold
10852
10853Return a version of the string suitable for caseless comparisons.
10854[clinic start generated code]*/
Benjamin Petersond5890c82012-01-14 13:23:30 -050010855
10856static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090010857unicode_casefold_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090010858/*[clinic end generated code: output=0120daf657ca40af input=384d66cc2ae30daf]*/
Benjamin Petersond5890c82012-01-14 13:23:30 -050010859{
10860 if (PyUnicode_READY(self) == -1)
10861 return NULL;
10862 if (PyUnicode_IS_ASCII(self))
10863 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010864 return case_operation(self, do_casefold);
Benjamin Petersond5890c82012-01-14 13:23:30 -050010865}
10866
10867
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010868/* Argument converter. Accepts a single Unicode character. */
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010869
10870static int
10871convert_uc(PyObject *obj, void *addr)
10872{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010873 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010874
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010875 if (!PyUnicode_Check(obj)) {
10876 PyErr_Format(PyExc_TypeError,
10877 "The fill character must be a unicode character, "
Victor Stinner998b8062018-09-12 00:23:25 +020010878 "not %.100s", Py_TYPE(obj)->tp_name);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010879 return 0;
10880 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010881 if (PyUnicode_READY(obj) < 0)
10882 return 0;
10883 if (PyUnicode_GET_LENGTH(obj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010884 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010885 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010886 return 0;
10887 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010888 *fillcharloc = PyUnicode_READ_CHAR(obj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010889 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010890}
10891
INADA Naoki3ae20562017-01-16 20:41:20 +090010892/*[clinic input]
10893str.center as unicode_center
10894
10895 width: Py_ssize_t
10896 fillchar: Py_UCS4 = ' '
10897 /
10898
10899Return a centered string of length width.
10900
10901Padding is done using the specified fill character (default is a space).
10902[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000010903
10904static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090010905unicode_center_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
10906/*[clinic end generated code: output=420c8859effc7c0c input=b42b247eb26e6519]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000010907{
Martin v. Löwis18e16552006-02-15 17:27:45 +000010908 Py_ssize_t marg, left;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010909
Benjamin Petersonbac79492012-01-14 13:34:47 -050010910 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010911 return NULL;
10912
Victor Stinnerc4b49542011-12-11 22:44:26 +010010913 if (PyUnicode_GET_LENGTH(self) >= width)
10914 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010915
Victor Stinnerc4b49542011-12-11 22:44:26 +010010916 marg = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010917 left = marg / 2 + (marg & width & 1);
10918
Victor Stinner9310abb2011-10-05 00:59:23 +020010919 return pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010920}
10921
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010922/* This function assumes that str1 and str2 are readied by the caller. */
10923
Marc-André Lemburge5034372000-08-08 08:04:29 +000010924static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010925unicode_compare(PyObject *str1, PyObject *str2)
Marc-André Lemburge5034372000-08-08 08:04:29 +000010926{
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010927#define COMPARE(TYPE1, TYPE2) \
10928 do { \
10929 TYPE1* p1 = (TYPE1 *)data1; \
10930 TYPE2* p2 = (TYPE2 *)data2; \
10931 TYPE1* end = p1 + len; \
10932 Py_UCS4 c1, c2; \
10933 for (; p1 != end; p1++, p2++) { \
10934 c1 = *p1; \
10935 c2 = *p2; \
10936 if (c1 != c2) \
10937 return (c1 < c2) ? -1 : 1; \
10938 } \
10939 } \
10940 while (0)
10941
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010942 int kind1, kind2;
10943 void *data1, *data2;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010944 Py_ssize_t len1, len2, len;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010945
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010946 kind1 = PyUnicode_KIND(str1);
10947 kind2 = PyUnicode_KIND(str2);
10948 data1 = PyUnicode_DATA(str1);
10949 data2 = PyUnicode_DATA(str2);
10950 len1 = PyUnicode_GET_LENGTH(str1);
10951 len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner770e19e2012-10-04 22:59:45 +020010952 len = Py_MIN(len1, len2);
Marc-André Lemburge5034372000-08-08 08:04:29 +000010953
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010954 switch(kind1) {
10955 case PyUnicode_1BYTE_KIND:
10956 {
10957 switch(kind2) {
10958 case PyUnicode_1BYTE_KIND:
10959 {
10960 int cmp = memcmp(data1, data2, len);
10961 /* normalize result of memcmp() into the range [-1; 1] */
10962 if (cmp < 0)
10963 return -1;
10964 if (cmp > 0)
10965 return 1;
10966 break;
Victor Stinner770e19e2012-10-04 22:59:45 +020010967 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010968 case PyUnicode_2BYTE_KIND:
10969 COMPARE(Py_UCS1, Py_UCS2);
10970 break;
10971 case PyUnicode_4BYTE_KIND:
10972 COMPARE(Py_UCS1, Py_UCS4);
10973 break;
10974 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010975 Py_UNREACHABLE();
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010976 }
10977 break;
10978 }
10979 case PyUnicode_2BYTE_KIND:
10980 {
10981 switch(kind2) {
10982 case PyUnicode_1BYTE_KIND:
10983 COMPARE(Py_UCS2, Py_UCS1);
10984 break;
10985 case PyUnicode_2BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020010986 {
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010987 COMPARE(Py_UCS2, Py_UCS2);
10988 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020010989 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010990 case PyUnicode_4BYTE_KIND:
10991 COMPARE(Py_UCS2, Py_UCS4);
10992 break;
10993 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010994 Py_UNREACHABLE();
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010995 }
10996 break;
10997 }
10998 case PyUnicode_4BYTE_KIND:
10999 {
11000 switch(kind2) {
11001 case PyUnicode_1BYTE_KIND:
11002 COMPARE(Py_UCS4, Py_UCS1);
11003 break;
11004 case PyUnicode_2BYTE_KIND:
11005 COMPARE(Py_UCS4, Py_UCS2);
11006 break;
11007 case PyUnicode_4BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020011008 {
11009#if defined(HAVE_WMEMCMP) && SIZEOF_WCHAR_T == 4
11010 int cmp = wmemcmp((wchar_t *)data1, (wchar_t *)data2, len);
11011 /* normalize result of wmemcmp() into the range [-1; 1] */
11012 if (cmp < 0)
11013 return -1;
11014 if (cmp > 0)
11015 return 1;
11016#else
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011017 COMPARE(Py_UCS4, Py_UCS4);
Victor Stinnercd777ea2013-04-08 22:43:44 +020011018#endif
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011019 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020011020 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011021 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011022 Py_UNREACHABLE();
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011023 }
11024 break;
11025 }
11026 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011027 Py_UNREACHABLE();
Marc-André Lemburge5034372000-08-08 08:04:29 +000011028 }
11029
Victor Stinner770e19e2012-10-04 22:59:45 +020011030 if (len1 == len2)
11031 return 0;
11032 if (len1 < len2)
11033 return -1;
11034 else
11035 return 1;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011036
11037#undef COMPARE
Marc-André Lemburge5034372000-08-08 08:04:29 +000011038}
11039
Benjamin Peterson621b4302016-09-09 13:54:34 -070011040static int
Victor Stinnere5567ad2012-10-23 02:48:49 +020011041unicode_compare_eq(PyObject *str1, PyObject *str2)
11042{
11043 int kind;
11044 void *data1, *data2;
11045 Py_ssize_t len;
11046 int cmp;
11047
Victor Stinnere5567ad2012-10-23 02:48:49 +020011048 len = PyUnicode_GET_LENGTH(str1);
11049 if (PyUnicode_GET_LENGTH(str2) != len)
11050 return 0;
11051 kind = PyUnicode_KIND(str1);
11052 if (PyUnicode_KIND(str2) != kind)
11053 return 0;
11054 data1 = PyUnicode_DATA(str1);
11055 data2 = PyUnicode_DATA(str2);
11056
11057 cmp = memcmp(data1, data2, len * kind);
11058 return (cmp == 0);
11059}
11060
11061
Alexander Belopolsky40018472011-02-26 01:02:56 +000011062int
11063PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011064{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011065 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
11066 if (PyUnicode_READY(left) == -1 ||
11067 PyUnicode_READY(right) == -1)
11068 return -1;
Victor Stinnerf0c7b2a2013-11-04 11:27:14 +010011069
11070 /* a string is equal to itself */
11071 if (left == right)
11072 return 0;
11073
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011074 return unicode_compare(left, right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011075 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +000011076 PyErr_Format(PyExc_TypeError,
11077 "Can't compare %.100s and %.100s",
11078 left->ob_type->tp_name,
11079 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011080 return -1;
11081}
11082
Martin v. Löwis5b222132007-06-10 09:51:05 +000011083int
11084PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
11085{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011086 Py_ssize_t i;
11087 int kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011088 Py_UCS4 chr;
Serhiy Storchaka419967b2016-12-06 00:13:34 +020011089 const unsigned char *ustr = (const unsigned char *)str;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011090
Victor Stinner910337b2011-10-03 03:20:16 +020011091 assert(_PyUnicode_CHECK(uni));
Serhiy Storchaka419967b2016-12-06 00:13:34 +020011092 if (!PyUnicode_IS_READY(uni)) {
11093 const wchar_t *ws = _PyUnicode_WSTR(uni);
11094 /* Compare Unicode string and source character set string */
11095 for (i = 0; (chr = ws[i]) && ustr[i]; i++) {
11096 if (chr != ustr[i])
11097 return (chr < ustr[i]) ? -1 : 1;
11098 }
11099 /* This check keeps Python strings that end in '\0' from comparing equal
11100 to C strings identical up to that point. */
11101 if (_PyUnicode_WSTR_LENGTH(uni) != i || chr)
11102 return 1; /* uni is longer */
11103 if (ustr[i])
11104 return -1; /* str is longer */
11105 return 0;
11106 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011107 kind = PyUnicode_KIND(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010011108 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnera6b9b072013-10-30 18:27:13 +010011109 const void *data = PyUnicode_1BYTE_DATA(uni);
Victor Stinnere1b15922013-11-03 13:53:12 +010011110 size_t len1 = (size_t)PyUnicode_GET_LENGTH(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010011111 size_t len, len2 = strlen(str);
11112 int cmp;
11113
11114 len = Py_MIN(len1, len2);
11115 cmp = memcmp(data, str, len);
Victor Stinner21ea21e2013-11-04 11:28:26 +010011116 if (cmp != 0) {
11117 if (cmp < 0)
11118 return -1;
11119 else
11120 return 1;
11121 }
Victor Stinner602f7cf2013-10-29 23:31:50 +010011122 if (len1 > len2)
11123 return 1; /* uni is longer */
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011124 if (len1 < len2)
Victor Stinner602f7cf2013-10-29 23:31:50 +010011125 return -1; /* str is longer */
11126 return 0;
11127 }
11128 else {
11129 void *data = PyUnicode_DATA(uni);
11130 /* Compare Unicode string and source character set string */
11131 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
Victor Stinner12174a52014-08-15 23:17:38 +020011132 if (chr != (unsigned char)str[i])
Victor Stinner602f7cf2013-10-29 23:31:50 +010011133 return (chr < (unsigned char)(str[i])) ? -1 : 1;
11134 /* This check keeps Python strings that end in '\0' from comparing equal
11135 to C strings identical up to that point. */
11136 if (PyUnicode_GET_LENGTH(uni) != i || chr)
11137 return 1; /* uni is longer */
11138 if (str[i])
11139 return -1; /* str is longer */
11140 return 0;
11141 }
Martin v. Löwis5b222132007-06-10 09:51:05 +000011142}
11143
Serhiy Storchakaf4934ea2016-11-16 10:17:58 +020011144static int
11145non_ready_unicode_equal_to_ascii_string(PyObject *unicode, const char *str)
11146{
11147 size_t i, len;
11148 const wchar_t *p;
11149 len = (size_t)_PyUnicode_WSTR_LENGTH(unicode);
11150 if (strlen(str) != len)
11151 return 0;
11152 p = _PyUnicode_WSTR(unicode);
11153 assert(p);
11154 for (i = 0; i < len; i++) {
11155 unsigned char c = (unsigned char)str[i];
Serhiy Storchaka292dd1b2016-11-16 16:12:34 +020011156 if (c >= 128 || p[i] != (wchar_t)c)
Serhiy Storchakaf4934ea2016-11-16 10:17:58 +020011157 return 0;
11158 }
11159 return 1;
11160}
11161
11162int
11163_PyUnicode_EqualToASCIIString(PyObject *unicode, const char *str)
11164{
11165 size_t len;
11166 assert(_PyUnicode_CHECK(unicode));
Serhiy Storchakaa83a6a32016-11-16 20:02:44 +020011167 assert(str);
11168#ifndef NDEBUG
11169 for (const char *p = str; *p; p++) {
11170 assert((unsigned char)*p < 128);
11171 }
11172#endif
Serhiy Storchakaf4934ea2016-11-16 10:17:58 +020011173 if (PyUnicode_READY(unicode) == -1) {
11174 /* Memory error or bad data */
11175 PyErr_Clear();
11176 return non_ready_unicode_equal_to_ascii_string(unicode, str);
11177 }
11178 if (!PyUnicode_IS_ASCII(unicode))
11179 return 0;
11180 len = (size_t)PyUnicode_GET_LENGTH(unicode);
11181 return strlen(str) == len &&
11182 memcmp(PyUnicode_1BYTE_DATA(unicode), str, len) == 0;
11183}
11184
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011185int
11186_PyUnicode_EqualToASCIIId(PyObject *left, _Py_Identifier *right)
11187{
11188 PyObject *right_uni;
11189 Py_hash_t hash;
11190
11191 assert(_PyUnicode_CHECK(left));
11192 assert(right->string);
Serhiy Storchakaa83a6a32016-11-16 20:02:44 +020011193#ifndef NDEBUG
11194 for (const char *p = right->string; *p; p++) {
11195 assert((unsigned char)*p < 128);
11196 }
11197#endif
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011198
11199 if (PyUnicode_READY(left) == -1) {
11200 /* memory error or bad data */
11201 PyErr_Clear();
11202 return non_ready_unicode_equal_to_ascii_string(left, right->string);
11203 }
11204
11205 if (!PyUnicode_IS_ASCII(left))
11206 return 0;
11207
11208 right_uni = _PyUnicode_FromId(right); /* borrowed */
11209 if (right_uni == NULL) {
11210 /* memory error or bad data */
11211 PyErr_Clear();
11212 return _PyUnicode_EqualToASCIIString(left, right->string);
11213 }
11214
11215 if (left == right_uni)
11216 return 1;
11217
11218 if (PyUnicode_CHECK_INTERNED(left))
11219 return 0;
11220
INADA Naoki7cc95f52018-01-28 02:07:09 +090011221 assert(_PyUnicode_HASH(right_uni) != -1);
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011222 hash = _PyUnicode_HASH(left);
11223 if (hash != -1 && hash != _PyUnicode_HASH(right_uni))
11224 return 0;
11225
11226 return unicode_compare_eq(left, right_uni);
11227}
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000011228
Alexander Belopolsky40018472011-02-26 01:02:56 +000011229PyObject *
11230PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011231{
11232 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011233
Victor Stinnere5567ad2012-10-23 02:48:49 +020011234 if (!PyUnicode_Check(left) || !PyUnicode_Check(right))
11235 Py_RETURN_NOTIMPLEMENTED;
11236
11237 if (PyUnicode_READY(left) == -1 ||
11238 PyUnicode_READY(right) == -1)
11239 return NULL;
11240
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010011241 if (left == right) {
11242 switch (op) {
11243 case Py_EQ:
11244 case Py_LE:
11245 case Py_GE:
11246 /* a string is equal to itself */
stratakise8b19652017-11-02 11:32:54 +010011247 Py_RETURN_TRUE;
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010011248 case Py_NE:
11249 case Py_LT:
11250 case Py_GT:
stratakise8b19652017-11-02 11:32:54 +010011251 Py_RETURN_FALSE;
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010011252 default:
11253 PyErr_BadArgument();
11254 return NULL;
11255 }
11256 }
11257 else if (op == Py_EQ || op == Py_NE) {
Victor Stinnere5567ad2012-10-23 02:48:49 +020011258 result = unicode_compare_eq(left, right);
Victor Stinnerc8bc5372013-11-04 11:08:10 +010011259 result ^= (op == Py_NE);
stratakise8b19652017-11-02 11:32:54 +010011260 return PyBool_FromLong(result);
Victor Stinnere5567ad2012-10-23 02:48:49 +020011261 }
11262 else {
Victor Stinner90db9c42012-10-04 21:53:50 +020011263 result = unicode_compare(left, right);
stratakise8b19652017-11-02 11:32:54 +010011264 Py_RETURN_RICHCOMPARE(result, 0, op);
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011265 }
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011266}
11267
Alexander Belopolsky40018472011-02-26 01:02:56 +000011268int
Raymond Hettingerac2ef652015-07-04 16:04:44 -070011269_PyUnicode_EQ(PyObject *aa, PyObject *bb)
11270{
11271 return unicode_eq(aa, bb);
11272}
11273
11274int
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011275PyUnicode_Contains(PyObject *str, PyObject *substr)
Guido van Rossum403d68b2000-03-13 15:55:09 +000011276{
Victor Stinner77282cb2013-04-14 19:22:47 +020011277 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011278 void *buf1, *buf2;
11279 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011280 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011281
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011282 if (!PyUnicode_Check(substr)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011283 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020011284 "'in <string>' requires string as left operand, not %.100s",
11285 Py_TYPE(substr)->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011286 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011287 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011288 if (PyUnicode_READY(substr) == -1)
Thomas Wouters477c8d52006-05-27 19:21:47 +000011289 return -1;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011290 if (ensure_unicode(str) < 0)
11291 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011292
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011293 kind1 = PyUnicode_KIND(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011294 kind2 = PyUnicode_KIND(substr);
11295 if (kind1 < kind2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011296 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011297 len1 = PyUnicode_GET_LENGTH(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011298 len2 = PyUnicode_GET_LENGTH(substr);
11299 if (len1 < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011300 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011301 buf1 = PyUnicode_DATA(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011302 buf2 = PyUnicode_DATA(substr);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011303 if (len2 == 1) {
11304 Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
11305 result = findchar((const char *)buf1, kind1, len1, ch, 1) != -1;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011306 return result;
11307 }
11308 if (kind2 != kind1) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011309 buf2 = _PyUnicode_AsKind(substr, kind1);
11310 if (!buf2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011311 return -1;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011312 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011313
Victor Stinner77282cb2013-04-14 19:22:47 +020011314 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011315 case PyUnicode_1BYTE_KIND:
11316 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
11317 break;
11318 case PyUnicode_2BYTE_KIND:
11319 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
11320 break;
11321 case PyUnicode_4BYTE_KIND:
11322 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
11323 break;
11324 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011325 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011326 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011327
Victor Stinner77282cb2013-04-14 19:22:47 +020011328 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011329 PyMem_Free(buf2);
11330
Guido van Rossum403d68b2000-03-13 15:55:09 +000011331 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011332}
11333
Guido van Rossumd57fd912000-03-10 22:53:23 +000011334/* Concat to string or Unicode object giving a new Unicode object. */
11335
Alexander Belopolsky40018472011-02-26 01:02:56 +000011336PyObject *
11337PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011338{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011339 PyObject *result;
Victor Stinner127226b2011-10-13 01:12:34 +020011340 Py_UCS4 maxchar, maxchar2;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011341 Py_ssize_t left_len, right_len, new_len;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011342
Serhiy Storchaka004e03f2017-03-19 19:38:42 +020011343 if (ensure_unicode(left) < 0)
11344 return NULL;
11345
11346 if (!PyUnicode_Check(right)) {
11347 PyErr_Format(PyExc_TypeError,
11348 "can only concatenate str (not \"%.200s\") to str",
11349 right->ob_type->tp_name);
11350 return NULL;
11351 }
11352 if (PyUnicode_READY(right) < 0)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011353 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011354
11355 /* Shortcuts */
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011356 if (left == unicode_empty)
11357 return PyUnicode_FromObject(right);
11358 if (right == unicode_empty)
11359 return PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011360
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011361 left_len = PyUnicode_GET_LENGTH(left);
11362 right_len = PyUnicode_GET_LENGTH(right);
11363 if (left_len > PY_SSIZE_T_MAX - right_len) {
Victor Stinner488fa492011-12-12 00:01:39 +010011364 PyErr_SetString(PyExc_OverflowError,
11365 "strings are too large to concat");
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011366 return NULL;
Victor Stinner488fa492011-12-12 00:01:39 +010011367 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011368 new_len = left_len + right_len;
Victor Stinner488fa492011-12-12 00:01:39 +010011369
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011370 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11371 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Benjamin Peterson7e303732013-06-10 09:19:46 -070011372 maxchar = Py_MAX(maxchar, maxchar2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011373
Guido van Rossumd57fd912000-03-10 22:53:23 +000011374 /* Concat the two Unicode strings */
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011375 result = PyUnicode_New(new_len, maxchar);
11376 if (result == NULL)
11377 return NULL;
11378 _PyUnicode_FastCopyCharacters(result, 0, left, 0, left_len);
11379 _PyUnicode_FastCopyCharacters(result, left_len, right, 0, right_len);
11380 assert(_PyUnicode_CheckConsistency(result, 1));
11381 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011382}
11383
Walter Dörwald1ab83302007-05-18 17:15:44 +000011384void
Victor Stinner23e56682011-10-03 03:54:37 +020011385PyUnicode_Append(PyObject **p_left, PyObject *right)
Walter Dörwald1ab83302007-05-18 17:15:44 +000011386{
Victor Stinner23e56682011-10-03 03:54:37 +020011387 PyObject *left, *res;
Victor Stinner488fa492011-12-12 00:01:39 +010011388 Py_UCS4 maxchar, maxchar2;
11389 Py_ssize_t left_len, right_len, new_len;
Victor Stinner23e56682011-10-03 03:54:37 +020011390
11391 if (p_left == NULL) {
11392 if (!PyErr_Occurred())
11393 PyErr_BadInternalCall();
Benjamin Peterson14339b62009-01-31 16:36:08 +000011394 return;
11395 }
Victor Stinner23e56682011-10-03 03:54:37 +020011396 left = *p_left;
Victor Stinnerf0335102013-04-14 19:13:03 +020011397 if (right == NULL || left == NULL
11398 || !PyUnicode_Check(left) || !PyUnicode_Check(right)) {
Victor Stinner23e56682011-10-03 03:54:37 +020011399 if (!PyErr_Occurred())
11400 PyErr_BadInternalCall();
11401 goto error;
11402 }
11403
Benjamin Petersonbac79492012-01-14 13:34:47 -050011404 if (PyUnicode_READY(left) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011405 goto error;
Benjamin Petersonbac79492012-01-14 13:34:47 -050011406 if (PyUnicode_READY(right) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011407 goto error;
11408
Victor Stinner488fa492011-12-12 00:01:39 +010011409 /* Shortcuts */
11410 if (left == unicode_empty) {
11411 Py_DECREF(left);
11412 Py_INCREF(right);
11413 *p_left = right;
11414 return;
11415 }
11416 if (right == unicode_empty)
11417 return;
11418
11419 left_len = PyUnicode_GET_LENGTH(left);
11420 right_len = PyUnicode_GET_LENGTH(right);
11421 if (left_len > PY_SSIZE_T_MAX - right_len) {
11422 PyErr_SetString(PyExc_OverflowError,
11423 "strings are too large to concat");
11424 goto error;
11425 }
11426 new_len = left_len + right_len;
11427
11428 if (unicode_modifiable(left)
11429 && PyUnicode_CheckExact(right)
11430 && PyUnicode_KIND(right) <= PyUnicode_KIND(left)
Victor Stinnerb0923652011-10-04 01:17:31 +020011431 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
11432 to change the structure size, but characters are stored just after
Georg Brandl7597add2011-10-05 16:36:47 +020011433 the structure, and so it requires to move all characters which is
Victor Stinnerb0923652011-10-04 01:17:31 +020011434 not so different than duplicating the string. */
Victor Stinner488fa492011-12-12 00:01:39 +010011435 && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
11436 {
11437 /* append inplace */
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011438 if (unicode_resize(p_left, new_len) != 0)
Victor Stinner488fa492011-12-12 00:01:39 +010011439 goto error;
Victor Stinnerf0335102013-04-14 19:13:03 +020011440
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011441 /* copy 'right' into the newly allocated area of 'left' */
11442 _PyUnicode_FastCopyCharacters(*p_left, left_len, right, 0, right_len);
Victor Stinner23e56682011-10-03 03:54:37 +020011443 }
Victor Stinner488fa492011-12-12 00:01:39 +010011444 else {
11445 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11446 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Benjamin Peterson7e303732013-06-10 09:19:46 -070011447 maxchar = Py_MAX(maxchar, maxchar2);
Victor Stinner23e56682011-10-03 03:54:37 +020011448
Victor Stinner488fa492011-12-12 00:01:39 +010011449 /* Concat the two Unicode strings */
11450 res = PyUnicode_New(new_len, maxchar);
11451 if (res == NULL)
11452 goto error;
Victor Stinnerd3f08822012-05-29 12:57:52 +020011453 _PyUnicode_FastCopyCharacters(res, 0, left, 0, left_len);
11454 _PyUnicode_FastCopyCharacters(res, left_len, right, 0, right_len);
Victor Stinner488fa492011-12-12 00:01:39 +010011455 Py_DECREF(left);
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011456 *p_left = res;
Victor Stinner488fa492011-12-12 00:01:39 +010011457 }
11458 assert(_PyUnicode_CheckConsistency(*p_left, 1));
Victor Stinner23e56682011-10-03 03:54:37 +020011459 return;
11460
11461error:
Victor Stinner488fa492011-12-12 00:01:39 +010011462 Py_CLEAR(*p_left);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011463}
11464
11465void
11466PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
11467{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011468 PyUnicode_Append(pleft, right);
11469 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011470}
11471
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011472/*
11473Wraps stringlib_parse_args_finds() and additionally ensures that the
11474first argument is a unicode object.
11475*/
11476
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070011477static inline int
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011478parse_args_finds_unicode(const char * function_name, PyObject *args,
11479 PyObject **substring,
11480 Py_ssize_t *start, Py_ssize_t *end)
11481{
11482 if(stringlib_parse_args_finds(function_name, args, substring,
11483 start, end)) {
11484 if (ensure_unicode(*substring) < 0)
11485 return 0;
11486 return 1;
11487 }
11488 return 0;
11489}
11490
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011491PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011492 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011493\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011494Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011495string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011496interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011497
11498static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011499unicode_count(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011500{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011501 PyObject *substring = NULL; /* initialize to fix a compiler warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +000011502 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011503 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011504 PyObject *result;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011505 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011506 void *buf1, *buf2;
11507 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011508
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011509 if (!parse_args_finds_unicode("count", args, &substring, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000011510 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000011511
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011512 kind1 = PyUnicode_KIND(self);
11513 kind2 = PyUnicode_KIND(substring);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011514 if (kind1 < kind2)
Benjamin Petersonb63f49f2012-05-03 18:31:07 -040011515 return PyLong_FromLong(0);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011516
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011517 len1 = PyUnicode_GET_LENGTH(self);
11518 len2 = PyUnicode_GET_LENGTH(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011519 ADJUST_INDICES(start, end, len1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011520 if (end - start < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011521 return PyLong_FromLong(0);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011522
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011523 buf1 = PyUnicode_DATA(self);
11524 buf2 = PyUnicode_DATA(substring);
11525 if (kind2 != kind1) {
11526 buf2 = _PyUnicode_AsKind(substring, kind1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011527 if (!buf2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011528 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011529 }
11530 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011531 case PyUnicode_1BYTE_KIND:
11532 iresult = ucs1lib_count(
11533 ((Py_UCS1*)buf1) + start, end - start,
11534 buf2, len2, PY_SSIZE_T_MAX
11535 );
11536 break;
11537 case PyUnicode_2BYTE_KIND:
11538 iresult = ucs2lib_count(
11539 ((Py_UCS2*)buf1) + start, end - start,
11540 buf2, len2, PY_SSIZE_T_MAX
11541 );
11542 break;
11543 case PyUnicode_4BYTE_KIND:
11544 iresult = ucs4lib_count(
11545 ((Py_UCS4*)buf1) + start, end - start,
11546 buf2, len2, PY_SSIZE_T_MAX
11547 );
11548 break;
11549 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011550 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011551 }
11552
11553 result = PyLong_FromSsize_t(iresult);
11554
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011555 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011556 PyMem_Free(buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011557
Guido van Rossumd57fd912000-03-10 22:53:23 +000011558 return result;
11559}
11560
INADA Naoki3ae20562017-01-16 20:41:20 +090011561/*[clinic input]
11562str.encode as unicode_encode
11563
11564 encoding: str(c_default="NULL") = 'utf-8'
11565 The encoding in which to encode the string.
11566 errors: str(c_default="NULL") = 'strict'
11567 The error handling scheme to use for encoding errors.
11568 The default is 'strict' meaning that encoding errors raise a
11569 UnicodeEncodeError. Other possible values are 'ignore', 'replace' and
11570 'xmlcharrefreplace' as well as any other name registered with
11571 codecs.register_error that can handle UnicodeEncodeErrors.
11572
11573Encode the string using the codec registered for encoding.
11574[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011575
11576static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090011577unicode_encode_impl(PyObject *self, const char *encoding, const char *errors)
INADA Naoki15f94592017-01-16 21:49:13 +090011578/*[clinic end generated code: output=bf78b6e2a9470e3c input=f0a9eb293d08fe02]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011579{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011580 return PyUnicode_AsEncodedString(self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +000011581}
11582
INADA Naoki3ae20562017-01-16 20:41:20 +090011583/*[clinic input]
11584str.expandtabs as unicode_expandtabs
Guido van Rossumd57fd912000-03-10 22:53:23 +000011585
INADA Naoki3ae20562017-01-16 20:41:20 +090011586 tabsize: int = 8
11587
11588Return a copy where all tab characters are expanded using spaces.
11589
11590If tabsize is not given, a tab size of 8 characters is assumed.
11591[clinic start generated code]*/
11592
11593static PyObject *
11594unicode_expandtabs_impl(PyObject *self, int tabsize)
11595/*[clinic end generated code: output=3457c5dcee26928f input=8a01914034af4c85]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011596{
Antoine Pitroue71d5742011-10-04 15:55:09 +020011597 Py_ssize_t i, j, line_pos, src_len, incr;
11598 Py_UCS4 ch;
11599 PyObject *u;
11600 void *src_data, *dest_data;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011601 int kind;
Antoine Pitroue19aa382011-10-04 16:04:01 +020011602 int found;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011603
Antoine Pitrou22425222011-10-04 19:10:51 +020011604 if (PyUnicode_READY(self) == -1)
11605 return NULL;
11606
Thomas Wouters7e474022000-07-16 12:04:32 +000011607 /* First pass: determine size of output string */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011608 src_len = PyUnicode_GET_LENGTH(self);
11609 i = j = line_pos = 0;
11610 kind = PyUnicode_KIND(self);
11611 src_data = PyUnicode_DATA(self);
Antoine Pitroue19aa382011-10-04 16:04:01 +020011612 found = 0;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011613 for (; i < src_len; i++) {
11614 ch = PyUnicode_READ(kind, src_data, i);
11615 if (ch == '\t') {
Antoine Pitroue19aa382011-10-04 16:04:01 +020011616 found = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000011617 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011618 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
Benjamin Peterson29060642009-01-31 22:14:21 +000011619 if (j > PY_SSIZE_T_MAX - incr)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011620 goto overflow;
11621 line_pos += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011622 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011623 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011624 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011625 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000011626 if (j > PY_SSIZE_T_MAX - 1)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011627 goto overflow;
11628 line_pos++;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011629 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011630 if (ch == '\n' || ch == '\r')
11631 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011632 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011633 }
Victor Stinnerc4b49542011-12-11 22:44:26 +010011634 if (!found)
11635 return unicode_result_unchanged(self);
Guido van Rossumcd16bf62007-06-13 18:07:49 +000011636
Guido van Rossumd57fd912000-03-10 22:53:23 +000011637 /* Second pass: create output string and fill it */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011638 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011639 if (!u)
11640 return NULL;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011641 dest_data = PyUnicode_DATA(u);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011642
Antoine Pitroue71d5742011-10-04 15:55:09 +020011643 i = j = line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011644
Antoine Pitroue71d5742011-10-04 15:55:09 +020011645 for (; i < src_len; i++) {
11646 ch = PyUnicode_READ(kind, src_data, i);
11647 if (ch == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000011648 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011649 incr = tabsize - (line_pos % tabsize);
11650 line_pos += incr;
Victor Stinner59423e32018-11-26 13:40:01 +010011651 unicode_fill(kind, dest_data, ' ', j, incr);
Victor Stinnerda79e632012-02-22 13:37:04 +010011652 j += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011653 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011654 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011655 else {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011656 line_pos++;
11657 PyUnicode_WRITE(kind, dest_data, j, ch);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011658 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011659 if (ch == '\n' || ch == '\r')
11660 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011661 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011662 }
11663 assert (j == PyUnicode_GET_LENGTH(u));
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010011664 return unicode_result(u);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011665
Antoine Pitroue71d5742011-10-04 15:55:09 +020011666 overflow:
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011667 PyErr_SetString(PyExc_OverflowError, "new string is too long");
11668 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011669}
11670
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011671PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011672 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011673\n\
11674Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080011675such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011676arguments start and end are interpreted as in slice notation.\n\
11677\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011678Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011679
11680static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011681unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011682{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011683 /* initialize variables to prevent gcc warning */
11684 PyObject *substring = NULL;
11685 Py_ssize_t start = 0;
11686 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011687 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011688
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011689 if (!parse_args_finds_unicode("find", args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011690 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011691
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011692 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011693 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011694
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011695 result = any_find_slice(self, substring, start, end, 1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011696
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011697 if (result == -2)
11698 return NULL;
11699
Christian Heimes217cfd12007-12-02 14:31:20 +000011700 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011701}
11702
11703static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011704unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011705{
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011706 void *data;
11707 enum PyUnicode_Kind kind;
11708 Py_UCS4 ch;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011709
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +030011710 if (!PyUnicode_Check(self)) {
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011711 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011712 return NULL;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011713 }
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +030011714 if (PyUnicode_READY(self) == -1) {
11715 return NULL;
11716 }
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011717 if (index < 0 || index >= PyUnicode_GET_LENGTH(self)) {
11718 PyErr_SetString(PyExc_IndexError, "string index out of range");
11719 return NULL;
11720 }
11721 kind = PyUnicode_KIND(self);
11722 data = PyUnicode_DATA(self);
11723 ch = PyUnicode_READ(kind, data, index);
Victor Stinner985a82a2014-01-03 12:53:47 +010011724 return unicode_char(ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011725}
11726
Guido van Rossumc2504932007-09-18 19:42:40 +000011727/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +010011728 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +000011729static Py_hash_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011730unicode_hash(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011731{
Gregory P. Smith27cbcd62012-12-10 18:15:46 -080011732 Py_uhash_t x; /* Unsigned for defined overflow behavior. */
Guido van Rossumc2504932007-09-18 19:42:40 +000011733
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011734#ifdef Py_DEBUG
Benjamin Peterson69e97272012-02-21 11:08:50 -050011735 assert(_Py_HashSecret_Initialized);
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011736#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011737 if (_PyUnicode_HASH(self) != -1)
11738 return _PyUnicode_HASH(self);
11739 if (PyUnicode_READY(self) == -1)
11740 return -1;
animalizea1d14252019-01-02 20:16:06 +080011741
Christian Heimes985ecdc2013-11-20 11:46:18 +010011742 x = _Py_HashBytes(PyUnicode_DATA(self),
11743 PyUnicode_GET_LENGTH(self) * PyUnicode_KIND(self));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011744 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +000011745 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011746}
11747
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011748PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011749 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011750\n\
oldkaa0735f2018-02-02 16:52:55 +080011751Return the lowest index in S where substring sub is found,\n\
Lisa Roach43ba8862017-04-04 22:36:22 -070011752such that sub is contained within S[start:end]. Optional\n\
11753arguments start and end are interpreted as in slice notation.\n\
11754\n\
11755Raises ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011756
11757static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011758unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011759{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011760 /* initialize variables to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +000011761 Py_ssize_t result;
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011762 PyObject *substring = NULL;
11763 Py_ssize_t start = 0;
11764 Py_ssize_t end = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011765
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011766 if (!parse_args_finds_unicode("index", args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011767 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011768
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011769 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011770 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011771
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011772 result = any_find_slice(self, substring, start, end, 1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011773
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011774 if (result == -2)
11775 return NULL;
11776
Guido van Rossumd57fd912000-03-10 22:53:23 +000011777 if (result < 0) {
11778 PyErr_SetString(PyExc_ValueError, "substring not found");
11779 return NULL;
11780 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011781
Christian Heimes217cfd12007-12-02 14:31:20 +000011782 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011783}
11784
INADA Naoki3ae20562017-01-16 20:41:20 +090011785/*[clinic input]
INADA Naokia49ac992018-01-27 14:06:21 +090011786str.isascii as unicode_isascii
11787
11788Return True if all characters in the string are ASCII, False otherwise.
11789
11790ASCII characters have code points in the range U+0000-U+007F.
11791Empty string is ASCII too.
11792[clinic start generated code]*/
11793
11794static PyObject *
11795unicode_isascii_impl(PyObject *self)
11796/*[clinic end generated code: output=c5910d64b5a8003f input=5a43cbc6399621d5]*/
11797{
11798 if (PyUnicode_READY(self) == -1) {
11799 return NULL;
11800 }
11801 return PyBool_FromLong(PyUnicode_IS_ASCII(self));
11802}
11803
11804/*[clinic input]
INADA Naoki3ae20562017-01-16 20:41:20 +090011805str.islower as unicode_islower
Guido van Rossumd57fd912000-03-10 22:53:23 +000011806
INADA Naoki3ae20562017-01-16 20:41:20 +090011807Return True if the string is a lowercase string, False otherwise.
11808
11809A string is lowercase if all cased characters in the string are lowercase and
11810there is at least one cased character in the string.
11811[clinic start generated code]*/
11812
11813static PyObject *
11814unicode_islower_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011815/*[clinic end generated code: output=dbd41995bd005b81 input=acec65ac6821ae47]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011816{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011817 Py_ssize_t i, length;
11818 int kind;
11819 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011820 int cased;
11821
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011822 if (PyUnicode_READY(self) == -1)
11823 return NULL;
11824 length = PyUnicode_GET_LENGTH(self);
11825 kind = PyUnicode_KIND(self);
11826 data = PyUnicode_DATA(self);
11827
Guido van Rossumd57fd912000-03-10 22:53:23 +000011828 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011829 if (length == 1)
11830 return PyBool_FromLong(
11831 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011832
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011833 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011834 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011835 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011836
Guido van Rossumd57fd912000-03-10 22:53:23 +000011837 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011838 for (i = 0; i < length; i++) {
11839 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011840
Benjamin Peterson29060642009-01-31 22:14:21 +000011841 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011842 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000011843 else if (!cased && Py_UNICODE_ISLOWER(ch))
11844 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011845 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011846 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011847}
11848
INADA Naoki3ae20562017-01-16 20:41:20 +090011849/*[clinic input]
11850str.isupper as unicode_isupper
Guido van Rossumd57fd912000-03-10 22:53:23 +000011851
INADA Naoki3ae20562017-01-16 20:41:20 +090011852Return True if the string is an uppercase string, False otherwise.
11853
11854A string is uppercase if all cased characters in the string are uppercase and
11855there is at least one cased character in the string.
11856[clinic start generated code]*/
11857
11858static PyObject *
11859unicode_isupper_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011860/*[clinic end generated code: output=049209c8e7f15f59 input=e9b1feda5d17f2d3]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011861{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011862 Py_ssize_t i, length;
11863 int kind;
11864 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011865 int cased;
11866
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011867 if (PyUnicode_READY(self) == -1)
11868 return NULL;
11869 length = PyUnicode_GET_LENGTH(self);
11870 kind = PyUnicode_KIND(self);
11871 data = PyUnicode_DATA(self);
11872
Guido van Rossumd57fd912000-03-10 22:53:23 +000011873 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011874 if (length == 1)
11875 return PyBool_FromLong(
11876 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011877
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011878 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011879 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011880 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011881
Guido van Rossumd57fd912000-03-10 22:53:23 +000011882 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011883 for (i = 0; i < length; i++) {
11884 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011885
Benjamin Peterson29060642009-01-31 22:14:21 +000011886 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011887 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000011888 else if (!cased && Py_UNICODE_ISUPPER(ch))
11889 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011890 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011891 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011892}
11893
INADA Naoki3ae20562017-01-16 20:41:20 +090011894/*[clinic input]
11895str.istitle as unicode_istitle
Guido van Rossumd57fd912000-03-10 22:53:23 +000011896
INADA Naoki3ae20562017-01-16 20:41:20 +090011897Return True if the string is a title-cased string, False otherwise.
11898
11899In a title-cased string, upper- and title-case characters may only
11900follow uncased characters and lowercase characters only cased ones.
11901[clinic start generated code]*/
11902
11903static PyObject *
11904unicode_istitle_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011905/*[clinic end generated code: output=e9bf6eb91f5d3f0e input=98d32bd2e1f06f8c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011906{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011907 Py_ssize_t i, length;
11908 int kind;
11909 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011910 int cased, previous_is_cased;
11911
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011912 if (PyUnicode_READY(self) == -1)
11913 return NULL;
11914 length = PyUnicode_GET_LENGTH(self);
11915 kind = PyUnicode_KIND(self);
11916 data = PyUnicode_DATA(self);
11917
Guido van Rossumd57fd912000-03-10 22:53:23 +000011918 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011919 if (length == 1) {
11920 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11921 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
11922 (Py_UNICODE_ISUPPER(ch) != 0));
11923 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011924
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011925 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011926 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011927 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011928
Guido van Rossumd57fd912000-03-10 22:53:23 +000011929 cased = 0;
11930 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011931 for (i = 0; i < length; i++) {
11932 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011933
Benjamin Peterson29060642009-01-31 22:14:21 +000011934 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
11935 if (previous_is_cased)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011936 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000011937 previous_is_cased = 1;
11938 cased = 1;
11939 }
11940 else if (Py_UNICODE_ISLOWER(ch)) {
11941 if (!previous_is_cased)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011942 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000011943 previous_is_cased = 1;
11944 cased = 1;
11945 }
11946 else
11947 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011948 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011949 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011950}
11951
INADA Naoki3ae20562017-01-16 20:41:20 +090011952/*[clinic input]
11953str.isspace as unicode_isspace
Guido van Rossumd57fd912000-03-10 22:53:23 +000011954
INADA Naoki3ae20562017-01-16 20:41:20 +090011955Return True if the string is a whitespace string, False otherwise.
11956
11957A string is whitespace if all characters in the string are whitespace and there
11958is at least one character in the string.
11959[clinic start generated code]*/
11960
11961static PyObject *
11962unicode_isspace_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011963/*[clinic end generated code: output=163a63bfa08ac2b9 input=fe462cb74f8437d8]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011964{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011965 Py_ssize_t i, length;
11966 int kind;
11967 void *data;
11968
11969 if (PyUnicode_READY(self) == -1)
11970 return NULL;
11971 length = PyUnicode_GET_LENGTH(self);
11972 kind = PyUnicode_KIND(self);
11973 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011974
Guido van Rossumd57fd912000-03-10 22:53:23 +000011975 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011976 if (length == 1)
11977 return PyBool_FromLong(
11978 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011979
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011980 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011981 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011982 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011983
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011984 for (i = 0; i < length; i++) {
11985 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011986 if (!Py_UNICODE_ISSPACE(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011987 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011988 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011989 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011990}
11991
INADA Naoki3ae20562017-01-16 20:41:20 +090011992/*[clinic input]
11993str.isalpha as unicode_isalpha
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011994
INADA Naoki3ae20562017-01-16 20:41:20 +090011995Return True if the string is an alphabetic string, False otherwise.
11996
11997A string is alphabetic if all characters in the string are alphabetic and there
11998is at least one character in the string.
11999[clinic start generated code]*/
12000
12001static PyObject *
12002unicode_isalpha_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012003/*[clinic end generated code: output=cc81b9ac3883ec4f input=d0fd18a96cbca5eb]*/
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012004{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012005 Py_ssize_t i, length;
12006 int kind;
12007 void *data;
12008
12009 if (PyUnicode_READY(self) == -1)
12010 return NULL;
12011 length = PyUnicode_GET_LENGTH(self);
12012 kind = PyUnicode_KIND(self);
12013 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012014
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012015 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012016 if (length == 1)
12017 return PyBool_FromLong(
12018 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012019
12020 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012021 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012022 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012023
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012024 for (i = 0; i < length; i++) {
12025 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012026 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012027 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012028 Py_RETURN_TRUE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012029}
12030
INADA Naoki3ae20562017-01-16 20:41:20 +090012031/*[clinic input]
12032str.isalnum as unicode_isalnum
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012033
INADA Naoki3ae20562017-01-16 20:41:20 +090012034Return True if the string is an alpha-numeric string, False otherwise.
12035
12036A string is alpha-numeric if all characters in the string are alpha-numeric and
12037there is at least one character in the string.
12038[clinic start generated code]*/
12039
12040static PyObject *
12041unicode_isalnum_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012042/*[clinic end generated code: output=a5a23490ffc3660c input=5c6579bf2e04758c]*/
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012043{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012044 int kind;
12045 void *data;
12046 Py_ssize_t len, i;
12047
12048 if (PyUnicode_READY(self) == -1)
12049 return NULL;
12050
12051 kind = PyUnicode_KIND(self);
12052 data = PyUnicode_DATA(self);
12053 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012054
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012055 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012056 if (len == 1) {
12057 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
12058 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
12059 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012060
12061 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012062 if (len == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012063 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012064
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012065 for (i = 0; i < len; i++) {
12066 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030012067 if (!Py_UNICODE_ISALNUM(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012068 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012069 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012070 Py_RETURN_TRUE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012071}
12072
INADA Naoki3ae20562017-01-16 20:41:20 +090012073/*[clinic input]
12074str.isdecimal as unicode_isdecimal
Guido van Rossumd57fd912000-03-10 22:53:23 +000012075
INADA Naoki3ae20562017-01-16 20:41:20 +090012076Return True if the string is a decimal string, False otherwise.
12077
12078A string is a decimal string if all characters in the string are decimal and
12079there is at least one character in the string.
12080[clinic start generated code]*/
12081
12082static PyObject *
12083unicode_isdecimal_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012084/*[clinic end generated code: output=fb2dcdb62d3fc548 input=336bc97ab4c8268f]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012085{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012086 Py_ssize_t i, length;
12087 int kind;
12088 void *data;
12089
12090 if (PyUnicode_READY(self) == -1)
12091 return NULL;
12092 length = PyUnicode_GET_LENGTH(self);
12093 kind = PyUnicode_KIND(self);
12094 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012095
Guido van Rossumd57fd912000-03-10 22:53:23 +000012096 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012097 if (length == 1)
12098 return PyBool_FromLong(
12099 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012100
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012101 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012102 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012103 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012104
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012105 for (i = 0; i < length; i++) {
12106 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012107 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012108 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012109 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012110}
12111
INADA Naoki3ae20562017-01-16 20:41:20 +090012112/*[clinic input]
12113str.isdigit as unicode_isdigit
Guido van Rossumd57fd912000-03-10 22:53:23 +000012114
INADA Naoki3ae20562017-01-16 20:41:20 +090012115Return True if the string is a digit string, False otherwise.
12116
12117A string is a digit string if all characters in the string are digits and there
12118is at least one character in the string.
12119[clinic start generated code]*/
12120
12121static PyObject *
12122unicode_isdigit_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012123/*[clinic end generated code: output=10a6985311da6858 input=901116c31deeea4c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012124{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012125 Py_ssize_t i, length;
12126 int kind;
12127 void *data;
12128
12129 if (PyUnicode_READY(self) == -1)
12130 return NULL;
12131 length = PyUnicode_GET_LENGTH(self);
12132 kind = PyUnicode_KIND(self);
12133 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012134
Guido van Rossumd57fd912000-03-10 22:53:23 +000012135 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012136 if (length == 1) {
12137 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
12138 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
12139 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012140
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012141 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012142 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012143 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012144
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012145 for (i = 0; i < length; i++) {
12146 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012147 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012148 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012149 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012150}
12151
INADA Naoki3ae20562017-01-16 20:41:20 +090012152/*[clinic input]
12153str.isnumeric as unicode_isnumeric
Guido van Rossumd57fd912000-03-10 22:53:23 +000012154
INADA Naoki3ae20562017-01-16 20:41:20 +090012155Return True if the string is a numeric string, False otherwise.
12156
12157A string is numeric if all characters in the string are numeric and there is at
12158least one character in the string.
12159[clinic start generated code]*/
12160
12161static PyObject *
12162unicode_isnumeric_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012163/*[clinic end generated code: output=9172a32d9013051a input=722507db976f826c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012164{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012165 Py_ssize_t i, length;
12166 int kind;
12167 void *data;
12168
12169 if (PyUnicode_READY(self) == -1)
12170 return NULL;
12171 length = PyUnicode_GET_LENGTH(self);
12172 kind = PyUnicode_KIND(self);
12173 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012174
Guido van Rossumd57fd912000-03-10 22:53:23 +000012175 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012176 if (length == 1)
12177 return PyBool_FromLong(
12178 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012179
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012180 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012181 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012182 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012183
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012184 for (i = 0; i < length; i++) {
12185 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012186 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012187 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012188 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012189}
12190
Martin v. Löwis47383402007-08-15 07:32:56 +000012191int
12192PyUnicode_IsIdentifier(PyObject *self)
12193{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012194 int kind;
12195 void *data;
12196 Py_ssize_t i;
Ezio Melotti93e7afc2011-08-22 14:08:38 +030012197 Py_UCS4 first;
Martin v. Löwis47383402007-08-15 07:32:56 +000012198
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012199 if (PyUnicode_READY(self) == -1) {
12200 Py_FatalError("identifier not ready");
Benjamin Peterson29060642009-01-31 22:14:21 +000012201 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012202 }
12203
12204 /* Special case for empty strings */
12205 if (PyUnicode_GET_LENGTH(self) == 0)
12206 return 0;
12207 kind = PyUnicode_KIND(self);
12208 data = PyUnicode_DATA(self);
Martin v. Löwis47383402007-08-15 07:32:56 +000012209
12210 /* PEP 3131 says that the first character must be in
12211 XID_Start and subsequent characters in XID_Continue,
12212 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000012213 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000012214 letters, digits, underscore). However, given the current
12215 definition of XID_Start and XID_Continue, it is sufficient
12216 to check just for these, except that _ must be allowed
12217 as starting an identifier. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012218 first = PyUnicode_READ(kind, data, 0);
Benjamin Petersonf413b802011-08-12 22:17:18 -050012219 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
Martin v. Löwis47383402007-08-15 07:32:56 +000012220 return 0;
12221
Benjamin Peterson9c6e6a02011-09-28 08:09:05 -040012222 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012223 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000012224 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +000012225 return 1;
12226}
12227
INADA Naoki3ae20562017-01-16 20:41:20 +090012228/*[clinic input]
12229str.isidentifier as unicode_isidentifier
Martin v. Löwis47383402007-08-15 07:32:56 +000012230
INADA Naoki3ae20562017-01-16 20:41:20 +090012231Return True if the string is a valid Python identifier, False otherwise.
12232
Sanyam Khuranaffc5a142018-10-08 12:23:32 +053012233Call keyword.iskeyword(s) to test whether string s is a reserved identifier,
Emanuele Gaifasfc8205c2018-10-08 12:44:47 +020012234such as "def" or "class".
INADA Naoki3ae20562017-01-16 20:41:20 +090012235[clinic start generated code]*/
12236
12237static PyObject *
12238unicode_isidentifier_impl(PyObject *self)
Emanuele Gaifasfc8205c2018-10-08 12:44:47 +020012239/*[clinic end generated code: output=fe585a9666572905 input=2d807a104f21c0c5]*/
Martin v. Löwis47383402007-08-15 07:32:56 +000012240{
12241 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
12242}
12243
INADA Naoki3ae20562017-01-16 20:41:20 +090012244/*[clinic input]
12245str.isprintable as unicode_isprintable
Georg Brandl559e5d72008-06-11 18:37:52 +000012246
INADA Naoki3ae20562017-01-16 20:41:20 +090012247Return True if the string is printable, False otherwise.
12248
12249A string is printable if all of its characters are considered printable in
12250repr() or if it is empty.
12251[clinic start generated code]*/
12252
12253static PyObject *
12254unicode_isprintable_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012255/*[clinic end generated code: output=3ab9626cd32dd1a0 input=98a0e1c2c1813209]*/
Georg Brandl559e5d72008-06-11 18:37:52 +000012256{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012257 Py_ssize_t i, length;
12258 int kind;
12259 void *data;
12260
12261 if (PyUnicode_READY(self) == -1)
12262 return NULL;
12263 length = PyUnicode_GET_LENGTH(self);
12264 kind = PyUnicode_KIND(self);
12265 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000012266
12267 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012268 if (length == 1)
12269 return PyBool_FromLong(
12270 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000012271
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012272 for (i = 0; i < length; i++) {
12273 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000012274 Py_RETURN_FALSE;
12275 }
12276 }
12277 Py_RETURN_TRUE;
12278}
12279
INADA Naoki3ae20562017-01-16 20:41:20 +090012280/*[clinic input]
12281str.join as unicode_join
Guido van Rossumd57fd912000-03-10 22:53:23 +000012282
INADA Naoki3ae20562017-01-16 20:41:20 +090012283 iterable: object
12284 /
12285
12286Concatenate any number of strings.
12287
Martin Panter91a88662017-01-24 00:30:06 +000012288The string whose method is called is inserted in between each given string.
INADA Naoki3ae20562017-01-16 20:41:20 +090012289The result is returned as a new string.
12290
12291Example: '.'.join(['ab', 'pq', 'rs']) -> 'ab.pq.rs'
12292[clinic start generated code]*/
12293
12294static PyObject *
12295unicode_join(PyObject *self, PyObject *iterable)
Martin Panter91a88662017-01-24 00:30:06 +000012296/*[clinic end generated code: output=6857e7cecfe7bf98 input=2f70422bfb8fa189]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012297{
INADA Naoki3ae20562017-01-16 20:41:20 +090012298 return PyUnicode_Join(self, iterable);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012299}
12300
Martin v. Löwis18e16552006-02-15 17:27:45 +000012301static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012302unicode_length(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012303{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012304 if (PyUnicode_READY(self) == -1)
12305 return -1;
12306 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012307}
12308
INADA Naoki3ae20562017-01-16 20:41:20 +090012309/*[clinic input]
12310str.ljust as unicode_ljust
12311
12312 width: Py_ssize_t
12313 fillchar: Py_UCS4 = ' '
12314 /
12315
12316Return a left-justified string of length width.
12317
12318Padding is done using the specified fill character (default is a space).
12319[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012320
12321static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012322unicode_ljust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
12323/*[clinic end generated code: output=1cce0e0e0a0b84b3 input=3ab599e335e60a32]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012324{
Benjamin Petersonbac79492012-01-14 13:34:47 -050012325 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012326 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012327
Victor Stinnerc4b49542011-12-11 22:44:26 +010012328 if (PyUnicode_GET_LENGTH(self) >= width)
12329 return unicode_result_unchanged(self);
12330
12331 return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012332}
12333
INADA Naoki3ae20562017-01-16 20:41:20 +090012334/*[clinic input]
12335str.lower as unicode_lower
Guido van Rossumd57fd912000-03-10 22:53:23 +000012336
INADA Naoki3ae20562017-01-16 20:41:20 +090012337Return a copy of the string converted to lowercase.
12338[clinic start generated code]*/
12339
12340static PyObject *
12341unicode_lower_impl(PyObject *self)
12342/*[clinic end generated code: output=84ef9ed42efad663 input=60a2984b8beff23a]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012343{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050012344 if (PyUnicode_READY(self) == -1)
12345 return NULL;
12346 if (PyUnicode_IS_ASCII(self))
12347 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010012348 return case_operation(self, do_lower);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012349}
12350
/* Selector values passed as `striptype` to the strip helpers below. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Method names indexed by the selector values above.  Both the strings
   and the pointer table itself are read-only. */
static const char * const stripfuncnames[] = {"lstrip", "rstrip", "strip"};

/* Name of the str method corresponding to selector `i`; `i` must be one
   of LEFTSTRIP, RIGHTSTRIP or BOTHSTRIP (no bounds check is performed). */
#define STRIPNAME(i) (stripfuncnames[i])
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012359
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012360/* externally visible for str.strip(unicode) */
12361PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012362_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012363{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012364 void *data;
12365 int kind;
12366 Py_ssize_t i, j, len;
12367 BLOOM_MASK sepmask;
Victor Stinnerb3a60142013-04-09 22:19:21 +020012368 Py_ssize_t seplen;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012369
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012370 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
12371 return NULL;
12372
12373 kind = PyUnicode_KIND(self);
12374 data = PyUnicode_DATA(self);
12375 len = PyUnicode_GET_LENGTH(self);
Victor Stinnerb3a60142013-04-09 22:19:21 +020012376 seplen = PyUnicode_GET_LENGTH(sepobj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012377 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
12378 PyUnicode_DATA(sepobj),
Victor Stinnerb3a60142013-04-09 22:19:21 +020012379 seplen);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012380
Benjamin Peterson14339b62009-01-31 16:36:08 +000012381 i = 0;
12382 if (striptype != RIGHTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020012383 while (i < len) {
12384 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12385 if (!BLOOM(sepmask, ch))
12386 break;
12387 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12388 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000012389 i++;
12390 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012391 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012392
Benjamin Peterson14339b62009-01-31 16:36:08 +000012393 j = len;
12394 if (striptype != LEFTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020012395 j--;
12396 while (j >= i) {
12397 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12398 if (!BLOOM(sepmask, ch))
12399 break;
12400 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12401 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000012402 j--;
Victor Stinnerb3a60142013-04-09 22:19:21 +020012403 }
12404
Benjamin Peterson29060642009-01-31 22:14:21 +000012405 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012406 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012407
Victor Stinner7931d9a2011-11-04 00:22:48 +010012408 return PyUnicode_Substring(self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012409}
12410
12411PyObject*
12412PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
12413{
12414 unsigned char *data;
12415 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020012416 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012417
Victor Stinnerde636f32011-10-01 03:55:54 +020012418 if (PyUnicode_READY(self) == -1)
12419 return NULL;
12420
Victor Stinner684d5fd2012-05-03 02:32:34 +020012421 length = PyUnicode_GET_LENGTH(self);
12422 end = Py_MIN(end, length);
Victor Stinnerde636f32011-10-01 03:55:54 +020012423
Victor Stinner684d5fd2012-05-03 02:32:34 +020012424 if (start == 0 && end == length)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012425 return unicode_result_unchanged(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012426
Victor Stinnerde636f32011-10-01 03:55:54 +020012427 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020012428 PyErr_SetString(PyExc_IndexError, "string index out of range");
12429 return NULL;
12430 }
Serhiy Storchaka678db842013-01-26 12:16:36 +020012431 if (start >= length || end < start)
12432 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner12bab6d2011-10-01 01:53:49 +020012433
Victor Stinner684d5fd2012-05-03 02:32:34 +020012434 length = end - start;
Victor Stinnerb9275c12011-10-05 14:01:42 +020012435 if (PyUnicode_IS_ASCII(self)) {
Victor Stinnerb9275c12011-10-05 14:01:42 +020012436 data = PyUnicode_1BYTE_DATA(self);
Victor Stinnerd3f08822012-05-29 12:57:52 +020012437 return _PyUnicode_FromASCII((char*)(data + start), length);
Victor Stinnerb9275c12011-10-05 14:01:42 +020012438 }
12439 else {
12440 kind = PyUnicode_KIND(self);
12441 data = PyUnicode_1BYTE_DATA(self);
12442 return PyUnicode_FromKindAndData(kind,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012443 data + kind * start,
Victor Stinnerb9275c12011-10-05 14:01:42 +020012444 length);
12445 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012446}
Guido van Rossumd57fd912000-03-10 22:53:23 +000012447
12448static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012449do_strip(PyObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012450{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012451 Py_ssize_t len, i, j;
12452
12453 if (PyUnicode_READY(self) == -1)
12454 return NULL;
12455
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012456 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012457
Victor Stinnercc7af722013-04-09 22:39:24 +020012458 if (PyUnicode_IS_ASCII(self)) {
12459 Py_UCS1 *data = PyUnicode_1BYTE_DATA(self);
12460
12461 i = 0;
12462 if (striptype != RIGHTSTRIP) {
12463 while (i < len) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020012464 Py_UCS1 ch = data[i];
Victor Stinnercc7af722013-04-09 22:39:24 +020012465 if (!_Py_ascii_whitespace[ch])
12466 break;
12467 i++;
12468 }
12469 }
12470
12471 j = len;
12472 if (striptype != LEFTSTRIP) {
12473 j--;
12474 while (j >= i) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020012475 Py_UCS1 ch = data[j];
Victor Stinnercc7af722013-04-09 22:39:24 +020012476 if (!_Py_ascii_whitespace[ch])
12477 break;
12478 j--;
12479 }
12480 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012481 }
12482 }
Victor Stinnercc7af722013-04-09 22:39:24 +020012483 else {
12484 int kind = PyUnicode_KIND(self);
12485 void *data = PyUnicode_DATA(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012486
Victor Stinnercc7af722013-04-09 22:39:24 +020012487 i = 0;
12488 if (striptype != RIGHTSTRIP) {
12489 while (i < len) {
12490 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12491 if (!Py_UNICODE_ISSPACE(ch))
12492 break;
12493 i++;
12494 }
Victor Stinner9c79e412013-04-09 22:21:08 +020012495 }
Victor Stinnercc7af722013-04-09 22:39:24 +020012496
12497 j = len;
12498 if (striptype != LEFTSTRIP) {
12499 j--;
12500 while (j >= i) {
12501 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12502 if (!Py_UNICODE_ISSPACE(ch))
12503 break;
12504 j--;
12505 }
12506 j++;
12507 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012508 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012509
Victor Stinner7931d9a2011-11-04 00:22:48 +010012510 return PyUnicode_Substring(self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012511}
12512
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012513
12514static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012515do_argstrip(PyObject *self, int striptype, PyObject *sep)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012516{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012517 if (sep != NULL && sep != Py_None) {
12518 if (PyUnicode_Check(sep))
12519 return _PyUnicode_XStrip(self, striptype, sep);
12520 else {
12521 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000012522 "%s arg must be None or str",
12523 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000012524 return NULL;
12525 }
12526 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012527
Benjamin Peterson14339b62009-01-31 16:36:08 +000012528 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012529}
12530
12531
INADA Naoki3ae20562017-01-16 20:41:20 +090012532/*[clinic input]
12533str.strip as unicode_strip
12534
12535 chars: object = None
12536 /
12537
Return a copy of the string with leading and trailing whitespace removed.
INADA Naoki3ae20562017-01-16 20:41:20 +090012539
12540If chars is given and not None, remove characters in chars instead.
12541[clinic start generated code]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012542
12543static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012544unicode_strip_impl(PyObject *self, PyObject *chars)
Victor Stinner0c4a8282017-01-17 02:21:47 +010012545/*[clinic end generated code: output=ca19018454345d57 input=eefe24a1059c352b]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012546{
INADA Naoki3ae20562017-01-16 20:41:20 +090012547 return do_argstrip(self, BOTHSTRIP, chars);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012548}
12549
12550
INADA Naoki3ae20562017-01-16 20:41:20 +090012551/*[clinic input]
12552str.lstrip as unicode_lstrip
12553
12554 chars: object = NULL
12555 /
12556
12557Return a copy of the string with leading whitespace removed.
12558
12559If chars is given and not None, remove characters in chars instead.
12560[clinic start generated code]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012561
12562static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012563unicode_lstrip_impl(PyObject *self, PyObject *chars)
12564/*[clinic end generated code: output=3b43683251f79ca7 input=9e56f3c45f5ff4c3]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012565{
INADA Naoki3ae20562017-01-16 20:41:20 +090012566 return do_argstrip(self, LEFTSTRIP, chars);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012567}
12568
12569
INADA Naoki3ae20562017-01-16 20:41:20 +090012570/*[clinic input]
12571str.rstrip as unicode_rstrip
12572
12573 chars: object = NULL
12574 /
12575
12576Return a copy of the string with trailing whitespace removed.
12577
12578If chars is given and not None, remove characters in chars instead.
12579[clinic start generated code]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012580
12581static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012582unicode_rstrip_impl(PyObject *self, PyObject *chars)
12583/*[clinic end generated code: output=4a59230017cc3b7a input=ac89d0219cb411ee]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012584{
INADA Naoki3ae20562017-01-16 20:41:20 +090012585 return do_argstrip(self, RIGHTSTRIP, chars);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012586}
12587
12588
Guido van Rossumd57fd912000-03-10 22:53:23 +000012589static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012590unicode_repeat(PyObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012591{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012592 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012593 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012594
Serhiy Storchaka05997252013-01-26 12:14:02 +020012595 if (len < 1)
12596 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +000012597
Victor Stinnerc4b49542011-12-11 22:44:26 +010012598 /* no repeat, return original string */
12599 if (len == 1)
12600 return unicode_result_unchanged(str);
Tim Peters8f422462000-09-09 06:13:41 +000012601
Benjamin Petersonbac79492012-01-14 13:34:47 -050012602 if (PyUnicode_READY(str) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012603 return NULL;
12604
Victor Stinnerc759f3e2011-10-01 03:09:58 +020012605 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020012606 PyErr_SetString(PyExc_OverflowError,
12607 "repeated string is too long");
12608 return NULL;
12609 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012610 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012611
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012612 u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012613 if (!u)
12614 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012615 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012616
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012617 if (PyUnicode_GET_LENGTH(str) == 1) {
12618 const int kind = PyUnicode_KIND(str);
12619 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
Victor Stinner73f53b52011-12-18 03:26:31 +010012620 if (kind == PyUnicode_1BYTE_KIND) {
12621 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012622 memset(to, (unsigned char)fill_char, len);
Victor Stinner73f53b52011-12-18 03:26:31 +010012623 }
12624 else if (kind == PyUnicode_2BYTE_KIND) {
12625 Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012626 for (n = 0; n < len; ++n)
Victor Stinner73f53b52011-12-18 03:26:31 +010012627 ucs2[n] = fill_char;
12628 } else {
12629 Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u);
12630 assert(kind == PyUnicode_4BYTE_KIND);
12631 for (n = 0; n < len; ++n)
12632 ucs4[n] = fill_char;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012633 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012634 }
12635 else {
12636 /* number of characters copied this far */
12637 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012638 const Py_ssize_t char_size = PyUnicode_KIND(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012639 char *to = (char *) PyUnicode_DATA(u);
Christian Heimesf051e432016-09-13 20:22:02 +020012640 memcpy(to, PyUnicode_DATA(str),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012641 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000012642 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012643 n = (done <= nchars-done) ? done : nchars-done;
Christian Heimesf051e432016-09-13 20:22:02 +020012644 memcpy(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012645 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000012646 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012647 }
12648
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012649 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012650 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012651}
12652
Alexander Belopolsky40018472011-02-26 01:02:56 +000012653PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012654PyUnicode_Replace(PyObject *str,
12655 PyObject *substr,
12656 PyObject *replstr,
Alexander Belopolsky40018472011-02-26 01:02:56 +000012657 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012658{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012659 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0 ||
12660 ensure_unicode(replstr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012661 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012662 return replace(str, substr, replstr, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012663}
12664
INADA Naoki3ae20562017-01-16 20:41:20 +090012665/*[clinic input]
12666str.replace as unicode_replace
Guido van Rossumd57fd912000-03-10 22:53:23 +000012667
INADA Naoki3ae20562017-01-16 20:41:20 +090012668 old: unicode
12669 new: unicode
12670 count: Py_ssize_t = -1
12671 Maximum number of occurrences to replace.
12672 -1 (the default value) means replace all occurrences.
12673 /
12674
12675Return a copy with all occurrences of substring old replaced by new.
12676
12677If the optional argument count is given, only the first count occurrences are
12678replaced.
12679[clinic start generated code]*/
12680
12681static PyObject *
12682unicode_replace_impl(PyObject *self, PyObject *old, PyObject *new,
12683 Py_ssize_t count)
12684/*[clinic end generated code: output=b63f1a8b5eebf448 input=147d12206276ebeb]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012685{
Benjamin Peterson22a29702012-01-02 09:00:30 -060012686 if (PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000012687 return NULL;
INADA Naoki3ae20562017-01-16 20:41:20 +090012688 return replace(self, old, new, count);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012689}
12690
Alexander Belopolsky40018472011-02-26 01:02:56 +000012691static PyObject *
12692unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012693{
Walter Dörwald79e913e2007-05-12 11:08:06 +000012694 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012695 Py_ssize_t isize;
12696 Py_ssize_t osize, squote, dquote, i, o;
12697 Py_UCS4 max, quote;
Victor Stinner55c08782013-04-14 18:45:39 +020012698 int ikind, okind, unchanged;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012699 void *idata, *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000012700
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012701 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000012702 return NULL;
12703
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012704 isize = PyUnicode_GET_LENGTH(unicode);
12705 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012706
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012707 /* Compute length of output, quote characters, and
12708 maximum character */
Victor Stinner55c08782013-04-14 18:45:39 +020012709 osize = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012710 max = 127;
12711 squote = dquote = 0;
12712 ikind = PyUnicode_KIND(unicode);
12713 for (i = 0; i < isize; i++) {
12714 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Benjamin Peterson736b8012014-09-29 23:02:15 -040012715 Py_ssize_t incr = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012716 switch (ch) {
Benjamin Peterson736b8012014-09-29 23:02:15 -040012717 case '\'': squote++; break;
12718 case '"': dquote++; break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012719 case '\\': case '\t': case '\r': case '\n':
Benjamin Peterson736b8012014-09-29 23:02:15 -040012720 incr = 2;
12721 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012722 default:
12723 /* Fast-path ASCII */
12724 if (ch < ' ' || ch == 0x7f)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012725 incr = 4; /* \xHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012726 else if (ch < 0x7f)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012727 ;
12728 else if (Py_UNICODE_ISPRINTABLE(ch))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012729 max = ch > max ? ch : max;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012730 else if (ch < 0x100)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012731 incr = 4; /* \xHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012732 else if (ch < 0x10000)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012733 incr = 6; /* \uHHHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012734 else
Benjamin Peterson736b8012014-09-29 23:02:15 -040012735 incr = 10; /* \uHHHHHHHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012736 }
Benjamin Peterson736b8012014-09-29 23:02:15 -040012737 if (osize > PY_SSIZE_T_MAX - incr) {
12738 PyErr_SetString(PyExc_OverflowError,
12739 "string is too long to generate repr");
12740 return NULL;
12741 }
12742 osize += incr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012743 }
12744
12745 quote = '\'';
Victor Stinner55c08782013-04-14 18:45:39 +020012746 unchanged = (osize == isize);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012747 if (squote) {
Victor Stinner55c08782013-04-14 18:45:39 +020012748 unchanged = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012749 if (dquote)
12750 /* Both squote and dquote present. Use squote,
12751 and escape them */
12752 osize += squote;
12753 else
12754 quote = '"';
12755 }
Victor Stinner55c08782013-04-14 18:45:39 +020012756 osize += 2; /* quotes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012757
12758 repr = PyUnicode_New(osize, max);
12759 if (repr == NULL)
12760 return NULL;
12761 okind = PyUnicode_KIND(repr);
12762 odata = PyUnicode_DATA(repr);
12763
12764 PyUnicode_WRITE(okind, odata, 0, quote);
12765 PyUnicode_WRITE(okind, odata, osize-1, quote);
Victor Stinner55c08782013-04-14 18:45:39 +020012766 if (unchanged) {
12767 _PyUnicode_FastCopyCharacters(repr, 1,
12768 unicode, 0,
12769 isize);
12770 }
12771 else {
12772 for (i = 0, o = 1; i < isize; i++) {
12773 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012774
Victor Stinner55c08782013-04-14 18:45:39 +020012775 /* Escape quotes and backslashes */
12776 if ((ch == quote) || (ch == '\\')) {
Kristján Valur Jónsson55e5dc82012-06-06 21:58:08 +000012777 PyUnicode_WRITE(okind, odata, o++, '\\');
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012778 PyUnicode_WRITE(okind, odata, o++, ch);
Victor Stinner55c08782013-04-14 18:45:39 +020012779 continue;
12780 }
12781
12782 /* Map special whitespace to '\t', \n', '\r' */
12783 if (ch == '\t') {
12784 PyUnicode_WRITE(okind, odata, o++, '\\');
12785 PyUnicode_WRITE(okind, odata, o++, 't');
12786 }
12787 else if (ch == '\n') {
12788 PyUnicode_WRITE(okind, odata, o++, '\\');
12789 PyUnicode_WRITE(okind, odata, o++, 'n');
12790 }
12791 else if (ch == '\r') {
12792 PyUnicode_WRITE(okind, odata, o++, '\\');
12793 PyUnicode_WRITE(okind, odata, o++, 'r');
12794 }
12795
12796 /* Map non-printable US ASCII to '\xhh' */
12797 else if (ch < ' ' || ch == 0x7F) {
12798 PyUnicode_WRITE(okind, odata, o++, '\\');
12799 PyUnicode_WRITE(okind, odata, o++, 'x');
12800 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12801 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12802 }
12803
12804 /* Copy ASCII characters as-is */
12805 else if (ch < 0x7F) {
12806 PyUnicode_WRITE(okind, odata, o++, ch);
12807 }
12808
12809 /* Non-ASCII characters */
12810 else {
12811 /* Map Unicode whitespace and control characters
12812 (categories Z* and C* except ASCII space)
12813 */
12814 if (!Py_UNICODE_ISPRINTABLE(ch)) {
12815 PyUnicode_WRITE(okind, odata, o++, '\\');
12816 /* Map 8-bit characters to '\xhh' */
12817 if (ch <= 0xff) {
12818 PyUnicode_WRITE(okind, odata, o++, 'x');
12819 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12820 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12821 }
12822 /* Map 16-bit characters to '\uxxxx' */
12823 else if (ch <= 0xffff) {
12824 PyUnicode_WRITE(okind, odata, o++, 'u');
12825 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12826 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12827 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12828 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12829 }
12830 /* Map 21-bit characters to '\U00xxxxxx' */
12831 else {
12832 PyUnicode_WRITE(okind, odata, o++, 'U');
12833 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
12834 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
12835 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
12836 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
12837 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12838 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12839 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12840 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12841 }
12842 }
12843 /* Copy characters as-is */
12844 else {
12845 PyUnicode_WRITE(okind, odata, o++, ch);
12846 }
Georg Brandl559e5d72008-06-11 18:37:52 +000012847 }
12848 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000012849 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012850 /* Closing quote already added at the beginning */
Victor Stinner05d11892011-10-06 01:13:58 +020012851 assert(_PyUnicode_CheckConsistency(repr, 1));
Walter Dörwald79e913e2007-05-12 11:08:06 +000012852 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012853}
12854
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012855PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012856 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012857\n\
12858Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080012859such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012860arguments start and end are interpreted as in slice notation.\n\
12861\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012862Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012863
12864static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012865unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012866{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010012867 /* initialize variables to prevent gcc warning */
12868 PyObject *substring = NULL;
12869 Py_ssize_t start = 0;
12870 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012871 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012872
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030012873 if (!parse_args_finds_unicode("rfind", args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012874 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012875
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012876 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012877 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012878
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012879 result = any_find_slice(self, substring, start, end, -1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012880
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012881 if (result == -2)
12882 return NULL;
12883
Christian Heimes217cfd12007-12-02 14:31:20 +000012884 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012885}
12886
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012887PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012888 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012889\n\
Lisa Roach43ba8862017-04-04 22:36:22 -070012890Return the highest index in S where substring sub is found,\n\
12891such that sub is contained within S[start:end]. Optional\n\
12892arguments start and end are interpreted as in slice notation.\n\
12893\n\
12894Raises ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012895
12896static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012897unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012898{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010012899 /* initialize variables to prevent gcc warning */
12900 PyObject *substring = NULL;
12901 Py_ssize_t start = 0;
12902 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012903 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012904
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030012905 if (!parse_args_finds_unicode("rindex", args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012906 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012907
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012908 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012909 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012910
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012911 result = any_find_slice(self, substring, start, end, -1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012912
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012913 if (result == -2)
12914 return NULL;
12915
Guido van Rossumd57fd912000-03-10 22:53:23 +000012916 if (result < 0) {
12917 PyErr_SetString(PyExc_ValueError, "substring not found");
12918 return NULL;
12919 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012920
Christian Heimes217cfd12007-12-02 14:31:20 +000012921 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012922}
12923
INADA Naoki3ae20562017-01-16 20:41:20 +090012924/*[clinic input]
12925str.rjust as unicode_rjust
12926
12927 width: Py_ssize_t
12928 fillchar: Py_UCS4 = ' '
12929 /
12930
12931Return a right-justified string of length width.
12932
12933Padding is done using the specified fill character (default is a space).
12934[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012935
12936static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012937unicode_rjust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
12938/*[clinic end generated code: output=804a1a57fbe8d5cf input=d05f550b5beb1f72]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012939{
Benjamin Petersonbac79492012-01-14 13:34:47 -050012940 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012941 return NULL;
12942
Victor Stinnerc4b49542011-12-11 22:44:26 +010012943 if (PyUnicode_GET_LENGTH(self) >= width)
12944 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012945
Victor Stinnerc4b49542011-12-11 22:44:26 +010012946 return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012947}
12948
Alexander Belopolsky40018472011-02-26 01:02:56 +000012949PyObject *
12950PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012951{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012952 if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012953 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012954
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012955 return split(s, sep, maxsplit);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012956}
12957
INADA Naoki3ae20562017-01-16 20:41:20 +090012958/*[clinic input]
12959str.split as unicode_split
Guido van Rossumd57fd912000-03-10 22:53:23 +000012960
INADA Naoki3ae20562017-01-16 20:41:20 +090012961 sep: object = None
        The delimiter according to which to split the string.
12963 None (the default value) means split according to any whitespace,
12964 and discard empty strings from the result.
12965 maxsplit: Py_ssize_t = -1
12966 Maximum number of splits to do.
12967 -1 (the default value) means no limit.
12968
12969Return a list of the words in the string, using sep as the delimiter string.
12970[clinic start generated code]*/
12971
12972static PyObject *
12973unicode_split_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
12974/*[clinic end generated code: output=3a65b1db356948dc input=606e750488a82359]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012975{
INADA Naoki3ae20562017-01-16 20:41:20 +090012976 if (sep == Py_None)
12977 return split(self, NULL, maxsplit);
12978 if (PyUnicode_Check(sep))
12979 return split(self, sep, maxsplit);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012980
Victor Stinner998b8062018-09-12 00:23:25 +020012981 PyErr_Format(PyExc_TypeError,
12982 "must be str or None, not %.100s",
12983 Py_TYPE(sep)->tp_name);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012984 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012985}
12986
Thomas Wouters477c8d52006-05-27 19:21:47 +000012987PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012988PyUnicode_Partition(PyObject *str_obj, PyObject *sep_obj)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012989{
Thomas Wouters477c8d52006-05-27 19:21:47 +000012990 PyObject* out;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012991 int kind1, kind2;
12992 void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012993 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012994
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012995 if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012996 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012997
Victor Stinner14f8f022011-10-05 20:58:25 +020012998 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012999 kind2 = PyUnicode_KIND(sep_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013000 len1 = PyUnicode_GET_LENGTH(str_obj);
13001 len2 = PyUnicode_GET_LENGTH(sep_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013002 if (kind1 < kind2 || len1 < len2) {
13003 _Py_INCREF_UNICODE_EMPTY();
13004 if (!unicode_empty)
13005 out = NULL;
13006 else {
13007 out = PyTuple_Pack(3, str_obj, unicode_empty, unicode_empty);
13008 Py_DECREF(unicode_empty);
13009 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013010 return out;
13011 }
13012 buf1 = PyUnicode_DATA(str_obj);
13013 buf2 = PyUnicode_DATA(sep_obj);
13014 if (kind2 != kind1) {
13015 buf2 = _PyUnicode_AsKind(sep_obj, kind1);
13016 if (!buf2)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013017 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013018 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013019
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013020 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013021 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020013022 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
13023 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
13024 else
13025 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013026 break;
13027 case PyUnicode_2BYTE_KIND:
13028 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
13029 break;
13030 case PyUnicode_4BYTE_KIND:
13031 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
13032 break;
13033 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070013034 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013035 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000013036
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013037 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013038 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013039
13040 return out;
13041}
13042
13043
13044PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013045PyUnicode_RPartition(PyObject *str_obj, PyObject *sep_obj)
Thomas Wouters477c8d52006-05-27 19:21:47 +000013046{
Thomas Wouters477c8d52006-05-27 19:21:47 +000013047 PyObject* out;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013048 int kind1, kind2;
13049 void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013050 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000013051
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013052 if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013053 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000013054
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013055 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013056 kind2 = PyUnicode_KIND(sep_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013057 len1 = PyUnicode_GET_LENGTH(str_obj);
13058 len2 = PyUnicode_GET_LENGTH(sep_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013059 if (kind1 < kind2 || len1 < len2) {
13060 _Py_INCREF_UNICODE_EMPTY();
13061 if (!unicode_empty)
13062 out = NULL;
13063 else {
13064 out = PyTuple_Pack(3, unicode_empty, unicode_empty, str_obj);
13065 Py_DECREF(unicode_empty);
13066 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013067 return out;
13068 }
13069 buf1 = PyUnicode_DATA(str_obj);
13070 buf2 = PyUnicode_DATA(sep_obj);
13071 if (kind2 != kind1) {
13072 buf2 = _PyUnicode_AsKind(sep_obj, kind1);
13073 if (!buf2)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013074 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013075 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013076
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013077 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013078 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020013079 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
13080 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13081 else
13082 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013083 break;
13084 case PyUnicode_2BYTE_KIND:
13085 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13086 break;
13087 case PyUnicode_4BYTE_KIND:
13088 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13089 break;
13090 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070013091 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013092 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000013093
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013094 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013095 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013096
13097 return out;
13098}
13099
INADA Naoki3ae20562017-01-16 20:41:20 +090013100/*[clinic input]
13101str.partition as unicode_partition
Thomas Wouters477c8d52006-05-27 19:21:47 +000013102
INADA Naoki3ae20562017-01-16 20:41:20 +090013103 sep: object
13104 /
13105
13106Partition the string into three parts using the given separator.
13107
13108This will search for the separator in the string. If the separator is found,
13109returns a 3-tuple containing the part before the separator, the separator
13110itself, and the part after it.
13111
13112If the separator is not found, returns a 3-tuple containing the original string
13113and two empty strings.
13114[clinic start generated code]*/
13115
13116static PyObject *
13117unicode_partition(PyObject *self, PyObject *sep)
13118/*[clinic end generated code: output=e4ced7bd253ca3c4 input=f29b8d06c63e50be]*/
Thomas Wouters477c8d52006-05-27 19:21:47 +000013119{
INADA Naoki3ae20562017-01-16 20:41:20 +090013120 return PyUnicode_Partition(self, sep);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013121}
13122
INADA Naoki3ae20562017-01-16 20:41:20 +090013123/*[clinic input]
13124str.rpartition as unicode_rpartition = str.partition
Thomas Wouters477c8d52006-05-27 19:21:47 +000013125
INADA Naoki3ae20562017-01-16 20:41:20 +090013126Partition the string into three parts using the given separator.
13127
Serhiy Storchakaa2314282017-10-29 02:11:54 +030013128This will search for the separator in the string, starting at the end. If
INADA Naoki3ae20562017-01-16 20:41:20 +090013129the separator is found, returns a 3-tuple containing the part before the
13130separator, the separator itself, and the part after it.
13131
13132If the separator is not found, returns a 3-tuple containing two empty strings
13133and the original string.
13134[clinic start generated code]*/
13135
13136static PyObject *
13137unicode_rpartition(PyObject *self, PyObject *sep)
Serhiy Storchakaa2314282017-10-29 02:11:54 +030013138/*[clinic end generated code: output=1aa13cf1156572aa input=c4b7db3ef5cf336a]*/
Thomas Wouters477c8d52006-05-27 19:21:47 +000013139{
INADA Naoki3ae20562017-01-16 20:41:20 +090013140 return PyUnicode_RPartition(self, sep);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013141}
13142
Alexander Belopolsky40018472011-02-26 01:02:56 +000013143PyObject *
13144PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013145{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013146 if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
Benjamin Peterson14339b62009-01-31 16:36:08 +000013147 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013148
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013149 return rsplit(s, sep, maxsplit);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013150}
13151
INADA Naoki3ae20562017-01-16 20:41:20 +090013152/*[clinic input]
13153str.rsplit as unicode_rsplit = str.split
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013154
INADA Naoki3ae20562017-01-16 20:41:20 +090013155Return a list of the words in the string, using sep as the delimiter string.
13156
13157Splits are done starting at the end of the string and working to the front.
13158[clinic start generated code]*/
13159
13160static PyObject *
13161unicode_rsplit_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
13162/*[clinic end generated code: output=c2b815c63bcabffc input=12ad4bf57dd35f15]*/
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013163{
INADA Naoki3ae20562017-01-16 20:41:20 +090013164 if (sep == Py_None)
13165 return rsplit(self, NULL, maxsplit);
13166 if (PyUnicode_Check(sep))
13167 return rsplit(self, sep, maxsplit);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013168
Victor Stinner998b8062018-09-12 00:23:25 +020013169 PyErr_Format(PyExc_TypeError,
13170 "must be str or None, not %.100s",
13171 Py_TYPE(sep)->tp_name);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013172 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013173}
13174
INADA Naoki3ae20562017-01-16 20:41:20 +090013175/*[clinic input]
13176str.splitlines as unicode_splitlines
Guido van Rossumd57fd912000-03-10 22:53:23 +000013177
Serhiy Storchaka202fda52017-03-12 10:10:47 +020013178 keepends: bool(accept={int}) = False
INADA Naoki3ae20562017-01-16 20:41:20 +090013179
13180Return a list of the lines in the string, breaking at line boundaries.
13181
13182Line breaks are not included in the resulting list unless keepends is given and
13183true.
13184[clinic start generated code]*/
13185
13186static PyObject *
13187unicode_splitlines_impl(PyObject *self, int keepends)
Serhiy Storchaka202fda52017-03-12 10:10:47 +020013188/*[clinic end generated code: output=f664dcdad153ec40 input=b508e180459bdd8b]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013189{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013190 return PyUnicode_Splitlines(self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013191}
13192
13193static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000013194PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013195{
Victor Stinnerc4b49542011-12-11 22:44:26 +010013196 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013197}
13198
INADA Naoki3ae20562017-01-16 20:41:20 +090013199/*[clinic input]
13200str.swapcase as unicode_swapcase
Guido van Rossumd57fd912000-03-10 22:53:23 +000013201
INADA Naoki3ae20562017-01-16 20:41:20 +090013202Convert uppercase characters to lowercase and lowercase characters to uppercase.
13203[clinic start generated code]*/
13204
13205static PyObject *
13206unicode_swapcase_impl(PyObject *self)
13207/*[clinic end generated code: output=5d28966bf6d7b2af input=3f3ef96d5798a7bb]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013208{
Benjamin Petersoneea48462012-01-16 14:28:50 -050013209 if (PyUnicode_READY(self) == -1)
13210 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010013211 return case_operation(self, do_swapcase);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013212}
13213
Larry Hastings61272b72014-01-07 12:41:53 -080013214/*[clinic input]
Georg Brandlceee0772007-11-27 23:48:05 +000013215
Larry Hastings31826802013-10-19 00:09:25 -070013216@staticmethod
13217str.maketrans as unicode_maketrans
13218
13219 x: object
13220
13221 y: unicode=NULL
13222
13223 z: unicode=NULL
13224
13225 /
13226
13227Return a translation table usable for str.translate().
13228
13229If there is only one argument, it must be a dictionary mapping Unicode
13230ordinals (integers) or characters to Unicode ordinals, strings or None.
13231Character keys will be then converted to ordinals.
13232If there are two arguments, they must be strings of equal length, and
13233in the resulting dictionary, each character in x will be mapped to the
13234character at the same position in y. If there is a third argument, it
13235must be a string, whose characters will be mapped to None in the result.
Larry Hastings61272b72014-01-07 12:41:53 -080013236[clinic start generated code]*/
Larry Hastings31826802013-10-19 00:09:25 -070013237
Larry Hastings31826802013-10-19 00:09:25 -070013238static PyObject *
Larry Hastings5c661892014-01-24 06:17:25 -080013239unicode_maketrans_impl(PyObject *x, PyObject *y, PyObject *z)
Serhiy Storchaka1009bf12015-04-03 23:53:51 +030013240/*[clinic end generated code: output=a925c89452bd5881 input=7bfbf529a293c6c5]*/
Larry Hastings31826802013-10-19 00:09:25 -070013241{
Georg Brandlceee0772007-11-27 23:48:05 +000013242 PyObject *new = NULL, *key, *value;
13243 Py_ssize_t i = 0;
13244 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013245
Georg Brandlceee0772007-11-27 23:48:05 +000013246 new = PyDict_New();
13247 if (!new)
13248 return NULL;
13249 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013250 int x_kind, y_kind, z_kind;
13251 void *x_data, *y_data, *z_data;
13252
Georg Brandlceee0772007-11-27 23:48:05 +000013253 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000013254 if (!PyUnicode_Check(x)) {
13255 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
13256 "be a string if there is a second argument");
13257 goto err;
13258 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013259 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013260 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
13261 "arguments must have equal length");
13262 goto err;
13263 }
13264 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013265 x_kind = PyUnicode_KIND(x);
13266 y_kind = PyUnicode_KIND(y);
13267 x_data = PyUnicode_DATA(x);
13268 y_data = PyUnicode_DATA(y);
13269 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
13270 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060013271 if (!key)
Georg Brandlceee0772007-11-27 23:48:05 +000013272 goto err;
Benjamin Peterson822c7902011-12-20 13:32:50 -060013273 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060013274 if (!value) {
13275 Py_DECREF(key);
13276 goto err;
13277 }
Georg Brandlceee0772007-11-27 23:48:05 +000013278 res = PyDict_SetItem(new, key, value);
13279 Py_DECREF(key);
13280 Py_DECREF(value);
13281 if (res < 0)
13282 goto err;
13283 }
13284 /* create entries for deleting chars in z */
13285 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013286 z_kind = PyUnicode_KIND(z);
13287 z_data = PyUnicode_DATA(z);
Victor Stinnerc4f281e2011-10-11 22:11:42 +020013288 for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013289 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000013290 if (!key)
13291 goto err;
13292 res = PyDict_SetItem(new, key, Py_None);
13293 Py_DECREF(key);
13294 if (res < 0)
13295 goto err;
13296 }
13297 }
13298 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013299 int kind;
13300 void *data;
13301
Georg Brandlceee0772007-11-27 23:48:05 +000013302 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000013303 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013304 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
13305 "to maketrans it must be a dict");
13306 goto err;
13307 }
13308 /* copy entries into the new dict, converting string keys to int keys */
13309 while (PyDict_Next(x, &i, &key, &value)) {
13310 if (PyUnicode_Check(key)) {
13311 /* convert string keys to integer keys */
13312 PyObject *newkey;
Victor Stinnerc4f281e2011-10-11 22:11:42 +020013313 if (PyUnicode_GET_LENGTH(key) != 1) {
Georg Brandlceee0772007-11-27 23:48:05 +000013314 PyErr_SetString(PyExc_ValueError, "string keys in translate "
13315 "table must be of length 1");
13316 goto err;
13317 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013318 kind = PyUnicode_KIND(key);
13319 data = PyUnicode_DATA(key);
13320 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000013321 if (!newkey)
13322 goto err;
13323 res = PyDict_SetItem(new, newkey, value);
13324 Py_DECREF(newkey);
13325 if (res < 0)
13326 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000013327 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013328 /* just keep integer keys */
13329 if (PyDict_SetItem(new, key, value) < 0)
13330 goto err;
13331 } else {
13332 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
13333 "be strings or integers");
13334 goto err;
13335 }
13336 }
13337 }
13338 return new;
13339 err:
13340 Py_DECREF(new);
13341 return NULL;
13342}
13343
INADA Naoki3ae20562017-01-16 20:41:20 +090013344/*[clinic input]
13345str.translate as unicode_translate
Guido van Rossumd57fd912000-03-10 22:53:23 +000013346
INADA Naoki3ae20562017-01-16 20:41:20 +090013347 table: object
13348 Translation table, which must be a mapping of Unicode ordinals to
13349 Unicode ordinals, strings, or None.
13350 /
13351
13352Replace each character in the string using the given translation table.
13353
13354The table must implement lookup/indexing via __getitem__, for instance a
13355dictionary or list. If this operation raises LookupError, the character is
13356left untouched. Characters mapped to None are deleted.
13357[clinic start generated code]*/
13358
13359static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013360unicode_translate(PyObject *self, PyObject *table)
INADA Naoki3ae20562017-01-16 20:41:20 +090013361/*[clinic end generated code: output=3cb448ff2fd96bf3 input=6d38343db63d8eb0]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013362{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013363 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013364}
13365
INADA Naoki3ae20562017-01-16 20:41:20 +090013366/*[clinic input]
13367str.upper as unicode_upper
Guido van Rossumd57fd912000-03-10 22:53:23 +000013368
INADA Naoki3ae20562017-01-16 20:41:20 +090013369Return a copy of the string converted to uppercase.
13370[clinic start generated code]*/
13371
13372static PyObject *
13373unicode_upper_impl(PyObject *self)
13374/*[clinic end generated code: output=1b7ddd16bbcdc092 input=db3d55682dfe2e6c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013375{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050013376 if (PyUnicode_READY(self) == -1)
13377 return NULL;
13378 if (PyUnicode_IS_ASCII(self))
13379 return ascii_upper_or_lower(self, 0);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010013380 return case_operation(self, do_upper);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013381}
13382
INADA Naoki3ae20562017-01-16 20:41:20 +090013383/*[clinic input]
13384str.zfill as unicode_zfill
13385
13386 width: Py_ssize_t
13387 /
13388
13389Pad a numeric string with zeros on the left, to fill a field of the given width.
13390
13391The string is never truncated.
13392[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013393
13394static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090013395unicode_zfill_impl(PyObject *self, Py_ssize_t width)
INADA Naoki15f94592017-01-16 21:49:13 +090013396/*[clinic end generated code: output=e13fb6bdf8e3b9df input=c6b2f772c6f27799]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013397{
Martin v. Löwis18e16552006-02-15 17:27:45 +000013398 Py_ssize_t fill;
Victor Stinner9310abb2011-10-05 00:59:23 +020013399 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013400 int kind;
13401 void *data;
13402 Py_UCS4 chr;
13403
Benjamin Petersonbac79492012-01-14 13:34:47 -050013404 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010013405 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013406
Victor Stinnerc4b49542011-12-11 22:44:26 +010013407 if (PyUnicode_GET_LENGTH(self) >= width)
13408 return unicode_result_unchanged(self);
13409
13410 fill = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013411
13412 u = pad(self, fill, 0, '0');
13413
Walter Dörwald068325e2002-04-15 13:36:47 +000013414 if (u == NULL)
13415 return NULL;
13416
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013417 kind = PyUnicode_KIND(u);
13418 data = PyUnicode_DATA(u);
13419 chr = PyUnicode_READ(kind, data, fill);
13420
13421 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000013422 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013423 PyUnicode_WRITE(kind, data, 0, chr);
13424 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000013425 }
13426
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013427 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +010013428 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013429}
Guido van Rossumd57fd912000-03-10 22:53:23 +000013430
#if 0
/* Compiled out by the enclosing #if 0: thin wrapper around
   PyUnicode_TransformDecimalAndSpaceToASCII(). */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *converted = PyUnicode_TransformDecimalAndSpaceToASCII(self);

    return converted;
}
#endif
13438
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013439PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013440 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013441\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013442Return True if S starts with the specified prefix, False otherwise.\n\
13443With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013444With optional end, stop comparing S at that position.\n\
13445prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013446
13447static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013448unicode_startswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013449 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013450{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013451 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013452 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013453 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013454 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013455 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013456
Jesus Ceaac451502011-04-20 17:09:23 +020013457 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013458 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013459 if (PyTuple_Check(subobj)) {
13460 Py_ssize_t i;
13461 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013462 substring = PyTuple_GET_ITEM(subobj, i);
13463 if (!PyUnicode_Check(substring)) {
13464 PyErr_Format(PyExc_TypeError,
13465 "tuple for startswith must only contain str, "
Victor Stinner998b8062018-09-12 00:23:25 +020013466 "not %.100s",
13467 Py_TYPE(substring)->tp_name);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013468 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013469 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013470 result = tailmatch(self, substring, start, end, -1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013471 if (result == -1)
13472 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013473 if (result) {
13474 Py_RETURN_TRUE;
13475 }
13476 }
13477 /* nothing matched */
13478 Py_RETURN_FALSE;
13479 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013480 if (!PyUnicode_Check(subobj)) {
13481 PyErr_Format(PyExc_TypeError,
13482 "startswith first arg must be str or "
Victor Stinner998b8062018-09-12 00:23:25 +020013483 "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013484 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013485 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013486 result = tailmatch(self, subobj, start, end, -1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013487 if (result == -1)
13488 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013489 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013490}
13491
13492
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013493PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013494 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013495\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013496Return True if S ends with the specified suffix, False otherwise.\n\
13497With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013498With optional end, stop comparing S at that position.\n\
13499suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013500
13501static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013502unicode_endswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013503 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013504{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013505 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013506 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013507 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013508 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013509 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013510
Jesus Ceaac451502011-04-20 17:09:23 +020013511 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013512 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013513 if (PyTuple_Check(subobj)) {
13514 Py_ssize_t i;
13515 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013516 substring = PyTuple_GET_ITEM(subobj, i);
13517 if (!PyUnicode_Check(substring)) {
13518 PyErr_Format(PyExc_TypeError,
13519 "tuple for endswith must only contain str, "
Victor Stinner998b8062018-09-12 00:23:25 +020013520 "not %.100s",
13521 Py_TYPE(substring)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013522 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013523 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013524 result = tailmatch(self, substring, start, end, +1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013525 if (result == -1)
13526 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013527 if (result) {
13528 Py_RETURN_TRUE;
13529 }
13530 }
13531 Py_RETURN_FALSE;
13532 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013533 if (!PyUnicode_Check(subobj)) {
13534 PyErr_Format(PyExc_TypeError,
13535 "endswith first arg must be str or "
Victor Stinner998b8062018-09-12 00:23:25 +020013536 "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013537 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013538 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013539 result = tailmatch(self, subobj, start, end, +1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013540 if (result == -1)
13541 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013542 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013543}
13544
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070013545static inline void
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013546_PyUnicodeWriter_Update(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013547{
Victor Stinnereb36fda2015-10-03 01:55:51 +020013548 writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer);
13549 writer->data = PyUnicode_DATA(writer->buffer);
13550
13551 if (!writer->readonly) {
13552 writer->kind = PyUnicode_KIND(writer->buffer);
Victor Stinner8f674cc2013-04-17 23:02:17 +020013553 writer->size = PyUnicode_GET_LENGTH(writer->buffer);
Victor Stinnereb36fda2015-10-03 01:55:51 +020013554 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013555 else {
Victor Stinnereb36fda2015-10-03 01:55:51 +020013556 /* use a value smaller than PyUnicode_1BYTE_KIND() so
13557 _PyUnicodeWriter_PrepareKind() will copy the buffer. */
13558 writer->kind = PyUnicode_WCHAR_KIND;
13559 assert(writer->kind <= PyUnicode_1BYTE_KIND);
13560
Victor Stinner8f674cc2013-04-17 23:02:17 +020013561 /* Copy-on-write mode: set buffer size to 0 so
13562 * _PyUnicodeWriter_Prepare() will copy (and enlarge) the buffer on
13563 * next write. */
13564 writer->size = 0;
13565 }
Victor Stinner202fdca2012-05-07 12:47:02 +020013566}
13567
Victor Stinnerd3f08822012-05-29 12:57:52 +020013568void
Victor Stinner8f674cc2013-04-17 23:02:17 +020013569_PyUnicodeWriter_Init(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013570{
Victor Stinnerd3f08822012-05-29 12:57:52 +020013571 memset(writer, 0, sizeof(*writer));
Victor Stinnereb36fda2015-10-03 01:55:51 +020013572
13573 /* ASCII is the bare minimum */
Victor Stinner8f674cc2013-04-17 23:02:17 +020013574 writer->min_char = 127;
Victor Stinnereb36fda2015-10-03 01:55:51 +020013575
13576 /* use a value smaller than PyUnicode_1BYTE_KIND() so
13577 _PyUnicodeWriter_PrepareKind() will copy the buffer. */
13578 writer->kind = PyUnicode_WCHAR_KIND;
13579 assert(writer->kind <= PyUnicode_1BYTE_KIND);
Victor Stinner202fdca2012-05-07 12:47:02 +020013580}
13581
Inada Naoki770847a2019-06-24 12:30:24 +090013582// Initialize _PyUnicodeWriter with initial buffer
13583static inline void
13584_PyUnicodeWriter_InitWithBuffer(_PyUnicodeWriter *writer, PyObject *buffer)
13585{
13586 memset(writer, 0, sizeof(*writer));
13587 writer->buffer = buffer;
13588 _PyUnicodeWriter_Update(writer);
13589 writer->min_length = writer->size;
13590}
13591
Victor Stinnerd3f08822012-05-29 12:57:52 +020013592int
13593_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
13594 Py_ssize_t length, Py_UCS4 maxchar)
Victor Stinner202fdca2012-05-07 12:47:02 +020013595{
13596 Py_ssize_t newlen;
13597 PyObject *newbuffer;
13598
Victor Stinner2740e462016-09-06 16:58:36 -070013599 assert(maxchar <= MAX_UNICODE);
13600
Victor Stinnerca9381e2015-09-22 00:58:32 +020013601 /* ensure that the _PyUnicodeWriter_Prepare macro was used */
Victor Stinner61744742015-09-22 01:01:17 +020013602 assert((maxchar > writer->maxchar && length >= 0)
13603 || length > 0);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013604
Victor Stinner202fdca2012-05-07 12:47:02 +020013605 if (length > PY_SSIZE_T_MAX - writer->pos) {
13606 PyErr_NoMemory();
13607 return -1;
13608 }
13609 newlen = writer->pos + length;
13610
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070013611 maxchar = Py_MAX(maxchar, writer->min_char);
Victor Stinner8f674cc2013-04-17 23:02:17 +020013612
Victor Stinnerd3f08822012-05-29 12:57:52 +020013613 if (writer->buffer == NULL) {
Victor Stinner8f674cc2013-04-17 23:02:17 +020013614 assert(!writer->readonly);
Victor Stinner6989ba02013-11-18 21:08:39 +010013615 if (writer->overallocate
13616 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13617 /* overallocate to limit the number of realloc() */
13618 newlen += newlen / OVERALLOCATE_FACTOR;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013619 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013620 if (newlen < writer->min_length)
13621 newlen = writer->min_length;
13622
Victor Stinnerd3f08822012-05-29 12:57:52 +020013623 writer->buffer = PyUnicode_New(newlen, maxchar);
13624 if (writer->buffer == NULL)
13625 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013626 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013627 else if (newlen > writer->size) {
Victor Stinner6989ba02013-11-18 21:08:39 +010013628 if (writer->overallocate
13629 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13630 /* overallocate to limit the number of realloc() */
13631 newlen += newlen / OVERALLOCATE_FACTOR;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013632 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013633 if (newlen < writer->min_length)
13634 newlen = writer->min_length;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013635
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013636 if (maxchar > writer->maxchar || writer->readonly) {
Victor Stinner202fdca2012-05-07 12:47:02 +020013637 /* resize + widen */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +030013638 maxchar = Py_MAX(maxchar, writer->maxchar);
Victor Stinner202fdca2012-05-07 12:47:02 +020013639 newbuffer = PyUnicode_New(newlen, maxchar);
13640 if (newbuffer == NULL)
13641 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013642 _PyUnicode_FastCopyCharacters(newbuffer, 0,
13643 writer->buffer, 0, writer->pos);
Victor Stinner202fdca2012-05-07 12:47:02 +020013644 Py_DECREF(writer->buffer);
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013645 writer->readonly = 0;
Victor Stinner202fdca2012-05-07 12:47:02 +020013646 }
13647 else {
13648 newbuffer = resize_compact(writer->buffer, newlen);
13649 if (newbuffer == NULL)
13650 return -1;
13651 }
13652 writer->buffer = newbuffer;
Victor Stinner202fdca2012-05-07 12:47:02 +020013653 }
13654 else if (maxchar > writer->maxchar) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013655 assert(!writer->readonly);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013656 newbuffer = PyUnicode_New(writer->size, maxchar);
13657 if (newbuffer == NULL)
Victor Stinner202fdca2012-05-07 12:47:02 +020013658 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013659 _PyUnicode_FastCopyCharacters(newbuffer, 0,
13660 writer->buffer, 0, writer->pos);
Serhiy Storchaka57a01d32016-04-10 18:05:40 +030013661 Py_SETREF(writer->buffer, newbuffer);
Victor Stinner202fdca2012-05-07 12:47:02 +020013662 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013663 _PyUnicodeWriter_Update(writer);
Victor Stinner202fdca2012-05-07 12:47:02 +020013664 return 0;
Victor Stinner6989ba02013-11-18 21:08:39 +010013665
13666#undef OVERALLOCATE_FACTOR
Victor Stinner202fdca2012-05-07 12:47:02 +020013667}
13668
Victor Stinnerca9381e2015-09-22 00:58:32 +020013669int
13670_PyUnicodeWriter_PrepareKindInternal(_PyUnicodeWriter *writer,
13671 enum PyUnicode_Kind kind)
13672{
13673 Py_UCS4 maxchar;
13674
13675 /* ensure that the _PyUnicodeWriter_PrepareKind macro was used */
13676 assert(writer->kind < kind);
13677
13678 switch (kind)
13679 {
13680 case PyUnicode_1BYTE_KIND: maxchar = 0xff; break;
13681 case PyUnicode_2BYTE_KIND: maxchar = 0xffff; break;
13682 case PyUnicode_4BYTE_KIND: maxchar = 0x10ffff; break;
13683 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070013684 Py_UNREACHABLE();
Victor Stinnerca9381e2015-09-22 00:58:32 +020013685 }
13686
13687 return _PyUnicodeWriter_PrepareInternal(writer, 0, maxchar);
13688}
13689
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070013690static inline int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020013691_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch)
Victor Stinnera0dd0212013-04-11 22:09:04 +020013692{
Victor Stinner2740e462016-09-06 16:58:36 -070013693 assert(ch <= MAX_UNICODE);
Victor Stinnera0dd0212013-04-11 22:09:04 +020013694 if (_PyUnicodeWriter_Prepare(writer, 1, ch) < 0)
13695 return -1;
13696 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, ch);
13697 writer->pos++;
13698 return 0;
13699}
13700
13701int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020013702_PyUnicodeWriter_WriteChar(_PyUnicodeWriter *writer, Py_UCS4 ch)
13703{
13704 return _PyUnicodeWriter_WriteCharInline(writer, ch);
13705}
13706
13707int
Victor Stinnerd3f08822012-05-29 12:57:52 +020013708_PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str)
13709{
13710 Py_UCS4 maxchar;
13711 Py_ssize_t len;
13712
13713 if (PyUnicode_READY(str) == -1)
13714 return -1;
13715 len = PyUnicode_GET_LENGTH(str);
13716 if (len == 0)
13717 return 0;
13718 maxchar = PyUnicode_MAX_CHAR_VALUE(str);
13719 if (maxchar > writer->maxchar || len > writer->size - writer->pos) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013720 if (writer->buffer == NULL && !writer->overallocate) {
Victor Stinner1912b392015-03-26 09:37:23 +010013721 assert(_PyUnicode_CheckConsistency(str, 1));
Victor Stinner8f674cc2013-04-17 23:02:17 +020013722 writer->readonly = 1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013723 Py_INCREF(str);
13724 writer->buffer = str;
13725 _PyUnicodeWriter_Update(writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013726 writer->pos += len;
13727 return 0;
13728 }
13729 if (_PyUnicodeWriter_PrepareInternal(writer, len, maxchar) == -1)
13730 return -1;
13731 }
13732 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13733 str, 0, len);
13734 writer->pos += len;
13735 return 0;
13736}
13737
Victor Stinnere215d962012-10-06 23:03:36 +020013738int
Victor Stinnercfc4c132013-04-03 01:48:39 +020013739_PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter *writer, PyObject *str,
13740 Py_ssize_t start, Py_ssize_t end)
13741{
13742 Py_UCS4 maxchar;
13743 Py_ssize_t len;
13744
13745 if (PyUnicode_READY(str) == -1)
13746 return -1;
13747
13748 assert(0 <= start);
13749 assert(end <= PyUnicode_GET_LENGTH(str));
13750 assert(start <= end);
13751
13752 if (end == 0)
13753 return 0;
13754
13755 if (start == 0 && end == PyUnicode_GET_LENGTH(str))
13756 return _PyUnicodeWriter_WriteStr(writer, str);
13757
13758 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
13759 maxchar = _PyUnicode_FindMaxChar(str, start, end);
13760 else
13761 maxchar = writer->maxchar;
13762 len = end - start;
13763
13764 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) < 0)
13765 return -1;
13766
13767 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13768 str, start, len);
13769 writer->pos += len;
13770 return 0;
13771}
13772
13773int
Victor Stinner4a587072013-11-19 12:54:53 +010013774_PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter *writer,
13775 const char *ascii, Py_ssize_t len)
13776{
13777 if (len == -1)
13778 len = strlen(ascii);
13779
13780 assert(ucs1lib_find_max_char((Py_UCS1*)ascii, (Py_UCS1*)ascii + len) < 128);
13781
13782 if (writer->buffer == NULL && !writer->overallocate) {
13783 PyObject *str;
13784
13785 str = _PyUnicode_FromASCII(ascii, len);
13786 if (str == NULL)
13787 return -1;
13788
13789 writer->readonly = 1;
13790 writer->buffer = str;
13791 _PyUnicodeWriter_Update(writer);
13792 writer->pos += len;
13793 return 0;
13794 }
13795
13796 if (_PyUnicodeWriter_Prepare(writer, len, 127) == -1)
13797 return -1;
13798
13799 switch (writer->kind)
13800 {
13801 case PyUnicode_1BYTE_KIND:
13802 {
13803 const Py_UCS1 *str = (const Py_UCS1 *)ascii;
13804 Py_UCS1 *data = writer->data;
13805
Christian Heimesf051e432016-09-13 20:22:02 +020013806 memcpy(data + writer->pos, str, len);
Victor Stinner4a587072013-11-19 12:54:53 +010013807 break;
13808 }
13809 case PyUnicode_2BYTE_KIND:
13810 {
13811 _PyUnicode_CONVERT_BYTES(
13812 Py_UCS1, Py_UCS2,
13813 ascii, ascii + len,
13814 (Py_UCS2 *)writer->data + writer->pos);
13815 break;
13816 }
13817 case PyUnicode_4BYTE_KIND:
13818 {
13819 _PyUnicode_CONVERT_BYTES(
13820 Py_UCS1, Py_UCS4,
13821 ascii, ascii + len,
13822 (Py_UCS4 *)writer->data + writer->pos);
13823 break;
13824 }
13825 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070013826 Py_UNREACHABLE();
Victor Stinner4a587072013-11-19 12:54:53 +010013827 }
13828
13829 writer->pos += len;
13830 return 0;
13831}
13832
13833int
13834_PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter *writer,
13835 const char *str, Py_ssize_t len)
Victor Stinnere215d962012-10-06 23:03:36 +020013836{
13837 Py_UCS4 maxchar;
13838
13839 maxchar = ucs1lib_find_max_char((Py_UCS1*)str, (Py_UCS1*)str + len);
13840 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) == -1)
13841 return -1;
13842 unicode_write_cstr(writer->buffer, writer->pos, str, len);
13843 writer->pos += len;
13844 return 0;
13845}
13846
Victor Stinnerd3f08822012-05-29 12:57:52 +020013847PyObject *
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013848_PyUnicodeWriter_Finish(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013849{
Victor Stinner15a0bd32013-07-08 22:29:55 +020013850 PyObject *str;
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030013851
Victor Stinnerd3f08822012-05-29 12:57:52 +020013852 if (writer->pos == 0) {
Victor Stinner9e6b4d72013-07-09 00:37:24 +020013853 Py_CLEAR(writer->buffer);
Serhiy Storchaka678db842013-01-26 12:16:36 +020013854 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3f08822012-05-29 12:57:52 +020013855 }
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030013856
13857 str = writer->buffer;
13858 writer->buffer = NULL;
13859
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013860 if (writer->readonly) {
Victor Stinner9e6b4d72013-07-09 00:37:24 +020013861 assert(PyUnicode_GET_LENGTH(str) == writer->pos);
13862 return str;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013863 }
Victor Stinner6c2cdae2015-10-12 13:29:43 +020013864
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030013865 if (PyUnicode_GET_LENGTH(str) != writer->pos) {
13866 PyObject *str2;
13867 str2 = resize_compact(str, writer->pos);
13868 if (str2 == NULL) {
13869 Py_DECREF(str);
13870 return NULL;
Victor Stinner6c2cdae2015-10-12 13:29:43 +020013871 }
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030013872 str = str2;
Victor Stinner6c2cdae2015-10-12 13:29:43 +020013873 }
13874
Victor Stinner15a0bd32013-07-08 22:29:55 +020013875 assert(_PyUnicode_CheckConsistency(str, 1));
13876 return unicode_result_ready(str);
Victor Stinner202fdca2012-05-07 12:47:02 +020013877}
13878
Victor Stinnerd3f08822012-05-29 12:57:52 +020013879void
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013880_PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013881{
13882 Py_CLEAR(writer->buffer);
13883}
13884
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013885#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000013886
13887PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013888 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000013889\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013890Return a formatted version of S, using substitutions from args and kwargs.\n\
13891The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000013892
Eric Smith27bbca62010-11-04 17:06:58 +000013893PyDoc_STRVAR(format_map__doc__,
13894 "S.format_map(mapping) -> str\n\
13895\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013896Return a formatted version of S, using substitutions from mapping.\n\
13897The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000013898
INADA Naoki3ae20562017-01-16 20:41:20 +090013899/*[clinic input]
13900str.__format__ as unicode___format__
13901
13902 format_spec: unicode
13903 /
13904
13905Return a formatted version of the string as described by format_spec.
13906[clinic start generated code]*/
13907
Eric Smith4a7d76d2008-05-30 18:10:19 +000013908static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090013909unicode___format___impl(PyObject *self, PyObject *format_spec)
INADA Naoki15f94592017-01-16 21:49:13 +090013910/*[clinic end generated code: output=45fceaca6d2ba4c8 input=5e135645d167a214]*/
Eric Smith4a7d76d2008-05-30 18:10:19 +000013911{
Victor Stinnerd3f08822012-05-29 12:57:52 +020013912 _PyUnicodeWriter writer;
13913 int ret;
Eric Smith4a7d76d2008-05-30 18:10:19 +000013914
Victor Stinnerd3f08822012-05-29 12:57:52 +020013915 if (PyUnicode_READY(self) == -1)
13916 return NULL;
Victor Stinner8f674cc2013-04-17 23:02:17 +020013917 _PyUnicodeWriter_Init(&writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013918 ret = _PyUnicode_FormatAdvancedWriter(&writer,
13919 self, format_spec, 0,
13920 PyUnicode_GET_LENGTH(format_spec));
13921 if (ret == -1) {
13922 _PyUnicodeWriter_Dealloc(&writer);
13923 return NULL;
13924 }
13925 return _PyUnicodeWriter_Finish(&writer);
Eric Smith4a7d76d2008-05-30 18:10:19 +000013926}
13927
INADA Naoki3ae20562017-01-16 20:41:20 +090013928/*[clinic input]
13929str.__sizeof__ as unicode_sizeof
13930
13931Return the size of the string in memory, in bytes.
13932[clinic start generated code]*/
Eric Smith8c663262007-08-25 02:26:07 +000013933
13934static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090013935unicode_sizeof_impl(PyObject *self)
13936/*[clinic end generated code: output=6dbc2f5a408b6d4f input=6dd011c108e33fb0]*/
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013937{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013938 Py_ssize_t size;
13939
13940 /* If it's a compact object, account for base structure +
13941 character data. */
INADA Naoki3ae20562017-01-16 20:41:20 +090013942 if (PyUnicode_IS_COMPACT_ASCII(self))
13943 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(self) + 1;
13944 else if (PyUnicode_IS_COMPACT(self))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013945 size = sizeof(PyCompactUnicodeObject) +
INADA Naoki3ae20562017-01-16 20:41:20 +090013946 (PyUnicode_GET_LENGTH(self) + 1) * PyUnicode_KIND(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013947 else {
13948 /* If it is a two-block object, account for base object, and
13949 for character block if present. */
13950 size = sizeof(PyUnicodeObject);
INADA Naoki3ae20562017-01-16 20:41:20 +090013951 if (_PyUnicode_DATA_ANY(self))
13952 size += (PyUnicode_GET_LENGTH(self) + 1) *
13953 PyUnicode_KIND(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013954 }
13955 /* If the wstr pointer is present, account for it unless it is shared
Victor Stinnera3be6132011-10-03 02:16:37 +020013956 with the data pointer. Check if the data is not shared. */
INADA Naoki3ae20562017-01-16 20:41:20 +090013957 if (_PyUnicode_HAS_WSTR_MEMORY(self))
13958 size += (PyUnicode_WSTR_LENGTH(self) + 1) * sizeof(wchar_t);
13959 if (_PyUnicode_HAS_UTF8_MEMORY(self))
13960 size += PyUnicode_UTF8_LENGTH(self) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013961
13962 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013963}
13964
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013965static PyObject *
Siddhesh Poyarekar55edd0c2018-04-30 00:29:33 +053013966unicode_getnewargs(PyObject *v, PyObject *Py_UNUSED(ignored))
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013967{
Victor Stinnerbf6e5602011-12-12 01:53:47 +010013968 PyObject *copy = _PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013969 if (!copy)
13970 return NULL;
13971 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013972}
13973
Guido van Rossumd57fd912000-03-10 22:53:23 +000013974static PyMethodDef unicode_methods[] = {
INADA Naoki3ae20562017-01-16 20:41:20 +090013975 UNICODE_ENCODE_METHODDEF
13976 UNICODE_REPLACE_METHODDEF
13977 UNICODE_SPLIT_METHODDEF
13978 UNICODE_RSPLIT_METHODDEF
13979 UNICODE_JOIN_METHODDEF
13980 UNICODE_CAPITALIZE_METHODDEF
13981 UNICODE_CASEFOLD_METHODDEF
13982 UNICODE_TITLE_METHODDEF
13983 UNICODE_CENTER_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013984 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090013985 UNICODE_EXPANDTABS_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013986 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090013987 UNICODE_PARTITION_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013988 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090013989 UNICODE_LJUST_METHODDEF
13990 UNICODE_LOWER_METHODDEF
13991 UNICODE_LSTRIP_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013992 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
13993 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090013994 UNICODE_RJUST_METHODDEF
13995 UNICODE_RSTRIP_METHODDEF
13996 UNICODE_RPARTITION_METHODDEF
13997 UNICODE_SPLITLINES_METHODDEF
13998 UNICODE_STRIP_METHODDEF
13999 UNICODE_SWAPCASE_METHODDEF
14000 UNICODE_TRANSLATE_METHODDEF
14001 UNICODE_UPPER_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000014002 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
14003 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
INADA Naokia49ac992018-01-27 14:06:21 +090014004 UNICODE_ISASCII_METHODDEF
INADA Naoki3ae20562017-01-16 20:41:20 +090014005 UNICODE_ISLOWER_METHODDEF
14006 UNICODE_ISUPPER_METHODDEF
14007 UNICODE_ISTITLE_METHODDEF
14008 UNICODE_ISSPACE_METHODDEF
14009 UNICODE_ISDECIMAL_METHODDEF
14010 UNICODE_ISDIGIT_METHODDEF
14011 UNICODE_ISNUMERIC_METHODDEF
14012 UNICODE_ISALPHA_METHODDEF
14013 UNICODE_ISALNUM_METHODDEF
14014 UNICODE_ISIDENTIFIER_METHODDEF
14015 UNICODE_ISPRINTABLE_METHODDEF
14016 UNICODE_ZFILL_METHODDEF
Serhiy Storchaka62be7422018-11-27 13:27:31 +020014017 {"format", (PyCFunction)(void(*)(void)) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +000014018 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090014019 UNICODE___FORMAT___METHODDEF
Larry Hastings31826802013-10-19 00:09:25 -070014020 UNICODE_MAKETRANS_METHODDEF
INADA Naoki3ae20562017-01-16 20:41:20 +090014021 UNICODE_SIZEOF_METHODDEF
Walter Dörwald068325e2002-04-15 13:36:47 +000014022#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000014023 /* These methods are just used for debugging the implementation. */
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000014024 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000014025#endif
14026
Siddhesh Poyarekar55edd0c2018-04-30 00:29:33 +053014027 {"__getnewargs__", unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000014028 {NULL, NULL}
14029};
14030
Neil Schemenauerce30bc92002-11-18 16:10:18 +000014031static PyObject *
14032unicode_mod(PyObject *v, PyObject *w)
14033{
Brian Curtindfc80e32011-08-10 20:28:54 -050014034 if (!PyUnicode_Check(v))
14035 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000014036 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000014037}
14038
14039static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014040 0, /*nb_add*/
14041 0, /*nb_subtract*/
14042 0, /*nb_multiply*/
14043 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +000014044};
14045
Guido van Rossumd57fd912000-03-10 22:53:23 +000014046static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014047 (lenfunc) unicode_length, /* sq_length */
14048 PyUnicode_Concat, /* sq_concat */
14049 (ssizeargfunc) unicode_repeat, /* sq_repeat */
14050 (ssizeargfunc) unicode_getitem, /* sq_item */
14051 0, /* sq_slice */
14052 0, /* sq_ass_item */
14053 0, /* sq_ass_slice */
14054 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +000014055};
14056
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014057static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014058unicode_subscript(PyObject* self, PyObject* item)
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014059{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014060 if (PyUnicode_READY(self) == -1)
14061 return NULL;
14062
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000014063 if (PyIndex_Check(item)) {
14064 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014065 if (i == -1 && PyErr_Occurred())
14066 return NULL;
14067 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014068 i += PyUnicode_GET_LENGTH(self);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014069 return unicode_getitem(self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014070 } else if (PySlice_Check(item)) {
Zackery Spytz14514d92019-05-17 01:13:03 -060014071 Py_ssize_t start, stop, step, slicelength, i;
14072 size_t cur;
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014073 PyObject *result;
14074 void *src_data, *dest_data;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014075 int src_kind, dest_kind;
Victor Stinnerc80d6d22011-10-05 14:13:28 +020014076 Py_UCS4 ch, max_char, kind_limit;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014077
Serhiy Storchakab879fe82017-04-08 09:53:51 +030014078 if (PySlice_Unpack(item, &start, &stop, &step) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014079 return NULL;
14080 }
Serhiy Storchakab879fe82017-04-08 09:53:51 +030014081 slicelength = PySlice_AdjustIndices(PyUnicode_GET_LENGTH(self),
14082 &start, &stop, step);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014083
14084 if (slicelength <= 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020014085 _Py_RETURN_UNICODE_EMPTY();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014086 } else if (start == 0 && step == 1 &&
Victor Stinnerc4b49542011-12-11 22:44:26 +010014087 slicelength == PyUnicode_GET_LENGTH(self)) {
14088 return unicode_result_unchanged(self);
Thomas Woutersed03b412007-08-28 21:37:11 +000014089 } else if (step == 1) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014090 return PyUnicode_Substring(self,
Victor Stinner12bab6d2011-10-01 01:53:49 +020014091 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014092 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014093 /* General case */
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014094 src_kind = PyUnicode_KIND(self);
14095 src_data = PyUnicode_DATA(self);
Victor Stinner55c99112011-10-13 01:17:06 +020014096 if (!PyUnicode_IS_ASCII(self)) {
14097 kind_limit = kind_maxchar_limit(src_kind);
14098 max_char = 0;
14099 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
14100 ch = PyUnicode_READ(src_kind, src_data, cur);
14101 if (ch > max_char) {
14102 max_char = ch;
14103 if (max_char >= kind_limit)
14104 break;
14105 }
Victor Stinnerc80d6d22011-10-05 14:13:28 +020014106 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014107 }
Victor Stinner55c99112011-10-13 01:17:06 +020014108 else
14109 max_char = 127;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014110 result = PyUnicode_New(slicelength, max_char);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014111 if (result == NULL)
14112 return NULL;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014113 dest_kind = PyUnicode_KIND(result);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014114 dest_data = PyUnicode_DATA(result);
14115
14116 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014117 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
14118 PyUnicode_WRITE(dest_kind, dest_data, i, ch);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014119 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020014120 assert(_PyUnicode_CheckConsistency(result, 1));
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014121 return result;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014122 } else {
14123 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
14124 return NULL;
14125 }
14126}
14127
14128static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014129 (lenfunc)unicode_length, /* mp_length */
14130 (binaryfunc)unicode_subscript, /* mp_subscript */
14131 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014132};
14133
Guido van Rossumd57fd912000-03-10 22:53:23 +000014134
Guido van Rossumd57fd912000-03-10 22:53:23 +000014135/* Helpers for PyUnicode_Format() */
14136
Victor Stinnera47082312012-10-04 02:19:54 +020014137struct unicode_formatter_t {
14138 PyObject *args;
14139 int args_owned;
14140 Py_ssize_t arglen, argidx;
14141 PyObject *dict;
14142
14143 enum PyUnicode_Kind fmtkind;
14144 Py_ssize_t fmtcnt, fmtpos;
14145 void *fmtdata;
14146 PyObject *fmtstr;
14147
14148 _PyUnicodeWriter writer;
14149};
14150
14151struct unicode_format_arg_t {
14152 Py_UCS4 ch;
14153 int flags;
14154 Py_ssize_t width;
14155 int prec;
14156 int sign;
14157};
14158
Guido van Rossumd57fd912000-03-10 22:53:23 +000014159static PyObject *
Victor Stinnera47082312012-10-04 02:19:54 +020014160unicode_format_getnextarg(struct unicode_formatter_t *ctx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014161{
Victor Stinnera47082312012-10-04 02:19:54 +020014162 Py_ssize_t argidx = ctx->argidx;
14163
14164 if (argidx < ctx->arglen) {
14165 ctx->argidx++;
14166 if (ctx->arglen < 0)
14167 return ctx->args;
Benjamin Peterson29060642009-01-31 22:14:21 +000014168 else
Victor Stinnera47082312012-10-04 02:19:54 +020014169 return PyTuple_GetItem(ctx->args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014170 }
14171 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000014172 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000014173 return NULL;
14174}
14175
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014176/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000014177
Victor Stinnera47082312012-10-04 02:19:54 +020014178/* Format a float into the writer if the writer is not NULL, or into *p_output
14179 otherwise.
14180
14181 Return 0 on success, raise an exception and return -1 on error. */
Victor Stinnerd3f08822012-05-29 12:57:52 +020014182static int
Victor Stinnera47082312012-10-04 02:19:54 +020014183formatfloat(PyObject *v, struct unicode_format_arg_t *arg,
14184 PyObject **p_output,
14185 _PyUnicodeWriter *writer)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014186{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014187 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014188 double x;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014189 Py_ssize_t len;
Victor Stinnera47082312012-10-04 02:19:54 +020014190 int prec;
14191 int dtoa_flags;
Tim Petersced69f82003-09-16 20:30:58 +000014192
Guido van Rossumd57fd912000-03-10 22:53:23 +000014193 x = PyFloat_AsDouble(v);
14194 if (x == -1.0 && PyErr_Occurred())
Victor Stinnerd3f08822012-05-29 12:57:52 +020014195 return -1;
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014196
Victor Stinnera47082312012-10-04 02:19:54 +020014197 prec = arg->prec;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014198 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000014199 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000014200
Victor Stinnera47082312012-10-04 02:19:54 +020014201 if (arg->flags & F_ALT)
14202 dtoa_flags = Py_DTSF_ALT;
14203 else
14204 dtoa_flags = 0;
14205 p = PyOS_double_to_string(x, arg->ch, prec, dtoa_flags, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014206 if (p == NULL)
Victor Stinnerd3f08822012-05-29 12:57:52 +020014207 return -1;
14208 len = strlen(p);
14209 if (writer) {
Victor Stinner4a587072013-11-19 12:54:53 +010014210 if (_PyUnicodeWriter_WriteASCIIString(writer, p, len) < 0) {
Christian Heimesf4f99392012-09-10 11:48:41 +020014211 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020014212 return -1;
Christian Heimesf4f99392012-09-10 11:48:41 +020014213 }
Victor Stinnerd3f08822012-05-29 12:57:52 +020014214 }
14215 else
14216 *p_output = _PyUnicode_FromASCII(p, len);
Eric Smith0923d1d2009-04-16 20:16:10 +000014217 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020014218 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014219}
14220
Victor Stinnerd0880d52012-04-27 23:40:13 +020014221/* formatlong() emulates the format codes d, u, o, x and X, and
14222 * the F_ALT flag, for Python's long (unbounded) ints. It's not used for
14223 * Python's regular ints.
14224 * Return value: a new PyUnicodeObject*, or NULL if error.
14225 * The output string is of the form
14226 * "-"? ("0x" | "0X")? digit+
14227 * "0x"/"0X" are present only for x and X conversions, with F_ALT
14228 * set in flags. The case of hex digits will be correct,
14229 * There will be at least prec digits, zero-filled on the left if
14230 * necessary to get that many.
14231 * val object to be converted
14232 * flags bitmask of format flags; only F_ALT is looked at
14233 * prec minimum number of digits; 0-fill on left if needed
14234 * type a character in [duoxX]; u acts the same as d
14235 *
14236 * CAUTION: o, x and X conversions on regular ints can never
14237 * produce a '-' sign, but can for Python's unbounded ints.
14238 */
Ethan Furmanb95b5612015-01-23 20:05:18 -080014239PyObject *
14240_PyUnicode_FormatLong(PyObject *val, int alt, int prec, int type)
Tim Peters38fd5b62000-09-21 05:43:11 +000014241{
Victor Stinnerd0880d52012-04-27 23:40:13 +020014242 PyObject *result = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014243 char *buf;
Victor Stinnerd0880d52012-04-27 23:40:13 +020014244 Py_ssize_t i;
14245 int sign; /* 1 if '-', else 0 */
14246 int len; /* number of characters */
14247 Py_ssize_t llen;
14248 int numdigits; /* len == numnondigits + numdigits */
14249 int numnondigits = 0;
Tim Peters38fd5b62000-09-21 05:43:11 +000014250
Victor Stinnerd0880d52012-04-27 23:40:13 +020014251 /* Avoid exceeding SSIZE_T_MAX */
14252 if (prec > INT_MAX-3) {
14253 PyErr_SetString(PyExc_OverflowError,
14254 "precision too large");
Benjamin Peterson14339b62009-01-31 16:36:08 +000014255 return NULL;
Victor Stinnerd0880d52012-04-27 23:40:13 +020014256 }
14257
14258 assert(PyLong_Check(val));
14259
14260 switch (type) {
Victor Stinner621ef3d2012-10-02 00:33:47 +020014261 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070014262 Py_UNREACHABLE();
Victor Stinnerd0880d52012-04-27 23:40:13 +020014263 case 'd':
Victor Stinner621ef3d2012-10-02 00:33:47 +020014264 case 'i':
Victor Stinnerd0880d52012-04-27 23:40:13 +020014265 case 'u':
Ethan Furmanfb137212013-08-31 10:18:55 -070014266 /* int and int subclasses should print numerically when a numeric */
14267 /* format code is used (see issue18780) */
14268 result = PyNumber_ToBase(val, 10);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014269 break;
14270 case 'o':
14271 numnondigits = 2;
14272 result = PyNumber_ToBase(val, 8);
14273 break;
14274 case 'x':
14275 case 'X':
14276 numnondigits = 2;
14277 result = PyNumber_ToBase(val, 16);
14278 break;
Victor Stinnerd0880d52012-04-27 23:40:13 +020014279 }
14280 if (!result)
14281 return NULL;
14282
14283 assert(unicode_modifiable(result));
14284 assert(PyUnicode_IS_READY(result));
14285 assert(PyUnicode_IS_ASCII(result));
14286
14287 /* To modify the string in-place, there can only be one reference. */
14288 if (Py_REFCNT(result) != 1) {
Christian Heimesd47802e2013-06-29 21:33:36 +020014289 Py_DECREF(result);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014290 PyErr_BadInternalCall();
14291 return NULL;
14292 }
14293 buf = PyUnicode_DATA(result);
14294 llen = PyUnicode_GET_LENGTH(result);
14295 if (llen > INT_MAX) {
Christian Heimesd47802e2013-06-29 21:33:36 +020014296 Py_DECREF(result);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014297 PyErr_SetString(PyExc_ValueError,
Ethan Furmanb95b5612015-01-23 20:05:18 -080014298 "string too large in _PyUnicode_FormatLong");
Victor Stinnerd0880d52012-04-27 23:40:13 +020014299 return NULL;
14300 }
14301 len = (int)llen;
14302 sign = buf[0] == '-';
14303 numnondigits += sign;
14304 numdigits = len - numnondigits;
14305 assert(numdigits > 0);
14306
14307 /* Get rid of base marker unless F_ALT */
Ethan Furmanb95b5612015-01-23 20:05:18 -080014308 if (((alt) == 0 &&
Victor Stinnerd0880d52012-04-27 23:40:13 +020014309 (type == 'o' || type == 'x' || type == 'X'))) {
14310 assert(buf[sign] == '0');
14311 assert(buf[sign+1] == 'x' || buf[sign+1] == 'X' ||
14312 buf[sign+1] == 'o');
14313 numnondigits -= 2;
14314 buf += 2;
14315 len -= 2;
14316 if (sign)
14317 buf[0] = '-';
14318 assert(len == numnondigits + numdigits);
14319 assert(numdigits > 0);
14320 }
14321
14322 /* Fill with leading zeroes to meet minimum width. */
14323 if (prec > numdigits) {
14324 PyObject *r1 = PyBytes_FromStringAndSize(NULL,
14325 numnondigits + prec);
14326 char *b1;
14327 if (!r1) {
14328 Py_DECREF(result);
14329 return NULL;
14330 }
14331 b1 = PyBytes_AS_STRING(r1);
14332 for (i = 0; i < numnondigits; ++i)
14333 *b1++ = *buf++;
14334 for (i = 0; i < prec - numdigits; i++)
14335 *b1++ = '0';
14336 for (i = 0; i < numdigits; i++)
14337 *b1++ = *buf++;
14338 *b1 = '\0';
14339 Py_DECREF(result);
14340 result = r1;
14341 buf = PyBytes_AS_STRING(result);
14342 len = numnondigits + prec;
14343 }
14344
14345 /* Fix up case for hex conversions. */
14346 if (type == 'X') {
14347 /* Need to convert all lower case letters to upper case.
14348 and need to convert 0x to 0X (and -0x to -0X). */
14349 for (i = 0; i < len; i++)
14350 if (buf[i] >= 'a' && buf[i] <= 'x')
14351 buf[i] -= 'a'-'A';
14352 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014353 if (!PyUnicode_Check(result)
14354 || buf != PyUnicode_DATA(result)) {
Victor Stinnerd0880d52012-04-27 23:40:13 +020014355 PyObject *unicode;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014356 unicode = _PyUnicode_FromASCII(buf, len);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014357 Py_DECREF(result);
14358 result = unicode;
14359 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014360 else if (len != PyUnicode_GET_LENGTH(result)) {
14361 if (PyUnicode_Resize(&result, len) < 0)
14362 Py_CLEAR(result);
14363 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000014364 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000014365}
14366
Ethan Furmandf3ed242014-01-05 06:50:30 -080014367/* Format an integer or a float as an integer.
Victor Stinner621ef3d2012-10-02 00:33:47 +020014368 * Return 1 if the number has been formatted into the writer,
Victor Stinnera47082312012-10-04 02:19:54 +020014369 * 0 if the number has been formatted into *p_output
Victor Stinner621ef3d2012-10-02 00:33:47 +020014370 * -1 and raise an exception on error */
14371static int
Victor Stinnera47082312012-10-04 02:19:54 +020014372mainformatlong(PyObject *v,
14373 struct unicode_format_arg_t *arg,
14374 PyObject **p_output,
14375 _PyUnicodeWriter *writer)
Victor Stinner621ef3d2012-10-02 00:33:47 +020014376{
14377 PyObject *iobj, *res;
Victor Stinnera47082312012-10-04 02:19:54 +020014378 char type = (char)arg->ch;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014379
14380 if (!PyNumber_Check(v))
14381 goto wrongtype;
14382
Ethan Furman9ab74802014-03-21 06:38:46 -070014383 /* make sure number is a type of integer for o, x, and X */
Victor Stinner621ef3d2012-10-02 00:33:47 +020014384 if (!PyLong_Check(v)) {
Ethan Furmandf3ed242014-01-05 06:50:30 -080014385 if (type == 'o' || type == 'x' || type == 'X') {
14386 iobj = PyNumber_Index(v);
14387 if (iobj == NULL) {
Ethan Furman9ab74802014-03-21 06:38:46 -070014388 if (PyErr_ExceptionMatches(PyExc_TypeError))
14389 goto wrongtype;
Ethan Furman38d872e2014-03-19 08:38:52 -070014390 return -1;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014391 }
14392 }
14393 else {
14394 iobj = PyNumber_Long(v);
14395 if (iobj == NULL ) {
14396 if (PyErr_ExceptionMatches(PyExc_TypeError))
14397 goto wrongtype;
14398 return -1;
14399 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014400 }
14401 assert(PyLong_Check(iobj));
14402 }
14403 else {
14404 iobj = v;
14405 Py_INCREF(iobj);
14406 }
14407
14408 if (PyLong_CheckExact(v)
Victor Stinnera47082312012-10-04 02:19:54 +020014409 && arg->width == -1 && arg->prec == -1
14410 && !(arg->flags & (F_SIGN | F_BLANK))
14411 && type != 'X')
Victor Stinner621ef3d2012-10-02 00:33:47 +020014412 {
14413 /* Fast path */
Victor Stinnera47082312012-10-04 02:19:54 +020014414 int alternate = arg->flags & F_ALT;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014415 int base;
14416
Victor Stinnera47082312012-10-04 02:19:54 +020014417 switch(type)
Victor Stinner621ef3d2012-10-02 00:33:47 +020014418 {
14419 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070014420 Py_UNREACHABLE();
Victor Stinner621ef3d2012-10-02 00:33:47 +020014421 case 'd':
14422 case 'i':
14423 case 'u':
14424 base = 10;
14425 break;
14426 case 'o':
14427 base = 8;
14428 break;
14429 case 'x':
14430 case 'X':
14431 base = 16;
14432 break;
14433 }
14434
Victor Stinnerc89d28f2012-10-02 12:54:07 +020014435 if (_PyLong_FormatWriter(writer, v, base, alternate) == -1) {
14436 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014437 return -1;
Victor Stinnerc89d28f2012-10-02 12:54:07 +020014438 }
14439 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014440 return 1;
14441 }
14442
Ethan Furmanb95b5612015-01-23 20:05:18 -080014443 res = _PyUnicode_FormatLong(iobj, arg->flags & F_ALT, arg->prec, type);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014444 Py_DECREF(iobj);
14445 if (res == NULL)
14446 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014447 *p_output = res;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014448 return 0;
14449
14450wrongtype:
Ethan Furman9ab74802014-03-21 06:38:46 -070014451 switch(type)
14452 {
14453 case 'o':
14454 case 'x':
14455 case 'X':
14456 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020014457 "%%%c format: an integer is required, "
14458 "not %.200s",
14459 type, Py_TYPE(v)->tp_name);
Ethan Furman9ab74802014-03-21 06:38:46 -070014460 break;
14461 default:
14462 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020014463 "%%%c format: a number is required, "
14464 "not %.200s",
14465 type, Py_TYPE(v)->tp_name);
Ethan Furman9ab74802014-03-21 06:38:46 -070014466 break;
14467 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014468 return -1;
14469}
14470
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014471static Py_UCS4
14472formatchar(PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014473{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000014474 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000014475 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014476 if (PyUnicode_GET_LENGTH(v) == 1) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014477 return PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000014478 }
Benjamin Peterson29060642009-01-31 22:14:21 +000014479 goto onError;
14480 }
14481 else {
Ethan Furmandf3ed242014-01-05 06:50:30 -080014482 PyObject *iobj;
Benjamin Peterson29060642009-01-31 22:14:21 +000014483 long x;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014484 /* make sure number is a type of integer */
14485 if (!PyLong_Check(v)) {
14486 iobj = PyNumber_Index(v);
14487 if (iobj == NULL) {
Ethan Furman38d872e2014-03-19 08:38:52 -070014488 goto onError;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014489 }
Xiang Zhangea1cf872016-12-22 15:30:47 +080014490 x = PyLong_AsLong(iobj);
Ethan Furmandf3ed242014-01-05 06:50:30 -080014491 Py_DECREF(iobj);
14492 }
Xiang Zhangea1cf872016-12-22 15:30:47 +080014493 else {
14494 x = PyLong_AsLong(v);
14495 }
Benjamin Peterson29060642009-01-31 22:14:21 +000014496 if (x == -1 && PyErr_Occurred())
14497 goto onError;
14498
Victor Stinner8faf8212011-12-08 22:14:11 +010014499 if (x < 0 || x > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014500 PyErr_SetString(PyExc_OverflowError,
14501 "%c arg not in range(0x110000)");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014502 return (Py_UCS4) -1;
Benjamin Peterson29060642009-01-31 22:14:21 +000014503 }
14504
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014505 return (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014506 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000014507
Benjamin Peterson29060642009-01-31 22:14:21 +000014508 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000014509 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000014510 "%c requires int or char");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014511 return (Py_UCS4) -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014512}
14513
Victor Stinnera47082312012-10-04 02:19:54 +020014514/* Parse options of an argument: flags, width, precision.
14515 Handle also "%(name)" syntax.
14516
14517 Return 0 if the argument has been formatted into arg->str.
14518 Return 1 if the argument has been written into ctx->writer,
14519 Raise an exception and return -1 on error. */
14520static int
14521unicode_format_arg_parse(struct unicode_formatter_t *ctx,
14522 struct unicode_format_arg_t *arg)
14523{
14524#define FORMAT_READ(ctx) \
14525 PyUnicode_READ((ctx)->fmtkind, (ctx)->fmtdata, (ctx)->fmtpos)
14526
14527 PyObject *v;
14528
Victor Stinnera47082312012-10-04 02:19:54 +020014529 if (arg->ch == '(') {
14530 /* Get argument value from a dictionary. Example: "%(name)s". */
14531 Py_ssize_t keystart;
14532 Py_ssize_t keylen;
14533 PyObject *key;
14534 int pcount = 1;
14535
14536 if (ctx->dict == NULL) {
14537 PyErr_SetString(PyExc_TypeError,
14538 "format requires a mapping");
14539 return -1;
14540 }
14541 ++ctx->fmtpos;
14542 --ctx->fmtcnt;
14543 keystart = ctx->fmtpos;
14544 /* Skip over balanced parentheses */
14545 while (pcount > 0 && --ctx->fmtcnt >= 0) {
14546 arg->ch = FORMAT_READ(ctx);
14547 if (arg->ch == ')')
14548 --pcount;
14549 else if (arg->ch == '(')
14550 ++pcount;
14551 ctx->fmtpos++;
14552 }
14553 keylen = ctx->fmtpos - keystart - 1;
14554 if (ctx->fmtcnt < 0 || pcount > 0) {
14555 PyErr_SetString(PyExc_ValueError,
14556 "incomplete format key");
14557 return -1;
14558 }
14559 key = PyUnicode_Substring(ctx->fmtstr,
14560 keystart, keystart + keylen);
14561 if (key == NULL)
14562 return -1;
14563 if (ctx->args_owned) {
Victor Stinnera47082312012-10-04 02:19:54 +020014564 ctx->args_owned = 0;
Serhiy Storchaka191321d2015-12-27 15:41:34 +020014565 Py_DECREF(ctx->args);
Victor Stinnera47082312012-10-04 02:19:54 +020014566 }
14567 ctx->args = PyObject_GetItem(ctx->dict, key);
14568 Py_DECREF(key);
14569 if (ctx->args == NULL)
14570 return -1;
14571 ctx->args_owned = 1;
14572 ctx->arglen = -1;
14573 ctx->argidx = -2;
14574 }
14575
14576 /* Parse flags. Example: "%+i" => flags=F_SIGN. */
Victor Stinnera47082312012-10-04 02:19:54 +020014577 while (--ctx->fmtcnt >= 0) {
14578 arg->ch = FORMAT_READ(ctx);
14579 ctx->fmtpos++;
14580 switch (arg->ch) {
14581 case '-': arg->flags |= F_LJUST; continue;
14582 case '+': arg->flags |= F_SIGN; continue;
14583 case ' ': arg->flags |= F_BLANK; continue;
14584 case '#': arg->flags |= F_ALT; continue;
14585 case '0': arg->flags |= F_ZERO; continue;
14586 }
14587 break;
14588 }
14589
14590 /* Parse width. Example: "%10s" => width=10 */
Victor Stinnera47082312012-10-04 02:19:54 +020014591 if (arg->ch == '*') {
14592 v = unicode_format_getnextarg(ctx);
14593 if (v == NULL)
14594 return -1;
14595 if (!PyLong_Check(v)) {
14596 PyErr_SetString(PyExc_TypeError,
14597 "* wants int");
14598 return -1;
14599 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020014600 arg->width = PyLong_AsSsize_t(v);
Victor Stinnera47082312012-10-04 02:19:54 +020014601 if (arg->width == -1 && PyErr_Occurred())
14602 return -1;
14603 if (arg->width < 0) {
14604 arg->flags |= F_LJUST;
14605 arg->width = -arg->width;
14606 }
14607 if (--ctx->fmtcnt >= 0) {
14608 arg->ch = FORMAT_READ(ctx);
14609 ctx->fmtpos++;
14610 }
14611 }
14612 else if (arg->ch >= '0' && arg->ch <= '9') {
14613 arg->width = arg->ch - '0';
14614 while (--ctx->fmtcnt >= 0) {
14615 arg->ch = FORMAT_READ(ctx);
14616 ctx->fmtpos++;
14617 if (arg->ch < '0' || arg->ch > '9')
14618 break;
14619 /* Since arg->ch is unsigned, the RHS would end up as unsigned,
14620 mixing signed and unsigned comparison. Since arg->ch is between
14621 '0' and '9', casting to int is safe. */
14622 if (arg->width > (PY_SSIZE_T_MAX - ((int)arg->ch - '0')) / 10) {
14623 PyErr_SetString(PyExc_ValueError,
14624 "width too big");
14625 return -1;
14626 }
14627 arg->width = arg->width*10 + (arg->ch - '0');
14628 }
14629 }
14630
14631 /* Parse precision. Example: "%.3f" => prec=3 */
Victor Stinnera47082312012-10-04 02:19:54 +020014632 if (arg->ch == '.') {
14633 arg->prec = 0;
14634 if (--ctx->fmtcnt >= 0) {
14635 arg->ch = FORMAT_READ(ctx);
14636 ctx->fmtpos++;
14637 }
14638 if (arg->ch == '*') {
14639 v = unicode_format_getnextarg(ctx);
14640 if (v == NULL)
14641 return -1;
14642 if (!PyLong_Check(v)) {
14643 PyErr_SetString(PyExc_TypeError,
14644 "* wants int");
14645 return -1;
14646 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020014647 arg->prec = _PyLong_AsInt(v);
Victor Stinnera47082312012-10-04 02:19:54 +020014648 if (arg->prec == -1 && PyErr_Occurred())
14649 return -1;
14650 if (arg->prec < 0)
14651 arg->prec = 0;
14652 if (--ctx->fmtcnt >= 0) {
14653 arg->ch = FORMAT_READ(ctx);
14654 ctx->fmtpos++;
14655 }
14656 }
14657 else if (arg->ch >= '0' && arg->ch <= '9') {
14658 arg->prec = arg->ch - '0';
14659 while (--ctx->fmtcnt >= 0) {
14660 arg->ch = FORMAT_READ(ctx);
14661 ctx->fmtpos++;
14662 if (arg->ch < '0' || arg->ch > '9')
14663 break;
14664 if (arg->prec > (INT_MAX - ((int)arg->ch - '0')) / 10) {
14665 PyErr_SetString(PyExc_ValueError,
Victor Stinner3921e902012-10-06 23:05:00 +020014666 "precision too big");
Victor Stinnera47082312012-10-04 02:19:54 +020014667 return -1;
14668 }
14669 arg->prec = arg->prec*10 + (arg->ch - '0');
14670 }
14671 }
14672 }
14673
14674 /* Ignore "h", "l" and "L" format prefix (ex: "%hi" or "%ls") */
14675 if (ctx->fmtcnt >= 0) {
14676 if (arg->ch == 'h' || arg->ch == 'l' || arg->ch == 'L') {
14677 if (--ctx->fmtcnt >= 0) {
14678 arg->ch = FORMAT_READ(ctx);
14679 ctx->fmtpos++;
14680 }
14681 }
14682 }
14683 if (ctx->fmtcnt < 0) {
14684 PyErr_SetString(PyExc_ValueError,
14685 "incomplete format");
14686 return -1;
14687 }
14688 return 0;
14689
14690#undef FORMAT_READ
14691}
14692
14693/* Format one argument. Supported conversion specifiers:
14694
14695 - "s", "r", "a": any type
Ethan Furmandf3ed242014-01-05 06:50:30 -080014696 - "i", "d", "u": int or float
14697 - "o", "x", "X": int
Victor Stinnera47082312012-10-04 02:19:54 +020014698 - "e", "E", "f", "F", "g", "G": float
14699 - "c": int or str (1 character)
14700
Victor Stinner8dbd4212012-12-04 09:30:24 +010014701 When possible, the output is written directly into the Unicode writer
14702 (ctx->writer). A string is created when padding is required.
14703
Victor Stinnera47082312012-10-04 02:19:54 +020014704 Return 0 if the argument has been formatted into *p_str,
14705 1 if the argument has been written into ctx->writer,
Victor Stinner8dbd4212012-12-04 09:30:24 +010014706 -1 on error. */
Victor Stinnera47082312012-10-04 02:19:54 +020014707static int
14708unicode_format_arg_format(struct unicode_formatter_t *ctx,
14709 struct unicode_format_arg_t *arg,
14710 PyObject **p_str)
14711{
14712 PyObject *v;
14713 _PyUnicodeWriter *writer = &ctx->writer;
14714
14715 if (ctx->fmtcnt == 0)
14716 ctx->writer.overallocate = 0;
14717
Victor Stinnera47082312012-10-04 02:19:54 +020014718 v = unicode_format_getnextarg(ctx);
14719 if (v == NULL)
14720 return -1;
14721
Victor Stinnera47082312012-10-04 02:19:54 +020014722
14723 switch (arg->ch) {
Victor Stinnera47082312012-10-04 02:19:54 +020014724 case 's':
14725 case 'r':
14726 case 'a':
14727 if (PyLong_CheckExact(v) && arg->width == -1 && arg->prec == -1) {
14728 /* Fast path */
14729 if (_PyLong_FormatWriter(writer, v, 10, arg->flags & F_ALT) == -1)
14730 return -1;
14731 return 1;
14732 }
14733
14734 if (PyUnicode_CheckExact(v) && arg->ch == 's') {
14735 *p_str = v;
14736 Py_INCREF(*p_str);
14737 }
14738 else {
14739 if (arg->ch == 's')
14740 *p_str = PyObject_Str(v);
14741 else if (arg->ch == 'r')
14742 *p_str = PyObject_Repr(v);
14743 else
14744 *p_str = PyObject_ASCII(v);
14745 }
14746 break;
14747
14748 case 'i':
14749 case 'd':
14750 case 'u':
14751 case 'o':
14752 case 'x':
14753 case 'X':
14754 {
14755 int ret = mainformatlong(v, arg, p_str, writer);
14756 if (ret != 0)
14757 return ret;
14758 arg->sign = 1;
14759 break;
14760 }
14761
14762 case 'e':
14763 case 'E':
14764 case 'f':
14765 case 'F':
14766 case 'g':
14767 case 'G':
14768 if (arg->width == -1 && arg->prec == -1
14769 && !(arg->flags & (F_SIGN | F_BLANK)))
14770 {
14771 /* Fast path */
14772 if (formatfloat(v, arg, NULL, writer) == -1)
14773 return -1;
14774 return 1;
14775 }
14776
14777 arg->sign = 1;
14778 if (formatfloat(v, arg, p_str, NULL) == -1)
14779 return -1;
14780 break;
14781
14782 case 'c':
14783 {
14784 Py_UCS4 ch = formatchar(v);
14785 if (ch == (Py_UCS4) -1)
14786 return -1;
14787 if (arg->width == -1 && arg->prec == -1) {
14788 /* Fast path */
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020014789 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0)
Victor Stinnera47082312012-10-04 02:19:54 +020014790 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014791 return 1;
14792 }
14793 *p_str = PyUnicode_FromOrdinal(ch);
14794 break;
14795 }
14796
14797 default:
14798 PyErr_Format(PyExc_ValueError,
14799 "unsupported format character '%c' (0x%x) "
Victor Stinnera33bce02014-07-04 22:47:46 +020014800 "at index %zd",
Victor Stinnera47082312012-10-04 02:19:54 +020014801 (31<=arg->ch && arg->ch<=126) ? (char)arg->ch : '?',
14802 (int)arg->ch,
14803 ctx->fmtpos - 1);
14804 return -1;
14805 }
14806 if (*p_str == NULL)
14807 return -1;
14808 assert (PyUnicode_Check(*p_str));
14809 return 0;
14810}
14811
14812static int
14813unicode_format_arg_output(struct unicode_formatter_t *ctx,
14814 struct unicode_format_arg_t *arg,
14815 PyObject *str)
14816{
14817 Py_ssize_t len;
14818 enum PyUnicode_Kind kind;
14819 void *pbuf;
14820 Py_ssize_t pindex;
14821 Py_UCS4 signchar;
14822 Py_ssize_t buflen;
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014823 Py_UCS4 maxchar;
Victor Stinnera47082312012-10-04 02:19:54 +020014824 Py_ssize_t sublen;
14825 _PyUnicodeWriter *writer = &ctx->writer;
14826 Py_UCS4 fill;
14827
14828 fill = ' ';
14829 if (arg->sign && arg->flags & F_ZERO)
14830 fill = '0';
14831
14832 if (PyUnicode_READY(str) == -1)
14833 return -1;
14834
14835 len = PyUnicode_GET_LENGTH(str);
14836 if ((arg->width == -1 || arg->width <= len)
14837 && (arg->prec == -1 || arg->prec >= len)
14838 && !(arg->flags & (F_SIGN | F_BLANK)))
14839 {
14840 /* Fast path */
14841 if (_PyUnicodeWriter_WriteStr(writer, str) == -1)
14842 return -1;
14843 return 0;
14844 }
14845
14846 /* Truncate the string for "s", "r" and "a" formats
14847 if the precision is set */
14848 if (arg->ch == 's' || arg->ch == 'r' || arg->ch == 'a') {
14849 if (arg->prec >= 0 && len > arg->prec)
14850 len = arg->prec;
14851 }
14852
14853 /* Adjust sign and width */
14854 kind = PyUnicode_KIND(str);
14855 pbuf = PyUnicode_DATA(str);
14856 pindex = 0;
14857 signchar = '\0';
14858 if (arg->sign) {
14859 Py_UCS4 ch = PyUnicode_READ(kind, pbuf, pindex);
14860 if (ch == '-' || ch == '+') {
14861 signchar = ch;
14862 len--;
14863 pindex++;
14864 }
14865 else if (arg->flags & F_SIGN)
14866 signchar = '+';
14867 else if (arg->flags & F_BLANK)
14868 signchar = ' ';
14869 else
14870 arg->sign = 0;
14871 }
14872 if (arg->width < len)
14873 arg->width = len;
14874
14875 /* Prepare the writer */
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014876 maxchar = writer->maxchar;
Victor Stinnera47082312012-10-04 02:19:54 +020014877 if (!(arg->flags & F_LJUST)) {
14878 if (arg->sign) {
14879 if ((arg->width-1) > len)
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014880 maxchar = Py_MAX(maxchar, fill);
Victor Stinnera47082312012-10-04 02:19:54 +020014881 }
14882 else {
14883 if (arg->width > len)
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014884 maxchar = Py_MAX(maxchar, fill);
Victor Stinnera47082312012-10-04 02:19:54 +020014885 }
14886 }
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014887 if (PyUnicode_MAX_CHAR_VALUE(str) > maxchar) {
14888 Py_UCS4 strmaxchar = _PyUnicode_FindMaxChar(str, 0, pindex+len);
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014889 maxchar = Py_MAX(maxchar, strmaxchar);
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014890 }
14891
Victor Stinnera47082312012-10-04 02:19:54 +020014892 buflen = arg->width;
14893 if (arg->sign && len == arg->width)
14894 buflen++;
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014895 if (_PyUnicodeWriter_Prepare(writer, buflen, maxchar) == -1)
Victor Stinnera47082312012-10-04 02:19:54 +020014896 return -1;
14897
14898 /* Write the sign if needed */
14899 if (arg->sign) {
14900 if (fill != ' ') {
14901 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
14902 writer->pos += 1;
14903 }
14904 if (arg->width > len)
14905 arg->width--;
14906 }
14907
14908 /* Write the numeric prefix for "x", "X" and "o" formats
14909 if the alternate form is used.
14910 For example, write "0x" for the "%#x" format. */
14911 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
14912 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
14913 assert(PyUnicode_READ(kind, pbuf, pindex + 1) == arg->ch);
14914 if (fill != ' ') {
14915 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
14916 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
14917 writer->pos += 2;
14918 pindex += 2;
14919 }
14920 arg->width -= 2;
14921 if (arg->width < 0)
14922 arg->width = 0;
14923 len -= 2;
14924 }
14925
14926 /* Pad left with the fill character if needed */
14927 if (arg->width > len && !(arg->flags & F_LJUST)) {
14928 sublen = arg->width - len;
Victor Stinner59423e32018-11-26 13:40:01 +010014929 unicode_fill(writer->kind, writer->data, fill, writer->pos, sublen);
Victor Stinnera47082312012-10-04 02:19:54 +020014930 writer->pos += sublen;
14931 arg->width = len;
14932 }
14933
14934 /* If padding with spaces: write sign if needed and/or numeric prefix if
14935 the alternate form is used */
14936 if (fill == ' ') {
14937 if (arg->sign) {
14938 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
14939 writer->pos += 1;
14940 }
14941 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
14942 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
14943 assert(PyUnicode_READ(kind, pbuf, pindex+1) == arg->ch);
14944 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
14945 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
14946 writer->pos += 2;
14947 pindex += 2;
14948 }
14949 }
14950
14951 /* Write characters */
14952 if (len) {
14953 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
14954 str, pindex, len);
14955 writer->pos += len;
14956 }
14957
14958 /* Pad right with the fill character if needed */
14959 if (arg->width > len) {
14960 sublen = arg->width - len;
Victor Stinner59423e32018-11-26 13:40:01 +010014961 unicode_fill(writer->kind, writer->data, ' ', writer->pos, sublen);
Victor Stinnera47082312012-10-04 02:19:54 +020014962 writer->pos += sublen;
14963 }
14964 return 0;
14965}
14966
14967/* Helper of PyUnicode_Format(): format one arg.
14968 Return 0 on success, raise an exception and return -1 on error. */
14969static int
14970unicode_format_arg(struct unicode_formatter_t *ctx)
14971{
14972 struct unicode_format_arg_t arg;
14973 PyObject *str;
14974 int ret;
14975
Victor Stinner8dbd4212012-12-04 09:30:24 +010014976 arg.ch = PyUnicode_READ(ctx->fmtkind, ctx->fmtdata, ctx->fmtpos);
Serhiy Storchaka9f8ad3f2017-03-08 05:51:19 +020014977 if (arg.ch == '%') {
14978 ctx->fmtpos++;
14979 ctx->fmtcnt--;
14980 if (_PyUnicodeWriter_WriteCharInline(&ctx->writer, '%') < 0)
14981 return -1;
14982 return 0;
14983 }
Victor Stinner8dbd4212012-12-04 09:30:24 +010014984 arg.flags = 0;
14985 arg.width = -1;
14986 arg.prec = -1;
14987 arg.sign = 0;
14988 str = NULL;
14989
Victor Stinnera47082312012-10-04 02:19:54 +020014990 ret = unicode_format_arg_parse(ctx, &arg);
14991 if (ret == -1)
14992 return -1;
14993
14994 ret = unicode_format_arg_format(ctx, &arg, &str);
14995 if (ret == -1)
14996 return -1;
14997
14998 if (ret != 1) {
14999 ret = unicode_format_arg_output(ctx, &arg, str);
15000 Py_DECREF(str);
15001 if (ret == -1)
15002 return -1;
15003 }
15004
Serhiy Storchaka9f8ad3f2017-03-08 05:51:19 +020015005 if (ctx->dict && (ctx->argidx < ctx->arglen)) {
Victor Stinnera47082312012-10-04 02:19:54 +020015006 PyErr_SetString(PyExc_TypeError,
15007 "not all arguments converted during string formatting");
15008 return -1;
15009 }
15010 return 0;
15011}
15012
Alexander Belopolsky40018472011-02-26 01:02:56 +000015013PyObject *
15014PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000015015{
Victor Stinnera47082312012-10-04 02:19:54 +020015016 struct unicode_formatter_t ctx;
Tim Petersced69f82003-09-16 20:30:58 +000015017
Guido van Rossumd57fd912000-03-10 22:53:23 +000015018 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000015019 PyErr_BadInternalCall();
15020 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015021 }
Victor Stinnera47082312012-10-04 02:19:54 +020015022
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030015023 if (ensure_unicode(format) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000015024 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030015025
15026 ctx.fmtstr = format;
Victor Stinnera47082312012-10-04 02:19:54 +020015027 ctx.fmtdata = PyUnicode_DATA(ctx.fmtstr);
15028 ctx.fmtkind = PyUnicode_KIND(ctx.fmtstr);
15029 ctx.fmtcnt = PyUnicode_GET_LENGTH(ctx.fmtstr);
15030 ctx.fmtpos = 0;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020015031
Victor Stinner8f674cc2013-04-17 23:02:17 +020015032 _PyUnicodeWriter_Init(&ctx.writer);
15033 ctx.writer.min_length = ctx.fmtcnt + 100;
15034 ctx.writer.overallocate = 1;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020015035
Guido van Rossumd57fd912000-03-10 22:53:23 +000015036 if (PyTuple_Check(args)) {
Victor Stinnera47082312012-10-04 02:19:54 +020015037 ctx.arglen = PyTuple_Size(args);
15038 ctx.argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015039 }
15040 else {
Victor Stinnera47082312012-10-04 02:19:54 +020015041 ctx.arglen = -1;
15042 ctx.argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015043 }
Victor Stinnera47082312012-10-04 02:19:54 +020015044 ctx.args_owned = 0;
Benjamin Peterson28a6cfa2012-08-28 17:55:35 -040015045 if (PyMapping_Check(args) && !PyTuple_Check(args) && !PyUnicode_Check(args))
Victor Stinnera47082312012-10-04 02:19:54 +020015046 ctx.dict = args;
15047 else
15048 ctx.dict = NULL;
15049 ctx.args = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015050
Victor Stinnera47082312012-10-04 02:19:54 +020015051 while (--ctx.fmtcnt >= 0) {
15052 if (PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
Victor Stinnercfc4c132013-04-03 01:48:39 +020015053 Py_ssize_t nonfmtpos;
Victor Stinnera47082312012-10-04 02:19:54 +020015054
15055 nonfmtpos = ctx.fmtpos++;
15056 while (ctx.fmtcnt >= 0 &&
15057 PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
15058 ctx.fmtpos++;
15059 ctx.fmtcnt--;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015060 }
Victor Stinnera47082312012-10-04 02:19:54 +020015061 if (ctx.fmtcnt < 0) {
15062 ctx.fmtpos--;
15063 ctx.writer.overallocate = 0;
Victor Stinnera0494432012-10-03 23:03:46 +020015064 }
Victor Stinneree4544c2012-05-09 22:24:08 +020015065
Victor Stinnercfc4c132013-04-03 01:48:39 +020015066 if (_PyUnicodeWriter_WriteSubstring(&ctx.writer, ctx.fmtstr,
15067 nonfmtpos, ctx.fmtpos) < 0)
15068 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015069 }
15070 else {
Victor Stinnera47082312012-10-04 02:19:54 +020015071 ctx.fmtpos++;
15072 if (unicode_format_arg(&ctx) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000015073 goto onError;
Victor Stinnera47082312012-10-04 02:19:54 +020015074 }
15075 }
Victor Stinneraff3cc62012-04-30 05:19:21 +020015076
Victor Stinnera47082312012-10-04 02:19:54 +020015077 if (ctx.argidx < ctx.arglen && !ctx.dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000015078 PyErr_SetString(PyExc_TypeError,
15079 "not all arguments converted during string formatting");
15080 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015081 }
15082
Victor Stinnera47082312012-10-04 02:19:54 +020015083 if (ctx.args_owned) {
15084 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000015085 }
Victor Stinnera47082312012-10-04 02:19:54 +020015086 return _PyUnicodeWriter_Finish(&ctx.writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +000015087
Benjamin Peterson29060642009-01-31 22:14:21 +000015088 onError:
Victor Stinnera47082312012-10-04 02:19:54 +020015089 _PyUnicodeWriter_Dealloc(&ctx.writer);
15090 if (ctx.args_owned) {
15091 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000015092 }
15093 return NULL;
15094}
15095
Jeremy Hylton938ace62002-07-17 16:30:39 +000015096static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000015097unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
15098
Tim Peters6d6c1a32001-08-02 04:15:00 +000015099static PyObject *
15100unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
15101{
Benjamin Peterson29060642009-01-31 22:14:21 +000015102 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015103 static char *kwlist[] = {"object", "encoding", "errors", 0};
15104 char *encoding = NULL;
15105 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000015106
Benjamin Peterson14339b62009-01-31 16:36:08 +000015107 if (type != &PyUnicode_Type)
15108 return unicode_subtype_new(type, args, kwds);
15109 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000015110 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000015111 return NULL;
15112 if (x == NULL)
Serhiy Storchaka678db842013-01-26 12:16:36 +020015113 _Py_RETURN_UNICODE_EMPTY();
Benjamin Peterson14339b62009-01-31 16:36:08 +000015114 if (encoding == NULL && errors == NULL)
15115 return PyObject_Str(x);
15116 else
Benjamin Peterson29060642009-01-31 22:14:21 +000015117 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000015118}
15119
Guido van Rossume023fe02001-08-30 03:12:59 +000015120static PyObject *
15121unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
15122{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015123 PyObject *unicode, *self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015124 Py_ssize_t length, char_size;
15125 int share_wstr, share_utf8;
15126 unsigned int kind;
15127 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000015128
Benjamin Peterson14339b62009-01-31 16:36:08 +000015129 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015130
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015131 unicode = unicode_new(&PyUnicode_Type, args, kwds);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015132 if (unicode == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000015133 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020015134 assert(_PyUnicode_CHECK(unicode));
Benjamin Petersonbac79492012-01-14 13:34:47 -050015135 if (PyUnicode_READY(unicode) == -1) {
Benjamin Peterson22a29702012-01-02 09:00:30 -060015136 Py_DECREF(unicode);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015137 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060015138 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015139
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015140 self = type->tp_alloc(type, 0);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015141 if (self == NULL) {
15142 Py_DECREF(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015143 return NULL;
15144 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015145 kind = PyUnicode_KIND(unicode);
15146 length = PyUnicode_GET_LENGTH(unicode);
15147
15148 _PyUnicode_LENGTH(self) = length;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020015149#ifdef Py_DEBUG
15150 _PyUnicode_HASH(self) = -1;
15151#else
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015152 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020015153#endif
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015154 _PyUnicode_STATE(self).interned = 0;
15155 _PyUnicode_STATE(self).kind = kind;
15156 _PyUnicode_STATE(self).compact = 0;
Victor Stinner3cf46372011-10-03 14:42:15 +020015157 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015158 _PyUnicode_STATE(self).ready = 1;
15159 _PyUnicode_WSTR(self) = NULL;
15160 _PyUnicode_UTF8_LENGTH(self) = 0;
15161 _PyUnicode_UTF8(self) = NULL;
15162 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020015163 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015164
15165 share_utf8 = 0;
15166 share_wstr = 0;
15167 if (kind == PyUnicode_1BYTE_KIND) {
15168 char_size = 1;
15169 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
15170 share_utf8 = 1;
15171 }
15172 else if (kind == PyUnicode_2BYTE_KIND) {
15173 char_size = 2;
15174 if (sizeof(wchar_t) == 2)
15175 share_wstr = 1;
15176 }
15177 else {
15178 assert(kind == PyUnicode_4BYTE_KIND);
15179 char_size = 4;
15180 if (sizeof(wchar_t) == 4)
15181 share_wstr = 1;
15182 }
15183
15184 /* Ensure we won't overflow the length. */
15185 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
15186 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015187 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015188 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015189 data = PyObject_MALLOC((length + 1) * char_size);
15190 if (data == NULL) {
15191 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015192 goto onError;
15193 }
15194
Victor Stinnerc3c74152011-10-02 20:39:55 +020015195 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015196 if (share_utf8) {
15197 _PyUnicode_UTF8_LENGTH(self) = length;
15198 _PyUnicode_UTF8(self) = data;
15199 }
15200 if (share_wstr) {
15201 _PyUnicode_WSTR_LENGTH(self) = length;
15202 _PyUnicode_WSTR(self) = (wchar_t *)data;
15203 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015204
Christian Heimesf051e432016-09-13 20:22:02 +020015205 memcpy(data, PyUnicode_DATA(unicode),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020015206 kind * (length + 1));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020015207 assert(_PyUnicode_CheckConsistency(self, 1));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020015208#ifdef Py_DEBUG
15209 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
15210#endif
Victor Stinnerdd18d3a2011-10-22 11:08:10 +020015211 Py_DECREF(unicode);
Victor Stinner7931d9a2011-11-04 00:22:48 +010015212 return self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015213
15214onError:
15215 Py_DECREF(unicode);
15216 Py_DECREF(self);
15217 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000015218}
15219
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000015220PyDoc_STRVAR(unicode_doc,
Chris Jerdonek83fe2e12012-10-07 14:48:36 -070015221"str(object='') -> str\n\
15222str(bytes_or_buffer[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +000015223\n\
Nick Coghlan573b1fd2012-08-16 14:13:07 +100015224Create a new string object from the given object. If encoding or\n\
15225errors is specified, then the object must expose a data buffer\n\
15226that will be decoded using the given encoding and error handler.\n\
15227Otherwise, returns the result of object.__str__() (if defined)\n\
15228or repr(object).\n\
15229encoding defaults to sys.getdefaultencoding().\n\
15230errors defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000015231
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015232static PyObject *unicode_iter(PyObject *seq);
15233
Guido van Rossumd57fd912000-03-10 22:53:23 +000015234PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +000015235 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Bupfc93bd42018-06-19 03:59:55 -050015236 "str", /* tp_name */
15237 sizeof(PyUnicodeObject), /* tp_basicsize */
15238 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +000015239 /* Slots */
Bupfc93bd42018-06-19 03:59:55 -050015240 (destructor)unicode_dealloc, /* tp_dealloc */
Jeroen Demeyer530f5062019-05-31 04:13:39 +020015241 0, /* tp_vectorcall_offset */
Bupfc93bd42018-06-19 03:59:55 -050015242 0, /* tp_getattr */
15243 0, /* tp_setattr */
Jeroen Demeyer530f5062019-05-31 04:13:39 +020015244 0, /* tp_as_async */
Bupfc93bd42018-06-19 03:59:55 -050015245 unicode_repr, /* tp_repr */
15246 &unicode_as_number, /* tp_as_number */
15247 &unicode_as_sequence, /* tp_as_sequence */
15248 &unicode_as_mapping, /* tp_as_mapping */
15249 (hashfunc) unicode_hash, /* tp_hash*/
15250 0, /* tp_call*/
15251 (reprfunc) unicode_str, /* tp_str */
15252 PyObject_GenericGetAttr, /* tp_getattro */
15253 0, /* tp_setattro */
15254 0, /* tp_as_buffer */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015255 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Bupfc93bd42018-06-19 03:59:55 -050015256 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
15257 unicode_doc, /* tp_doc */
15258 0, /* tp_traverse */
15259 0, /* tp_clear */
15260 PyUnicode_RichCompare, /* tp_richcompare */
15261 0, /* tp_weaklistoffset */
15262 unicode_iter, /* tp_iter */
15263 0, /* tp_iternext */
15264 unicode_methods, /* tp_methods */
15265 0, /* tp_members */
15266 0, /* tp_getset */
15267 &PyBaseObject_Type, /* tp_base */
15268 0, /* tp_dict */
15269 0, /* tp_descr_get */
15270 0, /* tp_descr_set */
15271 0, /* tp_dictoffset */
15272 0, /* tp_init */
15273 0, /* tp_alloc */
15274 unicode_new, /* tp_new */
15275 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +000015276};
15277
15278/* Initialize the Unicode implementation */
15279
Victor Stinner331a6a52019-05-27 16:39:22 +020015280PyStatus
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015281_PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000015282{
Thomas Wouters477c8d52006-05-27 19:21:47 +000015283 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015284 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000015285 0x000A, /* LINE FEED */
15286 0x000D, /* CARRIAGE RETURN */
15287 0x001C, /* FILE SEPARATOR */
15288 0x001D, /* GROUP SEPARATOR */
15289 0x001E, /* RECORD SEPARATOR */
15290 0x0085, /* NEXT LINE */
15291 0x2028, /* LINE SEPARATOR */
15292 0x2029, /* PARAGRAPH SEPARATOR */
15293 };
15294
Fred Drakee4315f52000-05-09 19:53:39 +000015295 /* Init the implementation */
Serhiy Storchaka678db842013-01-26 12:16:36 +020015296 _Py_INCREF_UNICODE_EMPTY();
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015297 if (!unicode_empty) {
Victor Stinner331a6a52019-05-27 16:39:22 +020015298 return _PyStatus_ERR("Can't create empty string");
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015299 }
Serhiy Storchaka678db842013-01-26 12:16:36 +020015300 Py_DECREF(unicode_empty);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000015301
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015302 if (PyType_Ready(&PyUnicode_Type) < 0) {
Victor Stinner331a6a52019-05-27 16:39:22 +020015303 return _PyStatus_ERR("Can't initialize unicode type");
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015304 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000015305
15306 /* initialize the linebreak bloom filter */
15307 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015308 PyUnicode_2BYTE_KIND, linebreak,
Victor Stinner63941882011-09-29 00:42:28 +020015309 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000015310
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015311 if (PyType_Ready(&EncodingMapType) < 0) {
Victor Stinner331a6a52019-05-27 16:39:22 +020015312 return _PyStatus_ERR("Can't initialize encoding map type");
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015313 }
15314 if (PyType_Ready(&PyFieldNameIter_Type) < 0) {
Victor Stinner331a6a52019-05-27 16:39:22 +020015315 return _PyStatus_ERR("Can't initialize field name iterator type");
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015316 }
15317 if (PyType_Ready(&PyFormatterIter_Type) < 0) {
Victor Stinner331a6a52019-05-27 16:39:22 +020015318 return _PyStatus_ERR("Can't initialize formatter iter type");
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015319 }
Victor Stinner331a6a52019-05-27 16:39:22 +020015320 return _PyStatus_OK();
Guido van Rossumd57fd912000-03-10 22:53:23 +000015321}
15322
15323/* Finalize the Unicode implementation */
15324
/* This implementation does nothing here: the function unconditionally
   returns 0. */
int
PyUnicode_ClearFreeList(void)
{
    const int freed = 0;
    return freed;
}
15330
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000015331
Walter Dörwald16807132007-05-25 13:52:07 +000015332void
15333PyUnicode_InternInPlace(PyObject **p)
15334{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020015335 PyObject *s = *p;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015336 PyObject *t;
Victor Stinner4fae54c2011-10-03 02:01:52 +020015337#ifdef Py_DEBUG
15338 assert(s != NULL);
15339 assert(_PyUnicode_CHECK(s));
15340#else
Benjamin Peterson14339b62009-01-31 16:36:08 +000015341 if (s == NULL || !PyUnicode_Check(s))
Victor Stinner4fae54c2011-10-03 02:01:52 +020015342 return;
15343#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +000015344 /* If it's a subclass, we don't really know what putting
15345 it in the interned dict might do. */
15346 if (!PyUnicode_CheckExact(s))
15347 return;
15348 if (PyUnicode_CHECK_INTERNED(s))
15349 return;
15350 if (interned == NULL) {
15351 interned = PyDict_New();
15352 if (interned == NULL) {
15353 PyErr_Clear(); /* Don't leave an exception */
15354 return;
15355 }
15356 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000015357 Py_ALLOW_RECURSION
Berker Peksagced8d4c2016-07-25 04:40:39 +030015358 t = PyDict_SetDefault(interned, s, s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015359 Py_END_ALLOW_RECURSION
Berker Peksagced8d4c2016-07-25 04:40:39 +030015360 if (t == NULL) {
15361 PyErr_Clear();
15362 return;
15363 }
15364 if (t != s) {
Victor Stinnerf0335102013-04-14 19:13:03 +020015365 Py_INCREF(t);
Serhiy Storchaka57a01d32016-04-10 18:05:40 +030015366 Py_SETREF(*p, t);
Victor Stinnerf0335102013-04-14 19:13:03 +020015367 return;
15368 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000015369 /* The two references in interned are not counted by refcnt.
15370 The deallocator will take care of this */
15371 Py_REFCNT(s) -= 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015372 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000015373}
15374
15375void
15376PyUnicode_InternImmortal(PyObject **p)
15377{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015378 PyUnicode_InternInPlace(p);
15379 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Victor Stinneraf9e4b82011-10-23 20:07:00 +020015380 _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015381 Py_INCREF(*p);
15382 }
Walter Dörwald16807132007-05-25 13:52:07 +000015383}
15384
15385PyObject *
15386PyUnicode_InternFromString(const char *cp)
15387{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015388 PyObject *s = PyUnicode_FromString(cp);
15389 if (s == NULL)
15390 return NULL;
15391 PyUnicode_InternInPlace(&s);
15392 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000015393}
15394
Victor Stinnerfecc4f22019-03-19 14:20:29 +010015395
#if defined(WITH_VALGRIND) || defined(__INSURE__)
/* Give every interned string back the references that interning removed
   from its count, reset its interned state, then clear and release the
   interned dict.  Only compiled for Valgrind / Insure++ builds. */
static void
unicode_release_interned(void)
{
    PyObject *keys;
    PyObject *str;
    Py_ssize_t idx, count;
    Py_ssize_t immortal_size = 0, mortal_size = 0;

    if (interned == NULL || !PyDict_Check(interned)) {
        return;
    }
    keys = PyDict_Keys(interned);
    if (keys == NULL || !PyList_Check(keys)) {
        PyErr_Clear();
        return;
    }

    /* Since unicode_release_interned() is intended to help a leak
       detector, interned unicode strings are not forcibly deallocated;
       rather, we give them their stolen references back, and then clear
       and DECREF the interned dict. */

    count = PyList_GET_SIZE(keys);
#ifdef INTERNED_STATS
    fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
            count);
#endif
    for (idx = 0; idx < count; idx++) {
        str = PyList_GET_ITEM(keys, idx);
        if (PyUnicode_READY(str) == -1) {
            Py_UNREACHABLE();
        }
        switch (PyUnicode_CHECK_INTERNED(str)) {
        case SSTATE_INTERNED_MORTAL:
            Py_REFCNT(str) += 2;
            mortal_size += PyUnicode_GET_LENGTH(str);
            break;
        case SSTATE_INTERNED_IMMORTAL:
            Py_REFCNT(str) += 1;
            immortal_size += PyUnicode_GET_LENGTH(str);
            break;
        case SSTATE_NOT_INTERNED:
            /* XXX Shouldn't happen */
            break;
        default:
            Py_FatalError("Inconsistent interned string state.");
        }
        _PyUnicode_STATE(str).interned = SSTATE_NOT_INTERNED;
    }
#ifdef INTERNED_STATS
    fprintf(stderr, "total size of all interned strings: "
            "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
            "mortal/immortal\n", mortal_size, immortal_size);
#endif
    Py_DECREF(keys);
    PyDict_Clear(interned);
    Py_CLEAR(interned);
}
#endif
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015455
15456
15457/********************* Unicode Iterator **************************/
15458
15459typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015460 PyObject_HEAD
15461 Py_ssize_t it_index;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015462 PyObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015463} unicodeiterobject;
15464
15465static void
15466unicodeiter_dealloc(unicodeiterobject *it)
15467{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015468 _PyObject_GC_UNTRACK(it);
15469 Py_XDECREF(it->it_seq);
15470 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015471}
15472
15473static int
15474unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
15475{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015476 Py_VISIT(it->it_seq);
15477 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015478}
15479
15480static PyObject *
15481unicodeiter_next(unicodeiterobject *it)
15482{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015483 PyObject *seq, *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015484
Benjamin Peterson14339b62009-01-31 16:36:08 +000015485 assert(it != NULL);
15486 seq = it->it_seq;
15487 if (seq == NULL)
15488 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020015489 assert(_PyUnicode_CHECK(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015490
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015491 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
15492 int kind = PyUnicode_KIND(seq);
15493 void *data = PyUnicode_DATA(seq);
15494 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
15495 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015496 if (item != NULL)
15497 ++it->it_index;
15498 return item;
15499 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015500
Benjamin Peterson14339b62009-01-31 16:36:08 +000015501 it->it_seq = NULL;
Serhiy Storchakafbb1c5e2016-03-30 20:40:02 +030015502 Py_DECREF(seq);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015503 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015504}
15505
15506static PyObject *
Siddhesh Poyarekar55edd0c2018-04-30 00:29:33 +053015507unicodeiter_len(unicodeiterobject *it, PyObject *Py_UNUSED(ignored))
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015508{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015509 Py_ssize_t len = 0;
15510 if (it->it_seq)
Victor Stinnerc4f281e2011-10-11 22:11:42 +020015511 len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015512 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015513}
15514
15515PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
15516
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015517static PyObject *
Siddhesh Poyarekar55edd0c2018-04-30 00:29:33 +053015518unicodeiter_reduce(unicodeiterobject *it, PyObject *Py_UNUSED(ignored))
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015519{
Serhiy Storchakabb86bf42018-12-11 08:28:18 +020015520 _Py_IDENTIFIER(iter);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015521 if (it->it_seq != NULL) {
Serhiy Storchakabb86bf42018-12-11 08:28:18 +020015522 return Py_BuildValue("N(O)n", _PyEval_GetBuiltinId(&PyId_iter),
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015523 it->it_seq, it->it_index);
15524 } else {
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +020015525 PyObject *u = (PyObject *)_PyUnicode_New(0);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015526 if (u == NULL)
15527 return NULL;
Serhiy Storchakabb86bf42018-12-11 08:28:18 +020015528 return Py_BuildValue("N(N)", _PyEval_GetBuiltinId(&PyId_iter), u);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015529 }
15530}
15531
15532PyDoc_STRVAR(reduce_doc, "Return state information for pickling.");
15533
15534static PyObject *
15535unicodeiter_setstate(unicodeiterobject *it, PyObject *state)
15536{
15537 Py_ssize_t index = PyLong_AsSsize_t(state);
15538 if (index == -1 && PyErr_Occurred())
15539 return NULL;
Kristján Valur Jónsson25dded02014-03-05 13:47:57 +000015540 if (it->it_seq != NULL) {
15541 if (index < 0)
15542 index = 0;
15543 else if (index > PyUnicode_GET_LENGTH(it->it_seq))
15544 index = PyUnicode_GET_LENGTH(it->it_seq); /* iterator truncated */
15545 it->it_index = index;
15546 }
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015547 Py_RETURN_NONE;
15548}
15549
15550PyDoc_STRVAR(setstate_doc, "Set state information for unpickling.");
15551
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015552static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015553 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000015554 length_hint_doc},
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015555 {"__reduce__", (PyCFunction)unicodeiter_reduce, METH_NOARGS,
15556 reduce_doc},
15557 {"__setstate__", (PyCFunction)unicodeiter_setstate, METH_O,
15558 setstate_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000015559 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015560};
15561
15562PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015563 PyVarObject_HEAD_INIT(&PyType_Type, 0)
15564 "str_iterator", /* tp_name */
15565 sizeof(unicodeiterobject), /* tp_basicsize */
15566 0, /* tp_itemsize */
15567 /* methods */
15568 (destructor)unicodeiter_dealloc, /* tp_dealloc */
Jeroen Demeyer530f5062019-05-31 04:13:39 +020015569 0, /* tp_vectorcall_offset */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015570 0, /* tp_getattr */
15571 0, /* tp_setattr */
Jeroen Demeyer530f5062019-05-31 04:13:39 +020015572 0, /* tp_as_async */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015573 0, /* tp_repr */
15574 0, /* tp_as_number */
15575 0, /* tp_as_sequence */
15576 0, /* tp_as_mapping */
15577 0, /* tp_hash */
15578 0, /* tp_call */
15579 0, /* tp_str */
15580 PyObject_GenericGetAttr, /* tp_getattro */
15581 0, /* tp_setattro */
15582 0, /* tp_as_buffer */
15583 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
15584 0, /* tp_doc */
15585 (traverseproc)unicodeiter_traverse, /* tp_traverse */
15586 0, /* tp_clear */
15587 0, /* tp_richcompare */
15588 0, /* tp_weaklistoffset */
15589 PyObject_SelfIter, /* tp_iter */
15590 (iternextfunc)unicodeiter_next, /* tp_iternext */
15591 unicodeiter_methods, /* tp_methods */
15592 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015593};
15594
15595static PyObject *
15596unicode_iter(PyObject *seq)
15597{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015598 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015599
Benjamin Peterson14339b62009-01-31 16:36:08 +000015600 if (!PyUnicode_Check(seq)) {
15601 PyErr_BadInternalCall();
15602 return NULL;
15603 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015604 if (PyUnicode_READY(seq) == -1)
15605 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015606 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
15607 if (it == NULL)
15608 return NULL;
15609 it->it_index = 0;
15610 Py_INCREF(seq);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015611 it->it_seq = seq;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015612 _PyObject_GC_TRACK(it);
15613 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015614}
15615
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015616
15617size_t
15618Py_UNICODE_strlen(const Py_UNICODE *u)
15619{
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +020015620 return wcslen(u);
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015621}
15622
15623Py_UNICODE*
15624Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
15625{
15626 Py_UNICODE *u = s1;
15627 while ((*u++ = *s2++));
15628 return s1;
15629}
15630
15631Py_UNICODE*
15632Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
15633{
15634 Py_UNICODE *u = s1;
15635 while ((*u++ = *s2++))
15636 if (n-- == 0)
15637 break;
15638 return s1;
15639}
15640
15641Py_UNICODE*
15642Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
15643{
15644 Py_UNICODE *u1 = s1;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +020015645 u1 += wcslen(u1);
15646 while ((*u1++ = *s2++));
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015647 return s1;
15648}
15649
15650int
15651Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
15652{
15653 while (*s1 && *s2 && *s1 == *s2)
15654 s1++, s2++;
15655 if (*s1 && *s2)
15656 return (*s1 < *s2) ? -1 : +1;
15657 if (*s1)
15658 return 1;
15659 if (*s2)
15660 return -1;
15661 return 0;
15662}
15663
15664int
15665Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
15666{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020015667 Py_UNICODE u1, u2;
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015668 for (; n != 0; n--) {
15669 u1 = *s1;
15670 u2 = *s2;
15671 if (u1 != u2)
15672 return (u1 < u2) ? -1 : +1;
15673 if (u1 == '\0')
15674 return 0;
15675 s1++;
15676 s2++;
15677 }
15678 return 0;
15679}
15680
15681Py_UNICODE*
15682Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
15683{
15684 const Py_UNICODE *p;
15685 for (p = s; *p; p++)
15686 if (*p == c)
15687 return (Py_UNICODE*)p;
15688 return NULL;
15689}
15690
15691Py_UNICODE*
15692Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
15693{
15694 const Py_UNICODE *p;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +020015695 p = s + wcslen(s);
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015696 while (p != s) {
15697 p--;
15698 if (*p == c)
15699 return (Py_UNICODE*)p;
15700 }
15701 return NULL;
15702}
Victor Stinner331ea922010-08-10 16:37:20 +000015703
Victor Stinner71133ff2010-09-01 23:43:53 +000015704Py_UNICODE*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015705PyUnicode_AsUnicodeCopy(PyObject *unicode)
Victor Stinner71133ff2010-09-01 23:43:53 +000015706{
Victor Stinner577db2c2011-10-11 22:12:48 +020015707 Py_UNICODE *u, *copy;
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015708 Py_ssize_t len, size;
Victor Stinner71133ff2010-09-01 23:43:53 +000015709
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015710 if (!PyUnicode_Check(unicode)) {
15711 PyErr_BadArgument();
15712 return NULL;
15713 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015714 u = PyUnicode_AsUnicodeAndSize(unicode, &len);
Victor Stinner577db2c2011-10-11 22:12:48 +020015715 if (u == NULL)
15716 return NULL;
Victor Stinner71133ff2010-09-01 23:43:53 +000015717 /* Ensure we won't overflow the size. */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -070015718 if (len > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
Victor Stinner71133ff2010-09-01 23:43:53 +000015719 PyErr_NoMemory();
15720 return NULL;
15721 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015722 size = len + 1; /* copy the null character */
Victor Stinner71133ff2010-09-01 23:43:53 +000015723 size *= sizeof(Py_UNICODE);
15724 copy = PyMem_Malloc(size);
15725 if (copy == NULL) {
15726 PyErr_NoMemory();
15727 return NULL;
15728 }
Victor Stinner577db2c2011-10-11 22:12:48 +020015729 memcpy(copy, u, size);
Victor Stinner71133ff2010-09-01 23:43:53 +000015730 return copy;
15731}
Martin v. Löwis5b222132007-06-10 09:51:05 +000015732
Victor Stinnerfecc4f22019-03-19 14:20:29 +010015733
Victor Stinner709d23d2019-05-02 14:56:30 -040015734static int
15735encode_wstr_utf8(wchar_t *wstr, char **str, const char *name)
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015736{
Victor Stinner709d23d2019-05-02 14:56:30 -040015737 int res;
15738 res = _Py_EncodeUTF8Ex(wstr, str, NULL, NULL, 1, _Py_ERROR_STRICT);
15739 if (res == -2) {
15740 PyErr_Format(PyExc_RuntimeWarning, "cannot decode %s", name);
15741 return -1;
15742 }
15743 if (res < 0) {
15744 PyErr_NoMemory();
15745 return -1;
15746 }
15747 return 0;
15748}
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015749
Victor Stinner709d23d2019-05-02 14:56:30 -040015750
15751static int
15752config_get_codec_name(wchar_t **config_encoding)
15753{
15754 char *encoding;
15755 if (encode_wstr_utf8(*config_encoding, &encoding, "stdio_encoding") < 0) {
15756 return -1;
15757 }
15758
15759 PyObject *name_obj = NULL;
15760 PyObject *codec = _PyCodec_Lookup(encoding);
15761 PyMem_RawFree(encoding);
15762
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015763 if (!codec)
15764 goto error;
15765
15766 name_obj = PyObject_GetAttrString(codec, "name");
15767 Py_CLEAR(codec);
15768 if (!name_obj) {
15769 goto error;
15770 }
15771
Victor Stinner709d23d2019-05-02 14:56:30 -040015772 wchar_t *wname = PyUnicode_AsWideCharString(name_obj, NULL);
15773 Py_DECREF(name_obj);
15774 if (wname == NULL) {
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015775 goto error;
15776 }
15777
Victor Stinner709d23d2019-05-02 14:56:30 -040015778 wchar_t *raw_wname = _PyMem_RawWcsdup(wname);
15779 if (raw_wname == NULL) {
15780 PyMem_Free(wname);
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015781 PyErr_NoMemory();
Victor Stinner709d23d2019-05-02 14:56:30 -040015782 goto error;
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015783 }
Victor Stinner709d23d2019-05-02 14:56:30 -040015784
15785 PyMem_RawFree(*config_encoding);
15786 *config_encoding = raw_wname;
15787
15788 PyMem_Free(wname);
15789 return 0;
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015790
15791error:
15792 Py_XDECREF(codec);
15793 Py_XDECREF(name_obj);
Victor Stinner709d23d2019-05-02 14:56:30 -040015794 return -1;
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015795}
15796
15797
Victor Stinner331a6a52019-05-27 16:39:22 +020015798static PyStatus
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015799init_stdio_encoding(PyInterpreterState *interp)
15800{
Victor Stinner709d23d2019-05-02 14:56:30 -040015801 /* Update the stdio encoding to the normalized Python codec name. */
Victor Stinner331a6a52019-05-27 16:39:22 +020015802 PyConfig *config = &interp->config;
Victor Stinner709d23d2019-05-02 14:56:30 -040015803 if (config_get_codec_name(&config->stdio_encoding) < 0) {
Victor Stinner331a6a52019-05-27 16:39:22 +020015804 return _PyStatus_ERR("failed to get the Python codec name "
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015805 "of the stdio encoding");
15806 }
Victor Stinner331a6a52019-05-27 16:39:22 +020015807 return _PyStatus_OK();
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015808}
15809
15810
Victor Stinner709d23d2019-05-02 14:56:30 -040015811static int
15812init_fs_codec(PyInterpreterState *interp)
15813{
Victor Stinner331a6a52019-05-27 16:39:22 +020015814 PyConfig *config = &interp->config;
Victor Stinner709d23d2019-05-02 14:56:30 -040015815
15816 _Py_error_handler error_handler;
15817 error_handler = get_error_handler_wide(config->filesystem_errors);
15818 if (error_handler == _Py_ERROR_UNKNOWN) {
15819 PyErr_SetString(PyExc_RuntimeError, "unknow filesystem error handler");
15820 return -1;
15821 }
15822
15823 char *encoding, *errors;
15824 if (encode_wstr_utf8(config->filesystem_encoding,
15825 &encoding,
15826 "filesystem_encoding") < 0) {
15827 return -1;
15828 }
15829
15830 if (encode_wstr_utf8(config->filesystem_errors,
15831 &errors,
15832 "filesystem_errors") < 0) {
15833 PyMem_RawFree(encoding);
15834 return -1;
15835 }
15836
15837 PyMem_RawFree(interp->fs_codec.encoding);
15838 interp->fs_codec.encoding = encoding;
15839 PyMem_RawFree(interp->fs_codec.errors);
15840 interp->fs_codec.errors = errors;
15841 interp->fs_codec.error_handler = error_handler;
15842
15843 /* At this point, PyUnicode_EncodeFSDefault() and
15844 PyUnicode_DecodeFSDefault() can now use the Python codec rather than
15845 the C implementation of the filesystem encoding. */
15846
15847 /* Set Py_FileSystemDefaultEncoding and Py_FileSystemDefaultEncodeErrors
15848 global configuration variables. */
15849 if (_Py_SetFileSystemEncoding(interp->fs_codec.encoding,
15850 interp->fs_codec.errors) < 0) {
15851 PyErr_NoMemory();
15852 return -1;
15853 }
15854 return 0;
15855}
15856
15857
Victor Stinner331a6a52019-05-27 16:39:22 +020015858static PyStatus
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015859init_fs_encoding(PyInterpreterState *interp)
15860{
Victor Stinner709d23d2019-05-02 14:56:30 -040015861 /* Update the filesystem encoding to the normalized Python codec name.
15862 For example, replace "ANSI_X3.4-1968" (locale encoding) with "ascii"
15863 (Python codec name). */
Victor Stinner331a6a52019-05-27 16:39:22 +020015864 PyConfig *config = &interp->config;
Victor Stinner709d23d2019-05-02 14:56:30 -040015865 if (config_get_codec_name(&config->filesystem_encoding) < 0) {
Victor Stinner331a6a52019-05-27 16:39:22 +020015866 return _PyStatus_ERR("failed to get the Python codec "
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015867 "of the filesystem encoding");
15868 }
15869
Victor Stinner709d23d2019-05-02 14:56:30 -040015870 if (init_fs_codec(interp) < 0) {
Victor Stinner331a6a52019-05-27 16:39:22 +020015871 return _PyStatus_ERR("cannot initialize filesystem codec");
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015872 }
Victor Stinner331a6a52019-05-27 16:39:22 +020015873 return _PyStatus_OK();
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015874}
15875
15876
Victor Stinner331a6a52019-05-27 16:39:22 +020015877PyStatus
Victor Stinnerb45d2592019-06-20 00:05:23 +020015878_PyUnicode_InitEncodings(PyThreadState *tstate)
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015879{
Victor Stinnerb45d2592019-06-20 00:05:23 +020015880 PyInterpreterState *interp = tstate->interp;
15881
Victor Stinner331a6a52019-05-27 16:39:22 +020015882 PyStatus status = init_fs_encoding(interp);
15883 if (_PyStatus_EXCEPTION(status)) {
15884 return status;
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015885 }
15886
15887 return init_stdio_encoding(interp);
15888}
15889
15890
Victor Stinner709d23d2019-05-02 14:56:30 -040015891#ifdef MS_WINDOWS
15892int
15893_PyUnicode_EnableLegacyWindowsFSEncoding(void)
15894{
15895 PyInterpreterState *interp = _PyInterpreterState_GET_UNSAFE();
Victor Stinner331a6a52019-05-27 16:39:22 +020015896 PyConfig *config = &interp->config;
Victor Stinner709d23d2019-05-02 14:56:30 -040015897
15898 /* Set the filesystem encoding to mbcs/replace (PEP 529) */
15899 wchar_t *encoding = _PyMem_RawWcsdup(L"mbcs");
15900 wchar_t *errors = _PyMem_RawWcsdup(L"replace");
15901 if (encoding == NULL || errors == NULL) {
15902 PyMem_RawFree(encoding);
15903 PyMem_RawFree(errors);
15904 PyErr_NoMemory();
15905 return -1;
15906 }
15907
15908 PyMem_RawFree(config->filesystem_encoding);
15909 config->filesystem_encoding = encoding;
15910 PyMem_RawFree(config->filesystem_errors);
15911 config->filesystem_errors = errors;
15912
15913 return init_fs_codec(interp);
15914}
15915#endif
15916
15917
Victor Stinnerfecc4f22019-03-19 14:20:29 +010015918void
15919_PyUnicode_Fini(void)
15920{
15921#if defined(WITH_VALGRIND) || defined(__INSURE__)
15922 /* Insure++ is a memory analysis tool that aids in discovering
15923 * memory leaks and other memory problems. On Python exit, the
15924 * interned string dictionaries are flagged as being in use at exit
15925 * (which it is). Under normal circumstances, this is fine because
15926 * the memory will be automatically reclaimed by the system. Under
15927 * memory debugging, it's a huge source of useless noise, so we
15928 * trade off slower shutdown for less distraction in the memory
15929 * reports. -baw
15930 */
15931 unicode_release_interned();
15932#endif /* __INSURE__ */
15933
15934 Py_CLEAR(unicode_empty);
15935
15936 for (Py_ssize_t i = 0; i < 256; i++) {
15937 Py_CLEAR(unicode_latin1[i]);
15938 }
15939 _PyUnicode_ClearStaticStrings();
15940 (void)PyUnicode_ClearFreeList();
Victor Stinner709d23d2019-05-02 14:56:30 -040015941
15942 PyInterpreterState *interp = _PyInterpreterState_GET_UNSAFE();
15943 PyMem_RawFree(interp->fs_codec.encoding);
15944 interp->fs_codec.encoding = NULL;
15945 PyMem_RawFree(interp->fs_codec.errors);
15946 interp->fs_codec.errors = NULL;
Victor Stinnerfecc4f22019-03-19 14:20:29 +010015947}
15948
15949
Georg Brandl66c221e2010-10-14 07:04:07 +000015950/* A _string module, to export formatter_parser and formatter_field_name_split
15951 to the string.Formatter class implemented in Python. */
15952
15953static PyMethodDef _string_methods[] = {
15954 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
15955 METH_O, PyDoc_STR("split the argument as a field name")},
15956 {"formatter_parser", (PyCFunction) formatter_parser,
15957 METH_O, PyDoc_STR("parse the argument as a format string")},
15958 {NULL, NULL}
15959};
15960
15961static struct PyModuleDef _string_module = {
15962 PyModuleDef_HEAD_INIT,
15963 "_string",
15964 PyDoc_STR("string helper module"),
15965 0,
15966 _string_methods,
15967 NULL,
15968 NULL,
15969 NULL,
15970 NULL
15971};
15972
15973PyMODINIT_FUNC
15974PyInit__string(void)
15975{
15976 return PyModule_Create(&_string_module);
15977}
15978
15979
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000015980#ifdef __cplusplus
15981}
15982#endif