blob: 625be4b5594b153cfe7ebbc7aa59d72c406ec5a2 [file] [log] [blame]
/*

Unicode implementation based on original code by Fredrik Lundh,
modified by Marc-Andre Lemburg <mal@lemburg.com>.

Major speed upgrades to the method implementations at the Reykjavik
NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.

Copyright (c) Corporation for National Research Initiatives.

--------------------------------------------------------------------
The original string type implementation is:

  Copyright (c) 1999 by Secret Labs AB
  Copyright (c) 1999 by Fredrik Lundh

By obtaining, using, and/or copying this software and/or its
associated documentation, you agree that you have read, understood,
and will comply with the following terms and conditions:

Permission to use, copy, modify, and distribute this software and its
associated documentation for any purpose and without fee is hereby
granted, provided that the above copyright notice appears in all
copies, and that both that copyright notice and this permission notice
appear in supporting documentation, and that the name of Secret Labs
AB or the author not be used in advertising or publicity pertaining to
distribution of the software without specific, written prior
permission.

SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
--------------------------------------------------------------------

*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000040
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Victor Stinner331a6a52019-05-27 16:39:22 +020043#include "pycore_initconfig.h"
Victor Stinner9fc57a32018-11-07 00:44:03 +010044#include "pycore_fileutils.h"
Victor Stinnerbcda8f12018-11-21 22:27:47 +010045#include "pycore_object.h"
Victor Stinner43fc3bb2019-05-02 11:54:20 -040046#include "pycore_pylifecycle.h"
Victor Stinner621cebe2018-11-12 16:53:38 +010047#include "pycore_pystate.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000048#include "ucnhash.h"
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050049#include "bytes_methods.h"
Raymond Hettingerac2ef652015-07-04 16:04:44 -070050#include "stringlib/eq.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000051
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000052#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000053#include <windows.h>
54#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000055
/* Uncomment to display statistics on interned strings at exit when
   using Valgrind or Insure++. */
58/* #define INTERNED_STATS 1 */
59
60
Larry Hastings61272b72014-01-07 12:41:53 -080061/*[clinic input]
INADA Naoki15f94592017-01-16 21:49:13 +090062class str "PyObject *" "&PyUnicode_Type"
Larry Hastings61272b72014-01-07 12:41:53 -080063[clinic start generated code]*/
INADA Naoki3ae20562017-01-16 20:41:20 +090064/*[clinic end generated code: output=da39a3ee5e6b4b0d input=4884c934de622cf6]*/
65
66/*[python input]
67class Py_UCS4_converter(CConverter):
68 type = 'Py_UCS4'
69 converter = 'convert_uc'
70
71 def converter_init(self):
72 if self.default is not unspecified:
73 self.c_default = ascii(self.default)
74 if len(self.c_default) > 4 or self.c_default[0] != "'":
75 self.c_default = hex(ord(self.default))
76
77[python start generated code]*/
78/*[python end generated code: output=da39a3ee5e6b4b0d input=88f5dd06cd8e7a61]*/
Larry Hastings44e2eaa2013-11-23 15:37:55 -080079
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000080/* --- Globals ------------------------------------------------------------
81
Serhiy Storchaka05997252013-01-26 12:14:02 +020082NOTE: In the interpreter's initialization phase, some globals are currently
83 initialized dynamically as needed. In the process Unicode objects may
84 be created before the Unicode type is ready.
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000085
86*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000087
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000088
89#ifdef __cplusplus
90extern "C" {
91#endif
92
/* Maximum code point of Unicode 6.0: 0x10ffff (1,114,111) */
#define MAX_UNICODE 0x10ffff

/* Sanity check used inside the assert()s of the accessor macros in this
   file.  In debug builds it runs _PyUnicode_CheckConsistency() with
   check_content == 0, i.e. without the O(n) scan of the characters; in
   other builds it is only a type check. */
#ifdef Py_DEBUG
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
#else
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#endif
Victor Stinnerfb5f5f22011-09-28 21:39:49 +0200101
/* Field accessors for the string object layouts.  The names starting with an
   underscore and followed by a bare cast read the field directly and can be
   used as lvalues; the variants containing assert() additionally check the
   object with _PyUnicode_CHECK(). */

/* Raw utf8 pointer of a PyCompactUnicodeObject (no checks). */
#define _PyUnicode_UTF8(op)                             \
    (((PyCompactUnicodeObject*)(op))->utf8)
/* Checked UTF-8 pointer of a ready string: for a compact ASCII object this
   is the memory located right after the PyASCIIObject header, otherwise
   the utf8 field. */
#define PyUnicode_UTF8(op)                              \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((char*)((PyASCIIObject*)(op) + 1)) :          \
         _PyUnicode_UTF8(op))
/* Raw utf8_length field of a PyCompactUnicodeObject (no checks). */
#define _PyUnicode_UTF8_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->utf8_length)
/* Checked UTF-8 length of a ready string: for a compact ASCII object this
   is the length field of the PyASCIIObject, otherwise utf8_length. */
#define PyUnicode_UTF8_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((PyASCIIObject*)(op))->length :               \
         _PyUnicode_UTF8_LENGTH(op))
/* Raw wstr pointer (PyASCIIObject) and wstr_length
   (PyCompactUnicodeObject) fields. */
#define _PyUnicode_WSTR(op)                             \
    (((PyASCIIObject*)(op))->wstr)
#define _PyUnicode_WSTR_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->wstr_length)
/* Raw length, state and hash fields of the PyASCIIObject header. */
#define _PyUnicode_LENGTH(op)                           \
    (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op)                            \
    (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op)                             \
    (((PyASCIIObject *)(op))->hash)
/* Checked access to state.kind and length; unlike PyUnicode_UTF8() these
   do not assert that the string is ready. */
#define _PyUnicode_KIND(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->state.kind)
#define _PyUnicode_GET_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->length)
/* Raw data.any pointer of a PyUnicodeObject (no checks). */
#define _PyUnicode_DATA_ANY(op)                         \
    (((PyUnicodeObject*)(op))->data.any)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200136
/* Replace any earlier definition of PyUnicode_READY with one whose sanity
   check goes through _PyUnicode_CHECK().  The expression evaluates to 0
   when the string is already ready and to the result of
   _PyUnicode_Ready(op) otherwise. */
#undef PyUnicode_READY
#define PyUnicode_READY(op)                        \
    (assert(_PyUnicode_CHECK(op)),                      \
     (PyUnicode_IS_READY(op) ?                          \
      0 :                                               \
      _PyUnicode_Ready(op)))
Victor Stinner910337b2011-10-03 03:20:16 +0200143
/* True if the utf8 pointer of the object is the canonical data buffer
   itself rather than a separate block.  Must not be used on compact ASCII
   objects (asserted). */
#define _PyUnicode_SHARE_UTF8(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)),           \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))
/* True if the wstr pointer of the object is the canonical data buffer
   itself.  The macro only refers to its own argument: it previously
   expanded to _PyUnicode_WSTR(unicode) and therefore silently depended on
   the caller having a variable named "unicode". */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
151
/* True if the Unicode object has an allocated UTF-8 memory block
   (not shared with other data): the object is not compact ASCII, its utf8
   pointer is set, and that pointer is different from the canonical data
   buffer. */
#define _PyUnicode_HAS_UTF8_MEMORY(op)                  \
    ((!PyUnicode_IS_COMPACT_ASCII(op)                   \
      && _PyUnicode_UTF8(op)                            \
      && _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))

/* True if the Unicode object has an allocated wstr memory block
   (not shared with other data): the wstr pointer is set and either the
   string is not ready yet or wstr is different from the canonical data
   buffer. */
#define _PyUnicode_HAS_WSTR_MEMORY(op)                  \
    ((_PyUnicode_WSTR(op) &&                            \
      (!PyUnicode_IS_READY(op) ||                       \
       _PyUnicode_WSTR(op) != PyUnicode_DATA(op))))
165
/* Copy the characters in [begin, end) to the buffer "to", converting each
   one from from_type to to_type with a plain cast.

   from_type and to_type must be valid type names; begin and end are
   pointers to the source characters ("from_type *") and to is a pointer to
   the destination buffer ("to_type *").  The bulk of the input is handled
   four characters per iteration, the remaining 0-3 characters one by one. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                \
        to_type *_to = (to_type *)(to);                 \
        const from_type *_iter = (from_type *)(begin);  \
        const from_type *_end = (from_type *)(end);     \
        Py_ssize_t n = (_end) - (_iter);                \
        const from_type *_unrolled_end =                \
            _iter + _Py_SIZE_ROUND_DOWN(n, 4);          \
        for (; _iter < (_unrolled_end); _iter += 4, _to += 4) { \
            _to[0] = (to_type) _iter[0];                \
            _to[1] = (to_type) _iter[1];                \
            _to[2] = (to_type) _iter[2];                \
            _to[3] = (to_type) _iter[3];                \
        }                                               \
        for (; _iter < (_end); ++_iter, ++_to) {        \
            *_to = (to_type) *_iter;                    \
        }                                               \
    } while (0)
Victor Stinner829c0ad2011-10-03 01:08:02 +0200189
/* Platform-dependent tuning constant for buffer overallocation.
   NOTE(review): the percentages quoted below suggest that the extra space
   is computed as size / OVERALLOCATE_FACTOR -- confirm at the use sites. */
#ifdef MS_WINDOWS
   /* On Windows, overallocating by 50% is the best factor */
#  define OVERALLOCATE_FACTOR 2
#else
   /* On Linux, overallocating by 25% is the best factor */
#  define OVERALLOCATE_FACTOR 4
#endif
197
/* This dictionary holds all interned unicode strings.  Note that references
   to strings in this dictionary are *not* counted in the string's ob_refcnt.
   When the interned string reaches a refcnt of 0 the string deallocation
   function will delete the reference from this dictionary.

   Another way to look at this is to say that the actual reference
   count of a string is:  s->ob_refcnt + (s->state ? 2 : 0)
*/
static PyObject *interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +0000207
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000208/* The empty Unicode object is shared to improve performance. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200209static PyObject *unicode_empty = NULL;
Serhiy Storchaka05997252013-01-26 12:14:02 +0200210
Serhiy Storchaka678db842013-01-26 12:16:36 +0200211#define _Py_INCREF_UNICODE_EMPTY() \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200212 do { \
213 if (unicode_empty != NULL) \
214 Py_INCREF(unicode_empty); \
215 else { \
Serhiy Storchaka678db842013-01-26 12:16:36 +0200216 unicode_empty = PyUnicode_New(0, 0); \
217 if (unicode_empty != NULL) { \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200218 Py_INCREF(unicode_empty); \
Serhiy Storchaka678db842013-01-26 12:16:36 +0200219 assert(_PyUnicode_CheckConsistency(unicode_empty, 1)); \
220 } \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200221 } \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200222 } while (0)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000223
Serhiy Storchaka678db842013-01-26 12:16:36 +0200224#define _Py_RETURN_UNICODE_EMPTY() \
225 do { \
226 _Py_INCREF_UNICODE_EMPTY(); \
227 return unicode_empty; \
228 } while (0)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000229
Victor Stinner59423e32018-11-26 13:40:01 +0100230static inline void
231unicode_fill(enum PyUnicode_Kind kind, void *data, Py_UCS4 value,
232 Py_ssize_t start, Py_ssize_t length)
233{
234 assert(0 <= start);
235 assert(kind != PyUnicode_WCHAR_KIND);
236 switch (kind) {
237 case PyUnicode_1BYTE_KIND: {
Victor Stinner163403a2018-11-27 12:41:17 +0100238 assert(value <= 0xff);
Victor Stinner59423e32018-11-26 13:40:01 +0100239 Py_UCS1 ch = (unsigned char)value;
240 Py_UCS1 *to = (Py_UCS1 *)data + start;
241 memset(to, ch, length);
242 break;
243 }
244 case PyUnicode_2BYTE_KIND: {
Victor Stinner163403a2018-11-27 12:41:17 +0100245 assert(value <= 0xffff);
Victor Stinner59423e32018-11-26 13:40:01 +0100246 Py_UCS2 ch = (Py_UCS2)value;
247 Py_UCS2 *to = (Py_UCS2 *)data + start;
248 const Py_UCS2 *end = to + length;
249 for (; to < end; ++to) *to = ch;
250 break;
251 }
252 case PyUnicode_4BYTE_KIND: {
Victor Stinner163403a2018-11-27 12:41:17 +0100253 assert(value <= MAX_UNICODE);
Victor Stinner59423e32018-11-26 13:40:01 +0100254 Py_UCS4 ch = value;
255 Py_UCS4 * to = (Py_UCS4 *)data + start;
256 const Py_UCS4 *end = to + length;
257 for (; to < end; ++to) *to = ch;
258 break;
259 }
260 default: Py_UNREACHABLE();
261 }
262}
263
264
Victor Stinner8a1a6cf2013-04-14 02:35:33 +0200265/* Forward declaration */
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700266static inline int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +0200267_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch);
Inada Naoki770847a2019-06-24 12:30:24 +0900268static inline void
269_PyUnicodeWriter_InitWithBuffer(_PyUnicodeWriter *writer, PyObject *buffer);
Victor Stinner709d23d2019-05-02 14:56:30 -0400270static PyObject *
271unicode_encode_utf8(PyObject *unicode, _Py_error_handler error_handler,
272 const char *errors);
273static PyObject *
274unicode_decode_utf8(const char *s, Py_ssize_t size,
275 _Py_error_handler error_handler, const char *errors,
276 Py_ssize_t *consumed);
Victor Stinner8a1a6cf2013-04-14 02:35:33 +0200277
/* List of static strings. */
static _Py_Identifier *static_strings = NULL;

/* Single character Unicode strings in the Latin-1 range are being
   shared as well.  The table is indexed by code point (0..255); an entry
   stays NULL until a string for that character has been stored in it (see
   unicode_result_ready()). */
static PyObject *unicode_latin1[256] = {NULL};
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000284
/* Fast detection of the most frequent whitespace characters: lookup table
   indexed by ASCII code (0..127) whose entry is 1 for a whitespace
   character and 0 for anything else. */
const unsigned char _Py_ascii_whitespace[128] = {
    [0x09] = 1,     /* CHARACTER TABULATION */
    [0x0A] = 1,     /* LINE FEED */
    [0x0B] = 1,     /* LINE TABULATION */
    [0x0C] = 1,     /* FORM FEED */
    [0x0D] = 1,     /* CARRIAGE RETURN */
    [0x1C] = 1,     /* FILE SEPARATOR */
    [0x1D] = 1,     /* GROUP SEPARATOR */
    [0x1E] = 1,     /* RECORD SEPARATOR */
    [0x1F] = 1,     /* UNIT SEPARATOR */
    [0x20] = 1,     /* SPACE */
    /* all entries not listed above are implicitly 0 */
};
315
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200316/* forward */
Victor Stinnerfe226c02011-10-03 03:52:20 +0200317static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200318static PyObject* get_latin1_char(unsigned char ch);
Victor Stinner488fa492011-12-12 00:01:39 +0100319static int unicode_modifiable(PyObject *unicode);
320
Victor Stinnerfe226c02011-10-03 03:52:20 +0200321
Alexander Belopolsky40018472011-02-26 01:02:56 +0000322static PyObject *
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100323_PyUnicode_FromUCS1(const Py_UCS1 *s, Py_ssize_t size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200324static PyObject *
325_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
326static PyObject *
327_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
328
329static PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +0000330unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000331 PyObject **errorHandler,const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +0100332 PyObject *unicode, PyObject **exceptionObject,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000333 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
334
Alexander Belopolsky40018472011-02-26 01:02:56 +0000335static void
336raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300337 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +0100338 PyObject *unicode,
339 Py_ssize_t startpos, Py_ssize_t endpos,
340 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000341
/* Same for linebreaks: lookup table indexed by ASCII code (0..127) whose
   entry is 1 for a line break character and 0 for anything else. */
static const unsigned char ascii_linebreak[128] = {
    [0x0A] = 1,     /* LINE FEED */
    [0x0B] = 1,     /* LINE TABULATION */
    [0x0C] = 1,     /* FORM FEED */
    [0x0D] = 1,     /* CARRIAGE RETURN */
    [0x1C] = 1,     /* FILE SEPARATOR */
    [0x1D] = 1,     /* GROUP SEPARATOR */
    [0x1E] = 1,     /* RECORD SEPARATOR */
    /* all entries not listed above are implicitly 0 */
};
369
INADA Naoki3ae20562017-01-16 20:41:20 +0900370static int convert_uc(PyObject *obj, void *addr);
371
Serhiy Storchaka1009bf12015-04-03 23:53:51 +0300372#include "clinic/unicodeobject.c.h"
373
Victor Stinner3d4226a2018-08-29 22:21:32 +0200374_Py_error_handler
375_Py_GetErrorHandler(const char *errors)
Victor Stinner50149202015-09-22 00:26:54 +0200376{
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200377 if (errors == NULL || strcmp(errors, "strict") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200378 return _Py_ERROR_STRICT;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200379 }
380 if (strcmp(errors, "surrogateescape") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200381 return _Py_ERROR_SURROGATEESCAPE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200382 }
383 if (strcmp(errors, "replace") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200384 return _Py_ERROR_REPLACE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200385 }
386 if (strcmp(errors, "ignore") == 0) {
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200387 return _Py_ERROR_IGNORE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200388 }
389 if (strcmp(errors, "backslashreplace") == 0) {
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200390 return _Py_ERROR_BACKSLASHREPLACE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200391 }
392 if (strcmp(errors, "surrogatepass") == 0) {
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200393 return _Py_ERROR_SURROGATEPASS;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200394 }
395 if (strcmp(errors, "xmlcharrefreplace") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200396 return _Py_ERROR_XMLCHARREFREPLACE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200397 }
Victor Stinner50149202015-09-22 00:26:54 +0200398 return _Py_ERROR_OTHER;
399}
400
Victor Stinner709d23d2019-05-02 14:56:30 -0400401
402static _Py_error_handler
403get_error_handler_wide(const wchar_t *errors)
404{
405 if (errors == NULL || wcscmp(errors, L"strict") == 0) {
406 return _Py_ERROR_STRICT;
407 }
408 if (wcscmp(errors, L"surrogateescape") == 0) {
409 return _Py_ERROR_SURROGATEESCAPE;
410 }
411 if (wcscmp(errors, L"replace") == 0) {
412 return _Py_ERROR_REPLACE;
413 }
414 if (wcscmp(errors, L"ignore") == 0) {
415 return _Py_ERROR_IGNORE;
416 }
417 if (wcscmp(errors, L"backslashreplace") == 0) {
418 return _Py_ERROR_BACKSLASHREPLACE;
419 }
420 if (wcscmp(errors, L"surrogatepass") == 0) {
421 return _Py_ERROR_SURROGATEPASS;
422 }
423 if (wcscmp(errors, L"xmlcharrefreplace") == 0) {
424 return _Py_ERROR_XMLCHARREFREPLACE;
425 }
426 return _Py_ERROR_OTHER;
427}
428
429
Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300430/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
431 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000432Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000433PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000434{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000435#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000436 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000437#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000438 /* This is actually an illegal character, so it should
439 not be passed to unichr. */
440 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000441#endif
442}
443
/* Verify the internal invariants of the string object `op`.

   Every invariant is checked with _PyObject_ASSERT(op, ...); the function
   itself returns 1 unconditionally, which makes it usable inside assert().
   NOTE(review): what happens when a check fails depends on the definition
   of _PyObject_ASSERT, which is not part of this file.

   If `check_content` is non-zero, the characters are additionally scanned
   (O(n)) to verify that the narrowest possible kind is used and that the
   buffer is null-terminated. */
int
_PyUnicode_CheckConsistency(PyObject *op, int check_content)
{
    PyASCIIObject *ascii;
    unsigned int kind;

    _PyObject_ASSERT(op, PyUnicode_Check(op));

    ascii = (PyASCIIObject *)op;
    kind = ascii->state.kind;

    /* compact ASCII: always 1-byte kind and ready; no other field is
       checked for this layout */
    if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
        _PyObject_ASSERT(op, kind == PyUnicode_1BYTE_KIND);
        _PyObject_ASSERT(op, ascii->state.ready == 1);
    }
    else {
        PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
        void *data;

        if (ascii->state.compact == 1) {
            /* compact, non-ASCII: the characters directly follow the
               PyCompactUnicodeObject header */
            data = compact + 1;
            _PyObject_ASSERT(op, kind == PyUnicode_1BYTE_KIND
                                 || kind == PyUnicode_2BYTE_KIND
                                 || kind == PyUnicode_4BYTE_KIND);
            _PyObject_ASSERT(op, ascii->state.ascii == 0);
            _PyObject_ASSERT(op, ascii->state.ready == 1);
            _PyObject_ASSERT(op, compact->utf8 != data);
        }
        else {
            PyUnicodeObject *unicode = (PyUnicodeObject *)op;

            /* non-compact: the characters live in a separate buffer */
            data = unicode->data.any;
            if (kind == PyUnicode_WCHAR_KIND) {
                /* not ready yet: only the wstr representation exists */
                _PyObject_ASSERT(op, ascii->length == 0);
                _PyObject_ASSERT(op, ascii->hash == -1);
                _PyObject_ASSERT(op, ascii->state.compact == 0);
                _PyObject_ASSERT(op, ascii->state.ascii == 0);
                _PyObject_ASSERT(op, ascii->state.ready == 0);
                _PyObject_ASSERT(op, ascii->state.interned == SSTATE_NOT_INTERNED);
                _PyObject_ASSERT(op, ascii->wstr != NULL);
                _PyObject_ASSERT(op, data == NULL);
                _PyObject_ASSERT(op, compact->utf8 == NULL);
            }
            else {
                _PyObject_ASSERT(op, kind == PyUnicode_1BYTE_KIND
                                     || kind == PyUnicode_2BYTE_KIND
                                     || kind == PyUnicode_4BYTE_KIND);
                _PyObject_ASSERT(op, ascii->state.compact == 0);
                _PyObject_ASSERT(op, ascii->state.ready == 1);
                _PyObject_ASSERT(op, data != NULL);
                /* a ready non-compact ASCII string shares its data buffer
                   with the UTF-8 representation */
                if (ascii->state.ascii) {
                    _PyObject_ASSERT(op, compact->utf8 == data);
                    _PyObject_ASSERT(op, compact->utf8_length == ascii->length);
                }
                else
                    _PyObject_ASSERT(op, compact->utf8 != data);
            }
        }
        if (kind != PyUnicode_WCHAR_KIND) {
            /* wstr must be the data buffer exactly when the kind has the
               same width as wchar_t */
            if (
#if SIZEOF_WCHAR_T == 2
                kind == PyUnicode_2BYTE_KIND
#else
                kind == PyUnicode_4BYTE_KIND
#endif
               )
            {
                _PyObject_ASSERT(op, ascii->wstr == data);
                _PyObject_ASSERT(op, compact->wstr_length == ascii->length);
            } else
                _PyObject_ASSERT(op, ascii->wstr != data);
        }

        /* a missing cache must have a length of zero */
        if (compact->utf8 == NULL)
            _PyObject_ASSERT(op, compact->utf8_length == 0);
        if (ascii->wstr == NULL)
            _PyObject_ASSERT(op, compact->wstr_length == 0);
    }

    /* check that the best kind is used: O(n) operation */
    if (check_content && kind != PyUnicode_WCHAR_KIND) {
        Py_ssize_t i;
        Py_UCS4 maxchar = 0;
        void *data;
        Py_UCS4 ch;

        data = PyUnicode_DATA(ascii);
        /* find the largest code point stored in the string */
        for (i=0; i < ascii->length; i++)
        {
            ch = PyUnicode_READ(kind, data, i);
            if (ch > maxchar)
                maxchar = ch;
        }
        /* the largest code point must fall into the range that requires
           exactly this kind (and, for 1-byte strings, this ascii flag) */
        if (kind == PyUnicode_1BYTE_KIND) {
            if (ascii->state.ascii == 0) {
                _PyObject_ASSERT(op, maxchar >= 128);
                _PyObject_ASSERT(op, maxchar <= 255);
            }
            else
                _PyObject_ASSERT(op, maxchar < 128);
        }
        else if (kind == PyUnicode_2BYTE_KIND) {
            _PyObject_ASSERT(op, maxchar >= 0x100);
            _PyObject_ASSERT(op, maxchar <= 0xFFFF);
        }
        else {
            _PyObject_ASSERT(op, maxchar >= 0x10000);
            _PyObject_ASSERT(op, maxchar <= MAX_UNICODE);
        }
        /* the character following the last one must be a null terminator */
        _PyObject_ASSERT(op, PyUnicode_READ(kind, data, ascii->length) == 0);
    }
    return 1;
}
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200557
Victor Stinner910337b2011-10-03 03:20:16 +0200558
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100559static PyObject*
560unicode_result_wchar(PyObject *unicode)
561{
562#ifndef Py_DEBUG
563 Py_ssize_t len;
564
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100565 len = _PyUnicode_WSTR_LENGTH(unicode);
566 if (len == 0) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100567 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200568 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100569 }
570
571 if (len == 1) {
572 wchar_t ch = _PyUnicode_WSTR(unicode)[0];
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100573 if ((Py_UCS4)ch < 256) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100574 PyObject *latin1_char = get_latin1_char((unsigned char)ch);
575 Py_DECREF(unicode);
576 return latin1_char;
577 }
578 }
579
580 if (_PyUnicode_Ready(unicode) < 0) {
Victor Stinneraa771272012-10-04 02:32:58 +0200581 Py_DECREF(unicode);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100582 return NULL;
583 }
584#else
Victor Stinneraa771272012-10-04 02:32:58 +0200585 assert(Py_REFCNT(unicode) == 1);
586
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100587 /* don't make the result ready in debug mode to ensure that the caller
588 makes the string ready before using it */
589 assert(_PyUnicode_CheckConsistency(unicode, 1));
590#endif
591 return unicode;
592}
593
594static PyObject*
595unicode_result_ready(PyObject *unicode)
596{
597 Py_ssize_t length;
598
599 length = PyUnicode_GET_LENGTH(unicode);
600 if (length == 0) {
601 if (unicode != unicode_empty) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100602 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200603 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100604 }
605 return unicode_empty;
606 }
607
608 if (length == 1) {
Victor Stinner69ed0f42013-04-09 21:48:24 +0200609 void *data = PyUnicode_DATA(unicode);
610 int kind = PyUnicode_KIND(unicode);
611 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100612 if (ch < 256) {
613 PyObject *latin1_char = unicode_latin1[ch];
614 if (latin1_char != NULL) {
615 if (unicode != latin1_char) {
616 Py_INCREF(latin1_char);
617 Py_DECREF(unicode);
618 }
619 return latin1_char;
620 }
621 else {
622 assert(_PyUnicode_CheckConsistency(unicode, 1));
623 Py_INCREF(unicode);
624 unicode_latin1[ch] = unicode;
625 return unicode;
626 }
627 }
628 }
629
630 assert(_PyUnicode_CheckConsistency(unicode, 1));
631 return unicode;
632}
633
634static PyObject*
635unicode_result(PyObject *unicode)
636{
637 assert(_PyUnicode_CHECK(unicode));
638 if (PyUnicode_IS_READY(unicode))
639 return unicode_result_ready(unicode);
640 else
641 return unicode_result_wchar(unicode);
642}
643
Victor Stinnerc4b49542011-12-11 22:44:26 +0100644static PyObject*
645unicode_result_unchanged(PyObject *unicode)
646{
647 if (PyUnicode_CheckExact(unicode)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -0500648 if (PyUnicode_READY(unicode) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +0100649 return NULL;
650 Py_INCREF(unicode);
651 return unicode;
652 }
653 else
654 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +0100655 return _PyUnicode_Copy(unicode);
Victor Stinnerc4b49542011-12-11 22:44:26 +0100656}
657
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200658/* Implementation of the "backslashreplace" error handler for 8-bit encodings:
659 ASCII, Latin1, UTF-8, etc. */
660static char*
Victor Stinnerad771582015-10-09 12:38:53 +0200661backslashreplace(_PyBytesWriter *writer, char *str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200662 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
663{
Victor Stinnerad771582015-10-09 12:38:53 +0200664 Py_ssize_t size, i;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200665 Py_UCS4 ch;
666 enum PyUnicode_Kind kind;
667 void *data;
668
669 assert(PyUnicode_IS_READY(unicode));
670 kind = PyUnicode_KIND(unicode);
671 data = PyUnicode_DATA(unicode);
672
673 size = 0;
674 /* determine replacement size */
675 for (i = collstart; i < collend; ++i) {
676 Py_ssize_t incr;
677
678 ch = PyUnicode_READ(kind, data, i);
679 if (ch < 0x100)
680 incr = 2+2;
681 else if (ch < 0x10000)
682 incr = 2+4;
683 else {
684 assert(ch <= MAX_UNICODE);
Victor Stinner3fa36ff2015-10-09 03:37:11 +0200685 incr = 2+8;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200686 }
687 if (size > PY_SSIZE_T_MAX - incr) {
688 PyErr_SetString(PyExc_OverflowError,
689 "encoded result is too long for a Python string");
690 return NULL;
691 }
692 size += incr;
693 }
694
Victor Stinnerad771582015-10-09 12:38:53 +0200695 str = _PyBytesWriter_Prepare(writer, str, size);
696 if (str == NULL)
697 return NULL;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200698
699 /* generate replacement */
700 for (i = collstart; i < collend; ++i) {
701 ch = PyUnicode_READ(kind, data, i);
Victor Stinner797485e2015-10-09 03:17:30 +0200702 *str++ = '\\';
703 if (ch >= 0x00010000) {
704 *str++ = 'U';
705 *str++ = Py_hexdigits[(ch>>28)&0xf];
706 *str++ = Py_hexdigits[(ch>>24)&0xf];
707 *str++ = Py_hexdigits[(ch>>20)&0xf];
708 *str++ = Py_hexdigits[(ch>>16)&0xf];
709 *str++ = Py_hexdigits[(ch>>12)&0xf];
710 *str++ = Py_hexdigits[(ch>>8)&0xf];
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200711 }
Victor Stinner797485e2015-10-09 03:17:30 +0200712 else if (ch >= 0x100) {
713 *str++ = 'u';
714 *str++ = Py_hexdigits[(ch>>12)&0xf];
715 *str++ = Py_hexdigits[(ch>>8)&0xf];
716 }
717 else
718 *str++ = 'x';
719 *str++ = Py_hexdigits[(ch>>4)&0xf];
720 *str++ = Py_hexdigits[ch&0xf];
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200721 }
722 return str;
723}
724
725/* Implementation of the "xmlcharrefreplace" error handler for 8-bit encodings:
726 ASCII, Latin1, UTF-8, etc. */
727static char*
Victor Stinnerad771582015-10-09 12:38:53 +0200728xmlcharrefreplace(_PyBytesWriter *writer, char *str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200729 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
730{
Victor Stinnerad771582015-10-09 12:38:53 +0200731 Py_ssize_t size, i;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200732 Py_UCS4 ch;
733 enum PyUnicode_Kind kind;
734 void *data;
735
736 assert(PyUnicode_IS_READY(unicode));
737 kind = PyUnicode_KIND(unicode);
738 data = PyUnicode_DATA(unicode);
739
740 size = 0;
741 /* determine replacement size */
742 for (i = collstart; i < collend; ++i) {
743 Py_ssize_t incr;
744
745 ch = PyUnicode_READ(kind, data, i);
746 if (ch < 10)
747 incr = 2+1+1;
748 else if (ch < 100)
749 incr = 2+2+1;
750 else if (ch < 1000)
751 incr = 2+3+1;
752 else if (ch < 10000)
753 incr = 2+4+1;
754 else if (ch < 100000)
755 incr = 2+5+1;
756 else if (ch < 1000000)
757 incr = 2+6+1;
758 else {
759 assert(ch <= MAX_UNICODE);
760 incr = 2+7+1;
761 }
762 if (size > PY_SSIZE_T_MAX - incr) {
763 PyErr_SetString(PyExc_OverflowError,
764 "encoded result is too long for a Python string");
765 return NULL;
766 }
767 size += incr;
768 }
769
Victor Stinnerad771582015-10-09 12:38:53 +0200770 str = _PyBytesWriter_Prepare(writer, str, size);
771 if (str == NULL)
772 return NULL;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200773
774 /* generate replacement */
775 for (i = collstart; i < collend; ++i) {
776 str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
777 }
778 return str;
779}
780
Thomas Wouters477c8d52006-05-27 19:21:47 +0000781/* --- Bloom Filters ----------------------------------------------------- */
782
783/* stuff to implement simple "bloom filters" for Unicode characters.
784 to keep things simple, we use a single bitmask, using the least 5
785 bits from each unicode characters as the bit index. */
786
787/* the linebreak mask is set up by Unicode_Init below */
788
Antoine Pitrouf068f942010-01-13 14:19:12 +0000789#if LONG_BIT >= 128
790#define BLOOM_WIDTH 128
791#elif LONG_BIT >= 64
792#define BLOOM_WIDTH 64
793#elif LONG_BIT >= 32
794#define BLOOM_WIDTH 32
795#else
796#error "LONG_BIT is smaller than 32"
797#endif
798
Thomas Wouters477c8d52006-05-27 19:21:47 +0000799#define BLOOM_MASK unsigned long
800
Serhiy Storchaka05997252013-01-26 12:14:02 +0200801static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000802
Antoine Pitrouf068f942010-01-13 14:19:12 +0000803#define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000804
Benjamin Peterson29060642009-01-31 22:14:21 +0000805#define BLOOM_LINEBREAK(ch) \
806 ((ch) < 128U ? ascii_linebreak[(ch)] : \
807 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000808
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700809static inline BLOOM_MASK
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200810make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000811{
Victor Stinnera85af502013-04-09 21:53:54 +0200812#define BLOOM_UPDATE(TYPE, MASK, PTR, LEN) \
813 do { \
814 TYPE *data = (TYPE *)PTR; \
815 TYPE *end = data + LEN; \
816 Py_UCS4 ch; \
817 for (; data != end; data++) { \
818 ch = *data; \
819 MASK |= (1UL << (ch & (BLOOM_WIDTH - 1))); \
820 } \
821 break; \
822 } while (0)
823
Thomas Wouters477c8d52006-05-27 19:21:47 +0000824 /* calculate simple bloom-style bitmask for a given unicode string */
825
Antoine Pitrouf068f942010-01-13 14:19:12 +0000826 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000827
828 mask = 0;
Victor Stinnera85af502013-04-09 21:53:54 +0200829 switch (kind) {
830 case PyUnicode_1BYTE_KIND:
831 BLOOM_UPDATE(Py_UCS1, mask, ptr, len);
832 break;
833 case PyUnicode_2BYTE_KIND:
834 BLOOM_UPDATE(Py_UCS2, mask, ptr, len);
835 break;
836 case PyUnicode_4BYTE_KIND:
837 BLOOM_UPDATE(Py_UCS4, mask, ptr, len);
838 break;
839 default:
Barry Warsawb2e57942017-09-14 18:13:16 -0700840 Py_UNREACHABLE();
Victor Stinnera85af502013-04-09 21:53:54 +0200841 }
Thomas Wouters477c8d52006-05-27 19:21:47 +0000842 return mask;
Victor Stinnera85af502013-04-09 21:53:54 +0200843
844#undef BLOOM_UPDATE
Thomas Wouters477c8d52006-05-27 19:21:47 +0000845}
846
Serhiy Storchaka21a663e2016-04-13 15:37:23 +0300847static int
848ensure_unicode(PyObject *obj)
849{
850 if (!PyUnicode_Check(obj)) {
851 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +0200852 "must be str, not %.100s",
853 Py_TYPE(obj)->tp_name);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +0300854 return -1;
855 }
856 return PyUnicode_READY(obj);
857}
858
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200859/* Compilation of templated routines */
860
861#include "stringlib/asciilib.h"
862#include "stringlib/fastsearch.h"
863#include "stringlib/partition.h"
864#include "stringlib/split.h"
865#include "stringlib/count.h"
866#include "stringlib/find.h"
867#include "stringlib/find_max_char.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200868#include "stringlib/undef.h"
869
870#include "stringlib/ucs1lib.h"
871#include "stringlib/fastsearch.h"
872#include "stringlib/partition.h"
873#include "stringlib/split.h"
874#include "stringlib/count.h"
875#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300876#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200877#include "stringlib/find_max_char.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200878#include "stringlib/undef.h"
879
880#include "stringlib/ucs2lib.h"
881#include "stringlib/fastsearch.h"
882#include "stringlib/partition.h"
883#include "stringlib/split.h"
884#include "stringlib/count.h"
885#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300886#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200887#include "stringlib/find_max_char.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200888#include "stringlib/undef.h"
889
890#include "stringlib/ucs4lib.h"
891#include "stringlib/fastsearch.h"
892#include "stringlib/partition.h"
893#include "stringlib/split.h"
894#include "stringlib/count.h"
895#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300896#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200897#include "stringlib/find_max_char.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200898#include "stringlib/undef.h"
899
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200900#include "stringlib/unicodedefs.h"
901#include "stringlib/fastsearch.h"
902#include "stringlib/count.h"
903#include "stringlib/find.h"
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100904#include "stringlib/undef.h"
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200905
Guido van Rossumd57fd912000-03-10 22:53:23 +0000906/* --- Unicode Object ----------------------------------------------------- */
907
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700908static inline Py_ssize_t
909findchar(const void *s, int kind,
910 Py_ssize_t size, Py_UCS4 ch,
911 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200912{
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200913 switch (kind) {
914 case PyUnicode_1BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200915 if ((Py_UCS1) ch != ch)
916 return -1;
917 if (direction > 0)
918 return ucs1lib_find_char((Py_UCS1 *) s, size, (Py_UCS1) ch);
919 else
920 return ucs1lib_rfind_char((Py_UCS1 *) s, size, (Py_UCS1) ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200921 case PyUnicode_2BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200922 if ((Py_UCS2) ch != ch)
923 return -1;
924 if (direction > 0)
925 return ucs2lib_find_char((Py_UCS2 *) s, size, (Py_UCS2) ch);
926 else
927 return ucs2lib_rfind_char((Py_UCS2 *) s, size, (Py_UCS2) ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200928 case PyUnicode_4BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200929 if (direction > 0)
930 return ucs4lib_find_char((Py_UCS4 *) s, size, ch);
931 else
932 return ucs4lib_rfind_char((Py_UCS4 *) s, size, ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200933 default:
Barry Warsawb2e57942017-09-14 18:13:16 -0700934 Py_UNREACHABLE();
Victor Stinner9e7a1bc2011-10-13 00:18:12 +0200935 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200936}
937
#ifdef Py_DEBUG
/* Fill the data of a Unicode string with invalid characters to detect bugs
   earlier.

   Only the characters from index `old_length` up to the current length are
   overwritten (with 0xff bytes); nothing is done if the string did not grow.

   _PyUnicode_CheckConsistency(str, 1) detects invalid characters, at least for
   ASCII and UCS-4 strings. U+00FF is invalid in ASCII and U+FFFFFFFF is an
   invalid character in Unicode 6.0. */
static void
unicode_fill_invalid(PyObject *unicode, Py_ssize_t old_length)
{
    int kind = PyUnicode_KIND(unicode);
    Py_UCS1 *data = PyUnicode_1BYTE_DATA(unicode);
    Py_ssize_t length = _PyUnicode_LENGTH(unicode);

    if (length > old_length) {
        /* `kind` doubles as the number of bytes per character */
        Py_UCS1 *first_new = data + old_length * kind;
        memset(first_new, 0xff, (length - old_length) * kind);
    }
}
#endif
956
Victor Stinnerfe226c02011-10-03 03:52:20 +0200957static PyObject*
958resize_compact(PyObject *unicode, Py_ssize_t length)
959{
960 Py_ssize_t char_size;
961 Py_ssize_t struct_size;
962 Py_ssize_t new_size;
963 int share_wstr;
Victor Stinner84def372011-12-11 20:04:56 +0100964 PyObject *new_unicode;
Victor Stinnerafffce42012-10-03 23:03:17 +0200965#ifdef Py_DEBUG
966 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
967#endif
968
Victor Stinner79891572012-05-03 13:43:07 +0200969 assert(unicode_modifiable(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200970 assert(PyUnicode_IS_READY(unicode));
Victor Stinner488fa492011-12-12 00:01:39 +0100971 assert(PyUnicode_IS_COMPACT(unicode));
972
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200973 char_size = PyUnicode_KIND(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100974 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerfe226c02011-10-03 03:52:20 +0200975 struct_size = sizeof(PyASCIIObject);
976 else
977 struct_size = sizeof(PyCompactUnicodeObject);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200978 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200979
Victor Stinnerfe226c02011-10-03 03:52:20 +0200980 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
981 PyErr_NoMemory();
982 return NULL;
983 }
984 new_size = (struct_size + (length + 1) * char_size);
985
Serhiy Storchaka7aa69082015-12-03 01:02:03 +0200986 if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) {
987 PyObject_DEL(_PyUnicode_UTF8(unicode));
988 _PyUnicode_UTF8(unicode) = NULL;
989 _PyUnicode_UTF8_LENGTH(unicode) = 0;
990 }
Victor Stinner84def372011-12-11 20:04:56 +0100991 _Py_DEC_REFTOTAL;
992 _Py_ForgetReference(unicode);
993
Serhiy Storchaka20b39b22014-09-28 11:27:24 +0300994 new_unicode = (PyObject *)PyObject_REALLOC(unicode, new_size);
Victor Stinner84def372011-12-11 20:04:56 +0100995 if (new_unicode == NULL) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100996 _Py_NewReference(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200997 PyErr_NoMemory();
998 return NULL;
999 }
Victor Stinner84def372011-12-11 20:04:56 +01001000 unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001001 _Py_NewReference(unicode);
Victor Stinner84def372011-12-11 20:04:56 +01001002
Victor Stinnerfe226c02011-10-03 03:52:20 +02001003 _PyUnicode_LENGTH(unicode) = length;
Victor Stinnerc379ead2011-10-03 12:52:27 +02001004 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001005 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +01001006 if (!PyUnicode_IS_ASCII(unicode))
Victor Stinnerc379ead2011-10-03 12:52:27 +02001007 _PyUnicode_WSTR_LENGTH(unicode) = length;
1008 }
Victor Stinnerbbbac2e2013-02-07 23:12:46 +01001009 else if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) {
1010 PyObject_DEL(_PyUnicode_WSTR(unicode));
1011 _PyUnicode_WSTR(unicode) = NULL;
Victor Stinner5bc03a62016-01-27 16:56:53 +01001012 if (!PyUnicode_IS_ASCII(unicode))
1013 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Victor Stinnerbbbac2e2013-02-07 23:12:46 +01001014 }
Victor Stinnerafffce42012-10-03 23:03:17 +02001015#ifdef Py_DEBUG
1016 unicode_fill_invalid(unicode, old_length);
1017#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001018 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
1019 length, 0);
Victor Stinner79891572012-05-03 13:43:07 +02001020 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001021 return unicode;
1022}
1023
Alexander Belopolsky40018472011-02-26 01:02:56 +00001024static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001025resize_inplace(PyObject *unicode, Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001026{
Victor Stinner95663112011-10-04 01:03:50 +02001027 wchar_t *wstr;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001028 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001029 assert(!PyUnicode_IS_COMPACT(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001030 assert(Py_REFCNT(unicode) == 1);
Tim Petersced69f82003-09-16 20:30:58 +00001031
Victor Stinnerfe226c02011-10-03 03:52:20 +02001032 if (PyUnicode_IS_READY(unicode)) {
1033 Py_ssize_t char_size;
Victor Stinner1c8d0c72011-10-03 12:11:00 +02001034 int share_wstr, share_utf8;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001035 void *data;
Victor Stinnerafffce42012-10-03 23:03:17 +02001036#ifdef Py_DEBUG
1037 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
1038#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001039
1040 data = _PyUnicode_DATA_ANY(unicode);
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001041 char_size = PyUnicode_KIND(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +02001042 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
1043 share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001044
1045 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
1046 PyErr_NoMemory();
1047 return -1;
1048 }
1049 new_size = (length + 1) * char_size;
1050
Victor Stinner7a9105a2011-12-12 00:13:42 +01001051 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
1052 {
1053 PyObject_DEL(_PyUnicode_UTF8(unicode));
1054 _PyUnicode_UTF8(unicode) = NULL;
1055 _PyUnicode_UTF8_LENGTH(unicode) = 0;
1056 }
1057
Victor Stinnerfe226c02011-10-03 03:52:20 +02001058 data = (PyObject *)PyObject_REALLOC(data, new_size);
1059 if (data == NULL) {
1060 PyErr_NoMemory();
1061 return -1;
1062 }
1063 _PyUnicode_DATA_ANY(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +02001064 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001065 _PyUnicode_WSTR(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +02001066 _PyUnicode_WSTR_LENGTH(unicode) = length;
1067 }
1068 if (share_utf8) {
Victor Stinner1c8d0c72011-10-03 12:11:00 +02001069 _PyUnicode_UTF8(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +02001070 _PyUnicode_UTF8_LENGTH(unicode) = length;
1071 }
Victor Stinnerfe226c02011-10-03 03:52:20 +02001072 _PyUnicode_LENGTH(unicode) = length;
1073 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
Victor Stinnerafffce42012-10-03 23:03:17 +02001074#ifdef Py_DEBUG
1075 unicode_fill_invalid(unicode, old_length);
1076#endif
Victor Stinner95663112011-10-04 01:03:50 +02001077 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001078 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001079 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001080 }
Victor Stinnerfe226c02011-10-03 03:52:20 +02001081 }
Victor Stinner95663112011-10-04 01:03:50 +02001082 assert(_PyUnicode_WSTR(unicode) != NULL);
1083
1084 /* check for integer overflow */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -07001085 if (length > PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) - 1) {
Victor Stinner95663112011-10-04 01:03:50 +02001086 PyErr_NoMemory();
1087 return -1;
1088 }
Victor Stinner7a9105a2011-12-12 00:13:42 +01001089 new_size = sizeof(wchar_t) * (length + 1);
Victor Stinner95663112011-10-04 01:03:50 +02001090 wstr = _PyUnicode_WSTR(unicode);
Victor Stinner7a9105a2011-12-12 00:13:42 +01001091 wstr = PyObject_REALLOC(wstr, new_size);
Victor Stinner95663112011-10-04 01:03:50 +02001092 if (!wstr) {
1093 PyErr_NoMemory();
1094 return -1;
1095 }
1096 _PyUnicode_WSTR(unicode) = wstr;
1097 _PyUnicode_WSTR(unicode)[length] = 0;
1098 _PyUnicode_WSTR_LENGTH(unicode) = length;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001099 assert(_PyUnicode_CheckConsistency(unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001100 return 0;
1101}
1102
Victor Stinnerfe226c02011-10-03 03:52:20 +02001103static PyObject*
1104resize_copy(PyObject *unicode, Py_ssize_t length)
1105{
1106 Py_ssize_t copy_length;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001107 if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001108 PyObject *copy;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001109
Serhiy Storchaka2e58f1a2016-10-09 23:44:48 +03001110 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001111
1112 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
1113 if (copy == NULL)
1114 return NULL;
1115
1116 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
Victor Stinnerd3f08822012-05-29 12:57:52 +02001117 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, copy_length);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001118 return copy;
Victor Stinner8cfcbed2011-10-03 23:19:21 +02001119 }
1120 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001121 PyObject *w;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001122
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001123 w = (PyObject*)_PyUnicode_New(length);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001124 if (w == NULL)
1125 return NULL;
1126 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
1127 copy_length = Py_MIN(copy_length, length);
Christian Heimesf051e432016-09-13 20:22:02 +02001128 memcpy(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
Victor Stinnerc6cf1ba2012-10-23 02:54:47 +02001129 copy_length * sizeof(wchar_t));
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001130 return w;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001131 }
1132}
1133
Guido van Rossumd57fd912000-03-10 22:53:23 +00001134/* We allocate one more byte to make sure the string is
Martin v. Löwis47383402007-08-15 07:32:56 +00001135 Ux0000 terminated; some code (e.g. new_identifier)
1136 relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +00001137
1138 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +00001139 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +00001140
1141*/
1142
Alexander Belopolsky40018472011-02-26 01:02:56 +00001143static PyUnicodeObject *
1144_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001145{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001146 PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001147 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001148
Thomas Wouters477c8d52006-05-27 19:21:47 +00001149 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001150 if (length == 0 && unicode_empty != NULL) {
1151 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001152 return (PyUnicodeObject*)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001153 }
1154
Neal Norwitz3ce5d922008-08-24 07:08:55 +00001155 /* Ensure we won't overflow the size. */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -07001156 if (length > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00001157 return (PyUnicodeObject *)PyErr_NoMemory();
1158 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001159 if (length < 0) {
1160 PyErr_SetString(PyExc_SystemError,
1161 "Negative size passed to _PyUnicode_New");
1162 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001163 }
1164
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001165 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
1166 if (unicode == NULL)
1167 return NULL;
1168 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
Victor Stinner68b674c2013-10-29 19:31:43 +01001169
1170 _PyUnicode_WSTR_LENGTH(unicode) = length;
1171 _PyUnicode_HASH(unicode) = -1;
1172 _PyUnicode_STATE(unicode).interned = 0;
1173 _PyUnicode_STATE(unicode).kind = 0;
1174 _PyUnicode_STATE(unicode).compact = 0;
1175 _PyUnicode_STATE(unicode).ready = 0;
1176 _PyUnicode_STATE(unicode).ascii = 0;
1177 _PyUnicode_DATA_ANY(unicode) = NULL;
1178 _PyUnicode_LENGTH(unicode) = 0;
1179 _PyUnicode_UTF8(unicode) = NULL;
1180 _PyUnicode_UTF8_LENGTH(unicode) = 0;
1181
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001182 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
1183 if (!_PyUnicode_WSTR(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001184 Py_DECREF(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00001185 PyErr_NoMemory();
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001186 return NULL;
Guido van Rossum3c1bb802000-04-27 20:13:50 +00001187 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001188
Jeremy Hyltond8082792003-09-16 19:41:39 +00001189 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +00001190 * the caller fails before initializing str -- unicode_resize()
1191 * reads str[0], and the Keep-Alive optimization can keep memory
1192 * allocated for str alive across a call to unicode_dealloc(unicode).
1193 * We don't want unicode_resize to read uninitialized memory in
1194 * that case.
1195 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001196 _PyUnicode_WSTR(unicode)[0] = 0;
1197 _PyUnicode_WSTR(unicode)[length] = 0;
Victor Stinner68b674c2013-10-29 19:31:43 +01001198
Victor Stinner7931d9a2011-11-04 00:22:48 +01001199 assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001200 return unicode;
1201}
1202
Victor Stinnerf42dc442011-10-02 23:33:16 +02001203static const char*
1204unicode_kind_name(PyObject *unicode)
1205{
Victor Stinner42dfd712011-10-03 14:41:45 +02001206 /* don't check consistency: unicode_kind_name() is called from
1207 _PyUnicode_Dump() */
Victor Stinnerf42dc442011-10-02 23:33:16 +02001208 if (!PyUnicode_IS_COMPACT(unicode))
1209 {
1210 if (!PyUnicode_IS_READY(unicode))
1211 return "wstr";
Benjamin Petersonead6b532011-12-20 17:23:42 -06001212 switch (PyUnicode_KIND(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001213 {
1214 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001215 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001216 return "legacy ascii";
1217 else
1218 return "legacy latin1";
1219 case PyUnicode_2BYTE_KIND:
1220 return "legacy UCS2";
1221 case PyUnicode_4BYTE_KIND:
1222 return "legacy UCS4";
1223 default:
1224 return "<legacy invalid kind>";
1225 }
1226 }
1227 assert(PyUnicode_IS_READY(unicode));
Benjamin Petersonead6b532011-12-20 17:23:42 -06001228 switch (PyUnicode_KIND(unicode)) {
Victor Stinnerf42dc442011-10-02 23:33:16 +02001229 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001230 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001231 return "ascii";
1232 else
Victor Stinnera3b334d2011-10-03 13:53:37 +02001233 return "latin1";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001234 case PyUnicode_2BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001235 return "UCS2";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001236 case PyUnicode_4BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001237 return "UCS4";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001238 default:
1239 return "<invalid compact kind>";
1240 }
1241}
1242
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001243#ifdef Py_DEBUG
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001244/* Functions wrapping macros for use in debugger */
Victor Stinnera42de742018-11-22 10:25:22 +01001245char *_PyUnicode_utf8(void *unicode_raw){
1246 PyObject *unicode = _PyObject_CAST(unicode_raw);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001247 return PyUnicode_UTF8(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001248}
1249
Victor Stinnera42de742018-11-22 10:25:22 +01001250void *_PyUnicode_compact_data(void *unicode_raw) {
1251 PyObject *unicode = _PyObject_CAST(unicode_raw);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001252 return _PyUnicode_COMPACT_DATA(unicode);
1253}
Victor Stinnera42de742018-11-22 10:25:22 +01001254void *_PyUnicode_data(void *unicode_raw) {
1255 PyObject *unicode = _PyObject_CAST(unicode_raw);
Zackery Spytz1a2252e2019-05-06 10:56:51 -06001256 printf("obj %p\n", (void*)unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001257 printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
1258 printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
1259 printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
1260 printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
1261 printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
1262 return PyUnicode_DATA(unicode);
1263}
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001264
1265void
1266_PyUnicode_Dump(PyObject *op)
1267{
1268 PyASCIIObject *ascii = (PyASCIIObject *)op;
Victor Stinnera849a4b2011-10-03 12:12:11 +02001269 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
1270 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
1271 void *data;
Victor Stinner0d60e872011-10-23 19:47:19 +02001272
Victor Stinnera849a4b2011-10-03 12:12:11 +02001273 if (ascii->state.compact)
Victor Stinner0d60e872011-10-23 19:47:19 +02001274 {
1275 if (ascii->state.ascii)
1276 data = (ascii + 1);
1277 else
1278 data = (compact + 1);
1279 }
Victor Stinnera849a4b2011-10-03 12:12:11 +02001280 else
1281 data = unicode->data.any;
Victor Stinner293f3f52014-07-01 08:57:10 +02001282 printf("%s: len=%" PY_FORMAT_SIZE_T "u, ",
1283 unicode_kind_name(op), ascii->length);
Victor Stinner0d60e872011-10-23 19:47:19 +02001284
Victor Stinnera849a4b2011-10-03 12:12:11 +02001285 if (ascii->wstr == data)
1286 printf("shared ");
Zackery Spytz1a2252e2019-05-06 10:56:51 -06001287 printf("wstr=%p", (void *)ascii->wstr);
Victor Stinner0d60e872011-10-23 19:47:19 +02001288
Victor Stinnera3b334d2011-10-03 13:53:37 +02001289 if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
Victor Stinner293f3f52014-07-01 08:57:10 +02001290 printf(" (%" PY_FORMAT_SIZE_T "u), ", compact->wstr_length);
Victor Stinnera849a4b2011-10-03 12:12:11 +02001291 if (!ascii->state.compact && compact->utf8 == unicode->data.any)
1292 printf("shared ");
Victor Stinner293f3f52014-07-01 08:57:10 +02001293 printf("utf8=%p (%" PY_FORMAT_SIZE_T "u)",
Zackery Spytz1a2252e2019-05-06 10:56:51 -06001294 (void *)compact->utf8, compact->utf8_length);
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001295 }
Victor Stinnera849a4b2011-10-03 12:12:11 +02001296 printf(", data=%p\n", data);
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001297}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001298#endif
1299
1300PyObject *
1301PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
1302{
1303 PyObject *obj;
1304 PyCompactUnicodeObject *unicode;
1305 void *data;
Victor Stinner8f825062012-04-27 13:55:39 +02001306 enum PyUnicode_Kind kind;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001307 int is_sharing, is_ascii;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001308 Py_ssize_t char_size;
1309 Py_ssize_t struct_size;
1310
1311 /* Optimization for empty strings */
1312 if (size == 0 && unicode_empty != NULL) {
1313 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001314 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001315 }
1316
Victor Stinner9e9d6892011-10-04 01:02:02 +02001317 is_ascii = 0;
1318 is_sharing = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001319 struct_size = sizeof(PyCompactUnicodeObject);
1320 if (maxchar < 128) {
Victor Stinner8f825062012-04-27 13:55:39 +02001321 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001322 char_size = 1;
1323 is_ascii = 1;
1324 struct_size = sizeof(PyASCIIObject);
1325 }
1326 else if (maxchar < 256) {
Victor Stinner8f825062012-04-27 13:55:39 +02001327 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001328 char_size = 1;
1329 }
1330 else if (maxchar < 65536) {
Victor Stinner8f825062012-04-27 13:55:39 +02001331 kind = PyUnicode_2BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001332 char_size = 2;
1333 if (sizeof(wchar_t) == 2)
1334 is_sharing = 1;
1335 }
1336 else {
Victor Stinnerc9590ad2012-03-04 01:34:37 +01001337 if (maxchar > MAX_UNICODE) {
1338 PyErr_SetString(PyExc_SystemError,
1339 "invalid maximum character passed to PyUnicode_New");
1340 return NULL;
1341 }
Victor Stinner8f825062012-04-27 13:55:39 +02001342 kind = PyUnicode_4BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001343 char_size = 4;
1344 if (sizeof(wchar_t) == 4)
1345 is_sharing = 1;
1346 }
1347
1348 /* Ensure we won't overflow the size. */
1349 if (size < 0) {
1350 PyErr_SetString(PyExc_SystemError,
1351 "Negative size passed to PyUnicode_New");
1352 return NULL;
1353 }
1354 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1355 return PyErr_NoMemory();
1356
1357 /* Duplicated allocation code from _PyObject_New() instead of a call to
1358 * PyObject_New() so we are able to allocate space for the object and
1359 * it's data buffer.
1360 */
1361 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
1362 if (obj == NULL)
1363 return PyErr_NoMemory();
1364 obj = PyObject_INIT(obj, &PyUnicode_Type);
1365 if (obj == NULL)
1366 return NULL;
1367
1368 unicode = (PyCompactUnicodeObject *)obj;
1369 if (is_ascii)
1370 data = ((PyASCIIObject*)obj) + 1;
1371 else
1372 data = unicode + 1;
1373 _PyUnicode_LENGTH(unicode) = size;
1374 _PyUnicode_HASH(unicode) = -1;
1375 _PyUnicode_STATE(unicode).interned = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001376 _PyUnicode_STATE(unicode).kind = kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001377 _PyUnicode_STATE(unicode).compact = 1;
1378 _PyUnicode_STATE(unicode).ready = 1;
1379 _PyUnicode_STATE(unicode).ascii = is_ascii;
1380 if (is_ascii) {
1381 ((char*)data)[size] = 0;
1382 _PyUnicode_WSTR(unicode) = NULL;
1383 }
Victor Stinner8f825062012-04-27 13:55:39 +02001384 else if (kind == PyUnicode_1BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001385 ((char*)data)[size] = 0;
1386 _PyUnicode_WSTR(unicode) = NULL;
1387 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001388 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001389 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001390 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001391 else {
1392 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001393 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001394 if (kind == PyUnicode_2BYTE_KIND)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001395 ((Py_UCS2*)data)[size] = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001396 else /* kind == PyUnicode_4BYTE_KIND */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001397 ((Py_UCS4*)data)[size] = 0;
1398 if (is_sharing) {
1399 _PyUnicode_WSTR_LENGTH(unicode) = size;
1400 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
1401 }
1402 else {
1403 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1404 _PyUnicode_WSTR(unicode) = NULL;
1405 }
1406 }
Victor Stinner8f825062012-04-27 13:55:39 +02001407#ifdef Py_DEBUG
Victor Stinnerafffce42012-10-03 23:03:17 +02001408 unicode_fill_invalid((PyObject*)unicode, 0);
Victor Stinner8f825062012-04-27 13:55:39 +02001409#endif
Victor Stinner7931d9a2011-11-04 00:22:48 +01001410 assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001411 return obj;
1412}
1413
#if SIZEOF_WCHAR_T == 2
/* Expand a 16-bit wchar_t sequence [begin, end) into the UCS4 buffer of
   `unicode`, combining each valid high/low surrogate pair into a single code
   point.  Lone surrogates are copied through unchanged.  The other width
   conversions are done with macros for efficiency.

   The caller must have sized `unicode` so that it can hold one more code
   point than the decoded characters, for a terminating null character. */
static void
unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
                              PyObject *unicode)
{
    const wchar_t *src = begin;
    Py_UCS4 *dst;

    assert(unicode != NULL);
    assert(_PyUnicode_CHECK(unicode));
    assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
    dst = PyUnicode_4BYTE_DATA(unicode);

    while (src < end) {
        /* Never write past the characters reserved in the target. */
        assert(dst < (PyUnicode_4BYTE_DATA(unicode) +
                      _PyUnicode_GET_LENGTH(unicode)));
        if (Py_UNICODE_IS_HIGH_SURROGATE(src[0])
            && (src + 1) < end
            && Py_UNICODE_IS_LOW_SURROGATE(src[1]))
        {
            *dst = Py_UNICODE_JOIN_SURROGATES(src[0], src[1]);
            dst++;
            src += 2;
        }
        else {
            *dst = *src;
            dst++;
            src++;
        }
    }
    /* The whole target must have been filled, no more and no less. */
    assert(dst == (PyUnicode_4BYTE_DATA(unicode) +
                   _PyUnicode_GET_LENGTH(unicode)));
}
#endif
1453
Victor Stinnercd9950f2011-10-02 00:34:53 +02001454static int
Victor Stinner488fa492011-12-12 00:01:39 +01001455unicode_check_modifiable(PyObject *unicode)
Victor Stinnercd9950f2011-10-02 00:34:53 +02001456{
Victor Stinner488fa492011-12-12 00:01:39 +01001457 if (!unicode_modifiable(unicode)) {
Victor Stinner01698042011-10-04 00:04:26 +02001458 PyErr_SetString(PyExc_SystemError,
Victor Stinner488fa492011-12-12 00:01:39 +01001459 "Cannot modify a string currently used");
Victor Stinnercd9950f2011-10-02 00:34:53 +02001460 return -1;
1461 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02001462 return 0;
1463}
1464
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001465static int
1466_copy_characters(PyObject *to, Py_ssize_t to_start,
1467 PyObject *from, Py_ssize_t from_start,
1468 Py_ssize_t how_many, int check_maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001469{
Victor Stinnera0702ab2011-09-29 14:14:38 +02001470 unsigned int from_kind, to_kind;
1471 void *from_data, *to_data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001472
Victor Stinneree4544c2012-05-09 22:24:08 +02001473 assert(0 <= how_many);
1474 assert(0 <= from_start);
1475 assert(0 <= to_start);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001476 assert(PyUnicode_Check(from));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001477 assert(PyUnicode_IS_READY(from));
Victor Stinneree4544c2012-05-09 22:24:08 +02001478 assert(from_start + how_many <= PyUnicode_GET_LENGTH(from));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001479
Victor Stinnerd3f08822012-05-29 12:57:52 +02001480 assert(PyUnicode_Check(to));
1481 assert(PyUnicode_IS_READY(to));
1482 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1483
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001484 if (how_many == 0)
1485 return 0;
1486
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001487 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001488 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001489 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001490 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001491
Victor Stinnerf1852262012-06-16 16:38:26 +02001492#ifdef Py_DEBUG
1493 if (!check_maxchar
1494 && PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to))
1495 {
1496 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1497 Py_UCS4 ch;
1498 Py_ssize_t i;
1499 for (i=0; i < how_many; i++) {
1500 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1501 assert(ch <= to_maxchar);
1502 }
1503 }
1504#endif
1505
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001506 if (from_kind == to_kind) {
Victor Stinnerf1852262012-06-16 16:38:26 +02001507 if (check_maxchar
1508 && !PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))
1509 {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001510 /* Writing Latin-1 characters into an ASCII string requires to
1511 check that all written characters are pure ASCII */
Victor Stinnerf1852262012-06-16 16:38:26 +02001512 Py_UCS4 max_char;
1513 max_char = ucs1lib_find_max_char(from_data,
1514 (Py_UCS1*)from_data + how_many);
1515 if (max_char >= 128)
1516 return -1;
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001517 }
Christian Heimesf051e432016-09-13 20:22:02 +02001518 memcpy((char*)to_data + to_kind * to_start,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001519 (char*)from_data + from_kind * from_start,
1520 to_kind * how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001521 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001522 else if (from_kind == PyUnicode_1BYTE_KIND
1523 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001524 {
1525 _PyUnicode_CONVERT_BYTES(
1526 Py_UCS1, Py_UCS2,
1527 PyUnicode_1BYTE_DATA(from) + from_start,
1528 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1529 PyUnicode_2BYTE_DATA(to) + to_start
1530 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001531 }
Victor Stinner157f83f2011-09-28 21:41:31 +02001532 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001533 && to_kind == PyUnicode_4BYTE_KIND)
1534 {
1535 _PyUnicode_CONVERT_BYTES(
1536 Py_UCS1, Py_UCS4,
1537 PyUnicode_1BYTE_DATA(from) + from_start,
1538 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1539 PyUnicode_4BYTE_DATA(to) + to_start
1540 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001541 }
1542 else if (from_kind == PyUnicode_2BYTE_KIND
1543 && to_kind == PyUnicode_4BYTE_KIND)
1544 {
1545 _PyUnicode_CONVERT_BYTES(
1546 Py_UCS2, Py_UCS4,
1547 PyUnicode_2BYTE_DATA(from) + from_start,
1548 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1549 PyUnicode_4BYTE_DATA(to) + to_start
1550 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001551 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001552 else {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001553 assert (PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to));
1554
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001555 if (!check_maxchar) {
1556 if (from_kind == PyUnicode_2BYTE_KIND
1557 && to_kind == PyUnicode_1BYTE_KIND)
1558 {
1559 _PyUnicode_CONVERT_BYTES(
1560 Py_UCS2, Py_UCS1,
1561 PyUnicode_2BYTE_DATA(from) + from_start,
1562 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1563 PyUnicode_1BYTE_DATA(to) + to_start
1564 );
1565 }
1566 else if (from_kind == PyUnicode_4BYTE_KIND
1567 && to_kind == PyUnicode_1BYTE_KIND)
1568 {
1569 _PyUnicode_CONVERT_BYTES(
1570 Py_UCS4, Py_UCS1,
1571 PyUnicode_4BYTE_DATA(from) + from_start,
1572 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1573 PyUnicode_1BYTE_DATA(to) + to_start
1574 );
1575 }
1576 else if (from_kind == PyUnicode_4BYTE_KIND
1577 && to_kind == PyUnicode_2BYTE_KIND)
1578 {
1579 _PyUnicode_CONVERT_BYTES(
1580 Py_UCS4, Py_UCS2,
1581 PyUnicode_4BYTE_DATA(from) + from_start,
1582 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1583 PyUnicode_2BYTE_DATA(to) + to_start
1584 );
1585 }
1586 else {
Barry Warsawb2e57942017-09-14 18:13:16 -07001587 Py_UNREACHABLE();
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001588 }
1589 }
Victor Stinnerf1852262012-06-16 16:38:26 +02001590 else {
Victor Stinnera0702ab2011-09-29 14:14:38 +02001591 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001592 Py_UCS4 ch;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001593 Py_ssize_t i;
1594
Victor Stinnera0702ab2011-09-29 14:14:38 +02001595 for (i=0; i < how_many; i++) {
1596 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001597 if (ch > to_maxchar)
1598 return -1;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001599 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1600 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001601 }
1602 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001603 return 0;
1604}
1605
Victor Stinnerd3f08822012-05-29 12:57:52 +02001606void
1607_PyUnicode_FastCopyCharacters(
1608 PyObject *to, Py_ssize_t to_start,
1609 PyObject *from, Py_ssize_t from_start, Py_ssize_t how_many)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001610{
1611 (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1612}
1613
1614Py_ssize_t
1615PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1616 PyObject *from, Py_ssize_t from_start,
1617 Py_ssize_t how_many)
1618{
1619 int err;
1620
1621 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1622 PyErr_BadInternalCall();
1623 return -1;
1624 }
1625
Benjamin Petersonbac79492012-01-14 13:34:47 -05001626 if (PyUnicode_READY(from) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001627 return -1;
Benjamin Petersonbac79492012-01-14 13:34:47 -05001628 if (PyUnicode_READY(to) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001629 return -1;
1630
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001631 if ((size_t)from_start > (size_t)PyUnicode_GET_LENGTH(from)) {
Victor Stinnerd3f08822012-05-29 12:57:52 +02001632 PyErr_SetString(PyExc_IndexError, "string index out of range");
1633 return -1;
1634 }
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001635 if ((size_t)to_start > (size_t)PyUnicode_GET_LENGTH(to)) {
Victor Stinnerd3f08822012-05-29 12:57:52 +02001636 PyErr_SetString(PyExc_IndexError, "string index out of range");
1637 return -1;
1638 }
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001639 if (how_many < 0) {
1640 PyErr_SetString(PyExc_SystemError, "how_many cannot be negative");
1641 return -1;
1642 }
1643 how_many = Py_MIN(PyUnicode_GET_LENGTH(from)-from_start, how_many);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001644 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1645 PyErr_Format(PyExc_SystemError,
Victor Stinnera33bce02014-07-04 22:47:46 +02001646 "Cannot write %zi characters at %zi "
1647 "in a string of %zi characters",
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001648 how_many, to_start, PyUnicode_GET_LENGTH(to));
1649 return -1;
1650 }
1651
1652 if (how_many == 0)
1653 return 0;
1654
Victor Stinner488fa492011-12-12 00:01:39 +01001655 if (unicode_check_modifiable(to))
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001656 return -1;
1657
1658 err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1659 if (err) {
1660 PyErr_Format(PyExc_SystemError,
1661 "Cannot copy %s characters "
1662 "into a string of %s characters",
1663 unicode_kind_name(from),
1664 unicode_kind_name(to));
1665 return -1;
1666 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001667 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001668}
1669
Victor Stinner17222162011-09-28 22:15:37 +02001670/* Find the maximum code point and count the number of surrogate pairs so a
1671 correct string length can be computed before converting a string to UCS4.
1672 This function counts single surrogates as a character and not as a pair.
1673
1674 Return 0 on success, or -1 on error. */
1675static int
1676find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1677 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001678{
1679 const wchar_t *iter;
Victor Stinner8faf8212011-12-08 22:14:11 +01001680 Py_UCS4 ch;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001681
Victor Stinnerc53be962011-10-02 21:33:54 +02001682 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001683 *num_surrogates = 0;
1684 *maxchar = 0;
1685
1686 for (iter = begin; iter < end; ) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001687#if SIZEOF_WCHAR_T == 2
Victor Stinnercf77da92013-03-06 01:09:24 +01001688 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1689 && (iter+1) < end
1690 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
1691 {
1692 ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
1693 ++(*num_surrogates);
1694 iter += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001695 }
1696 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001697#endif
Victor Stinner8faf8212011-12-08 22:14:11 +01001698 {
1699 ch = *iter;
1700 iter++;
1701 }
1702 if (ch > *maxchar) {
1703 *maxchar = ch;
1704 if (*maxchar > MAX_UNICODE) {
1705 PyErr_Format(PyExc_ValueError,
1706 "character U+%x is not in range [U+0000; U+10ffff]",
1707 ch);
1708 return -1;
1709 }
1710 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001711 }
1712 return 0;
1713}
1714
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001715int
1716_PyUnicode_Ready(PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001717{
1718 wchar_t *end;
1719 Py_UCS4 maxchar = 0;
1720 Py_ssize_t num_surrogates;
1721#if SIZEOF_WCHAR_T == 2
1722 Py_ssize_t length_wo_surrogates;
1723#endif
1724
Georg Brandl7597add2011-10-05 16:36:47 +02001725 /* _PyUnicode_Ready() is only intended for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +02001726 strings were created using _PyObject_New() and where no canonical
1727 representation (the str field) has been set yet aka strings
1728 which are not yet ready. */
Victor Stinner910337b2011-10-03 03:20:16 +02001729 assert(_PyUnicode_CHECK(unicode));
1730 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001731 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerc3c74152011-10-02 20:39:55 +02001732 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001733 assert(_PyUnicode_UTF8(unicode) == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +02001734 /* Actually, it should neither be interned nor be anything else: */
1735 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001736
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001737 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +02001738 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +02001739 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001740 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001741
1742 if (maxchar < 256) {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001743 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1744 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001745 PyErr_NoMemory();
1746 return -1;
1747 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001748 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001749 _PyUnicode_WSTR(unicode), end,
1750 PyUnicode_1BYTE_DATA(unicode));
1751 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1752 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1753 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1754 if (maxchar < 128) {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001755 _PyUnicode_STATE(unicode).ascii = 1;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001756 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001757 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001758 }
1759 else {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001760 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001761 _PyUnicode_UTF8(unicode) = NULL;
1762 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001763 }
1764 PyObject_FREE(_PyUnicode_WSTR(unicode));
1765 _PyUnicode_WSTR(unicode) = NULL;
1766 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1767 }
1768 /* In this case we might have to convert down from 4-byte native
1769 wchar_t to 2-byte unicode. */
1770 else if (maxchar < 65536) {
1771 assert(num_surrogates == 0 &&
1772 "FindMaxCharAndNumSurrogatePairs() messed up");
1773
Victor Stinner506f5922011-09-28 22:34:18 +02001774#if SIZEOF_WCHAR_T == 2
1775 /* We can share representations and are done. */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001776 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Victor Stinner506f5922011-09-28 22:34:18 +02001777 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1778 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1779 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001780 _PyUnicode_UTF8(unicode) = NULL;
1781 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001782#else
1783 /* sizeof(wchar_t) == 4 */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001784 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
Victor Stinner506f5922011-09-28 22:34:18 +02001785 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
Victor Stinnerc3c74152011-10-02 20:39:55 +02001786 if (!_PyUnicode_DATA_ANY(unicode)) {
Victor Stinner506f5922011-09-28 22:34:18 +02001787 PyErr_NoMemory();
1788 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001789 }
Victor Stinner506f5922011-09-28 22:34:18 +02001790 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1791 _PyUnicode_WSTR(unicode), end,
1792 PyUnicode_2BYTE_DATA(unicode));
1793 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1794 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1795 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001796 _PyUnicode_UTF8(unicode) = NULL;
1797 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001798 PyObject_FREE(_PyUnicode_WSTR(unicode));
1799 _PyUnicode_WSTR(unicode) = NULL;
1800 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1801#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001802 }
1803 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
1804 else {
1805#if SIZEOF_WCHAR_T == 2
1806 /* in case the native representation is 2-bytes, we need to allocate a
1807 new normalized 4-byte version. */
1808 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
Serhiy Storchakae55181f2015-02-20 21:34:06 +02001809 if (length_wo_surrogates > PY_SSIZE_T_MAX / 4 - 1) {
1810 PyErr_NoMemory();
1811 return -1;
1812 }
Victor Stinnerc3c74152011-10-02 20:39:55 +02001813 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1814 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001815 PyErr_NoMemory();
1816 return -1;
1817 }
1818 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1819 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001820 _PyUnicode_UTF8(unicode) = NULL;
1821 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner126c5592011-10-03 04:17:10 +02001822 /* unicode_convert_wchar_to_ucs4() requires a ready string */
1823 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerc53be962011-10-02 21:33:54 +02001824 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001825 PyObject_FREE(_PyUnicode_WSTR(unicode));
1826 _PyUnicode_WSTR(unicode) = NULL;
1827 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1828#else
1829 assert(num_surrogates == 0);
1830
Victor Stinnerc3c74152011-10-02 20:39:55 +02001831 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001832 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001833 _PyUnicode_UTF8(unicode) = NULL;
1834 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001835 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1836#endif
1837 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1838 }
1839 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001840 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001841 return 0;
1842}
1843
Alexander Belopolsky40018472011-02-26 01:02:56 +00001844static void
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001845unicode_dealloc(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001846{
Walter Dörwald16807132007-05-25 13:52:07 +00001847 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001848 case SSTATE_NOT_INTERNED:
1849 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001850
Benjamin Peterson29060642009-01-31 22:14:21 +00001851 case SSTATE_INTERNED_MORTAL:
1852 /* revive dead object temporarily for DelItem */
1853 Py_REFCNT(unicode) = 3;
Victor Stinner7931d9a2011-11-04 00:22:48 +01001854 if (PyDict_DelItem(interned, unicode) != 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00001855 Py_FatalError(
1856 "deletion of interned string failed");
1857 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001858
Benjamin Peterson29060642009-01-31 22:14:21 +00001859 case SSTATE_INTERNED_IMMORTAL:
1860 Py_FatalError("Immortal interned string died.");
Stefan Krahf432a322017-08-21 13:09:59 +02001861 /* fall through */
Walter Dörwald16807132007-05-25 13:52:07 +00001862
Benjamin Peterson29060642009-01-31 22:14:21 +00001863 default:
1864 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +00001865 }
1866
Victor Stinner03490912011-10-03 23:45:12 +02001867 if (_PyUnicode_HAS_WSTR_MEMORY(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001868 PyObject_DEL(_PyUnicode_WSTR(unicode));
Victor Stinner829c0ad2011-10-03 01:08:02 +02001869 if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001870 PyObject_DEL(_PyUnicode_UTF8(unicode));
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001871 if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode))
1872 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001873
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001874 Py_TYPE(unicode)->tp_free(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001875}
1876
#ifdef Py_DEBUG
/* Debug helper: return 1 if `unicode` is the shared empty string or one of
   the cached single-character Latin-1 strings, 0 otherwise. */
static int
unicode_is_singleton(PyObject *unicode)
{
    PyASCIIObject *header = (PyASCIIObject *)unicode;
    Py_UCS4 first;

    if (unicode == unicode_empty)
        return 1;

    /* Only a string of length 1 that has a canonical representation can be
       an entry of the Latin-1 cache. */
    if (header->state.kind == PyUnicode_WCHAR_KIND || header->length != 1)
        return 0;

    first = PyUnicode_READ_CHAR(unicode, 0);
    return (first < 256 && unicode_latin1[first] == unicode) ? 1 : 0;
}
#endif
1893
Alexander Belopolsky40018472011-02-26 01:02:56 +00001894static int
Victor Stinner488fa492011-12-12 00:01:39 +01001895unicode_modifiable(PyObject *unicode)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001896{
Victor Stinner488fa492011-12-12 00:01:39 +01001897 assert(_PyUnicode_CHECK(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001898 if (Py_REFCNT(unicode) != 1)
1899 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001900 if (_PyUnicode_HASH(unicode) != -1)
1901 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001902 if (PyUnicode_CHECK_INTERNED(unicode))
1903 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001904 if (!PyUnicode_CheckExact(unicode))
1905 return 0;
Victor Stinner77bb47b2011-10-03 20:06:05 +02001906#ifdef Py_DEBUG
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001907 /* singleton refcount is greater than 1 */
1908 assert(!unicode_is_singleton(unicode));
Victor Stinner77bb47b2011-10-03 20:06:05 +02001909#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001910 return 1;
1911}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001912
Victor Stinnerfe226c02011-10-03 03:52:20 +02001913static int
1914unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1915{
1916 PyObject *unicode;
1917 Py_ssize_t old_length;
1918
1919 assert(p_unicode != NULL);
1920 unicode = *p_unicode;
1921
1922 assert(unicode != NULL);
1923 assert(PyUnicode_Check(unicode));
1924 assert(0 <= length);
1925
Victor Stinner910337b2011-10-03 03:20:16 +02001926 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001927 old_length = PyUnicode_WSTR_LENGTH(unicode);
1928 else
1929 old_length = PyUnicode_GET_LENGTH(unicode);
1930 if (old_length == length)
1931 return 0;
1932
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001933 if (length == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +02001934 _Py_INCREF_UNICODE_EMPTY();
1935 if (!unicode_empty)
Benjamin Peterson29060642009-01-31 22:14:21 +00001936 return -1;
Serhiy Storchaka57a01d32016-04-10 18:05:40 +03001937 Py_SETREF(*p_unicode, unicode_empty);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001938 return 0;
1939 }
1940
Victor Stinner488fa492011-12-12 00:01:39 +01001941 if (!unicode_modifiable(unicode)) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001942 PyObject *copy = resize_copy(unicode, length);
1943 if (copy == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001944 return -1;
Serhiy Storchaka57a01d32016-04-10 18:05:40 +03001945 Py_SETREF(*p_unicode, copy);
Benjamin Peterson29060642009-01-31 22:14:21 +00001946 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001947 }
1948
Victor Stinnerfe226c02011-10-03 03:52:20 +02001949 if (PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001950 PyObject *new_unicode = resize_compact(unicode, length);
1951 if (new_unicode == NULL)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001952 return -1;
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001953 *p_unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001954 return 0;
Benjamin Peterson4bfce8f2011-10-03 19:35:07 -04001955 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001956 return resize_inplace(unicode, length);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001957}
1958
Alexander Belopolsky40018472011-02-26 01:02:56 +00001959int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001960PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001961{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001962 PyObject *unicode;
1963 if (p_unicode == NULL) {
1964 PyErr_BadInternalCall();
1965 return -1;
1966 }
1967 unicode = *p_unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001968 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001969 {
1970 PyErr_BadInternalCall();
1971 return -1;
1972 }
1973 return unicode_resize(p_unicode, length);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001974}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001975
Serhiy Storchakad65c9492015-11-02 14:10:23 +02001976/* Copy an ASCII or latin1 char* string into a Python Unicode string.
Victor Stinnerc5166102012-02-22 13:55:02 +01001977
Victor Stinnerb429d3b2012-02-22 21:22:20 +01001978 WARNING: The function doesn't copy the terminating null character and
1979 doesn't check the maximum character (may write a latin1 character in an
1980 ASCII string). */
Victor Stinner184252a2012-06-16 02:57:41 +02001981static void
1982unicode_write_cstr(PyObject *unicode, Py_ssize_t index,
1983 const char *str, Py_ssize_t len)
Victor Stinnerc5166102012-02-22 13:55:02 +01001984{
1985 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
1986 void *data = PyUnicode_DATA(unicode);
Victor Stinner184252a2012-06-16 02:57:41 +02001987 const char *end = str + len;
Victor Stinnerc5166102012-02-22 13:55:02 +01001988
1989 switch (kind) {
1990 case PyUnicode_1BYTE_KIND: {
Victor Stinnerc5166102012-02-22 13:55:02 +01001991 assert(index + len <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner8c6db452012-10-06 00:40:45 +02001992#ifdef Py_DEBUG
1993 if (PyUnicode_IS_ASCII(unicode)) {
1994 Py_UCS4 maxchar = ucs1lib_find_max_char(
1995 (const Py_UCS1*)str,
1996 (const Py_UCS1*)str + len);
1997 assert(maxchar < 128);
1998 }
1999#endif
Antoine Pitrouba6bafc2012-02-22 16:41:50 +01002000 memcpy((char *) data + index, str, len);
Victor Stinner184252a2012-06-16 02:57:41 +02002001 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01002002 }
2003 case PyUnicode_2BYTE_KIND: {
2004 Py_UCS2 *start = (Py_UCS2 *)data + index;
2005 Py_UCS2 *ucs2 = start;
2006 assert(index <= PyUnicode_GET_LENGTH(unicode));
2007
Victor Stinner184252a2012-06-16 02:57:41 +02002008 for (; str < end; ++ucs2, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01002009 *ucs2 = (Py_UCS2)*str;
2010
2011 assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner184252a2012-06-16 02:57:41 +02002012 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01002013 }
2014 default: {
2015 Py_UCS4 *start = (Py_UCS4 *)data + index;
2016 Py_UCS4 *ucs4 = start;
2017 assert(kind == PyUnicode_4BYTE_KIND);
2018 assert(index <= PyUnicode_GET_LENGTH(unicode));
2019
Victor Stinner184252a2012-06-16 02:57:41 +02002020 for (; str < end; ++ucs4, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01002021 *ucs4 = (Py_UCS4)*str;
2022
2023 assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinnerc5166102012-02-22 13:55:02 +01002024 }
2025 }
2026}
2027
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002028static PyObject*
2029get_latin1_char(unsigned char ch)
2030{
Victor Stinnera464fc12011-10-02 20:39:30 +02002031 PyObject *unicode = unicode_latin1[ch];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002032 if (!unicode) {
Victor Stinnera464fc12011-10-02 20:39:30 +02002033 unicode = PyUnicode_New(1, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002034 if (!unicode)
2035 return NULL;
2036 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002037 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002038 unicode_latin1[ch] = unicode;
2039 }
2040 Py_INCREF(unicode);
Victor Stinnera464fc12011-10-02 20:39:30 +02002041 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002042}
2043
Victor Stinner985a82a2014-01-03 12:53:47 +01002044static PyObject*
2045unicode_char(Py_UCS4 ch)
2046{
2047 PyObject *unicode;
2048
2049 assert(ch <= MAX_UNICODE);
2050
Victor Stinnerf3b46b42014-01-03 13:16:00 +01002051 if (ch < 256)
2052 return get_latin1_char(ch);
2053
Victor Stinner985a82a2014-01-03 12:53:47 +01002054 unicode = PyUnicode_New(1, ch);
2055 if (unicode == NULL)
2056 return NULL;
Serhiy Storchaka2e58f1a2016-10-09 23:44:48 +03002057
2058 assert(PyUnicode_KIND(unicode) != PyUnicode_1BYTE_KIND);
2059 if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
Victor Stinner985a82a2014-01-03 12:53:47 +01002060 PyUnicode_2BYTE_DATA(unicode)[0] = (Py_UCS2)ch;
Serhiy Storchaka2e58f1a2016-10-09 23:44:48 +03002061 } else {
Victor Stinner985a82a2014-01-03 12:53:47 +01002062 assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
2063 PyUnicode_4BYTE_DATA(unicode)[0] = ch;
2064 }
2065 assert(_PyUnicode_CheckConsistency(unicode, 1));
2066 return unicode;
2067}
2068
Alexander Belopolsky40018472011-02-26 01:02:56 +00002069PyObject *
2070PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002071{
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02002072 if (u == NULL)
2073 return (PyObject*)_PyUnicode_New(size);
2074
2075 if (size < 0) {
2076 PyErr_BadInternalCall();
2077 return NULL;
2078 }
2079
2080 return PyUnicode_FromWideChar(u, size);
2081}
2082
2083PyObject *
2084PyUnicode_FromWideChar(const wchar_t *u, Py_ssize_t size)
2085{
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002086 PyObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002087 Py_UCS4 maxchar = 0;
2088 Py_ssize_t num_surrogates;
2089
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02002090 if (u == NULL && size != 0) {
2091 PyErr_BadInternalCall();
2092 return NULL;
2093 }
2094
2095 if (size == -1) {
2096 size = wcslen(u);
2097 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002098
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002099 /* If the Unicode data is known at construction time, we can apply
2100 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002101
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002102 /* Optimization for empty strings */
Serhiy Storchaka678db842013-01-26 12:16:36 +02002103 if (size == 0)
2104 _Py_RETURN_UNICODE_EMPTY();
Tim Petersced69f82003-09-16 20:30:58 +00002105
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002106 /* Single character Unicode objects in the Latin-1 range are
2107 shared when using this constructor */
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002108 if (size == 1 && (Py_UCS4)*u < 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002109 return get_latin1_char((unsigned char)*u);
2110
2111 /* If not empty and not single character, copy the Unicode data
2112 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02002113 if (find_maxchar_surrogates(u, u + size,
2114 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002115 return NULL;
2116
Victor Stinner8faf8212011-12-08 22:14:11 +01002117 unicode = PyUnicode_New(size - num_surrogates, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002118 if (!unicode)
2119 return NULL;
2120
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002121 switch (PyUnicode_KIND(unicode)) {
2122 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02002123 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002124 u, u + size, PyUnicode_1BYTE_DATA(unicode));
2125 break;
2126 case PyUnicode_2BYTE_KIND:
2127#if Py_UNICODE_SIZE == 2
Christian Heimesf051e432016-09-13 20:22:02 +02002128 memcpy(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002129#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02002130 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002131 u, u + size, PyUnicode_2BYTE_DATA(unicode));
2132#endif
2133 break;
2134 case PyUnicode_4BYTE_KIND:
2135#if SIZEOF_WCHAR_T == 2
2136 /* This is the only case which has to process surrogates, thus
2137 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02002138 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002139#else
2140 assert(num_surrogates == 0);
Christian Heimesf051e432016-09-13 20:22:02 +02002141 memcpy(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002142#endif
2143 break;
2144 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07002145 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002146 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002147
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002148 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002149}
2150
Alexander Belopolsky40018472011-02-26 01:02:56 +00002151PyObject *
2152PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00002153{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002154 if (size < 0) {
2155 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00002156 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00002157 return NULL;
2158 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002159 if (u != NULL)
2160 return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
2161 else
2162 return (PyObject *)_PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00002163}
2164
Alexander Belopolsky40018472011-02-26 01:02:56 +00002165PyObject *
2166PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00002167{
2168 size_t size = strlen(u);
2169 if (size > PY_SSIZE_T_MAX) {
2170 PyErr_SetString(PyExc_OverflowError, "input too long");
2171 return NULL;
2172 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002173 return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002174}
2175
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002176PyObject *
2177_PyUnicode_FromId(_Py_Identifier *id)
2178{
2179 if (!id->object) {
Victor Stinnerd1cd99b2012-02-07 23:05:55 +01002180 id->object = PyUnicode_DecodeUTF8Stateful(id->string,
2181 strlen(id->string),
2182 NULL, NULL);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002183 if (!id->object)
2184 return NULL;
2185 PyUnicode_InternInPlace(&id->object);
2186 assert(!id->next);
2187 id->next = static_strings;
2188 static_strings = id;
2189 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002190 return id->object;
2191}
2192
2193void
2194_PyUnicode_ClearStaticStrings()
2195{
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002196 _Py_Identifier *tmp, *s = static_strings;
2197 while (s) {
Serhiy Storchaka505ff752014-02-09 13:33:53 +02002198 Py_CLEAR(s->object);
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002199 tmp = s->next;
2200 s->next = NULL;
2201 s = tmp;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002202 }
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002203 static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002204}
2205
Benjamin Peterson0df54292012-03-26 14:50:32 -04002206/* Internal function, doesn't check maximum character */
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002207
Victor Stinnerd3f08822012-05-29 12:57:52 +02002208PyObject*
2209_PyUnicode_FromASCII(const char *buffer, Py_ssize_t size)
Victor Stinner702c7342011-10-05 13:50:52 +02002210{
Victor Stinnerd3f08822012-05-29 12:57:52 +02002211 const unsigned char *s = (const unsigned char *)buffer;
Victor Stinner785938e2011-12-11 20:09:03 +01002212 PyObject *unicode;
Victor Stinnere6b2d442011-12-11 21:54:30 +01002213 if (size == 1) {
Victor Stinner0617b6e2011-10-05 23:26:01 +02002214#ifdef Py_DEBUG
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002215 assert((unsigned char)s[0] < 128);
Victor Stinner0617b6e2011-10-05 23:26:01 +02002216#endif
Antoine Pitrou7c46da72011-10-06 22:07:51 +02002217 return get_latin1_char(s[0]);
Victor Stinnere6b2d442011-12-11 21:54:30 +01002218 }
Victor Stinner785938e2011-12-11 20:09:03 +01002219 unicode = PyUnicode_New(size, 127);
2220 if (!unicode)
Victor Stinner702c7342011-10-05 13:50:52 +02002221 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01002222 memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
2223 assert(_PyUnicode_CheckConsistency(unicode, 1));
2224 return unicode;
Victor Stinner702c7342011-10-05 13:50:52 +02002225}
2226
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002227static Py_UCS4
2228kind_maxchar_limit(unsigned int kind)
2229{
Benjamin Petersonead6b532011-12-20 17:23:42 -06002230 switch (kind) {
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002231 case PyUnicode_1BYTE_KIND:
2232 return 0x80;
2233 case PyUnicode_2BYTE_KIND:
2234 return 0x100;
2235 case PyUnicode_4BYTE_KIND:
2236 return 0x10000;
2237 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07002238 Py_UNREACHABLE();
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002239 }
2240}
2241
Victor Stinner702c7342011-10-05 13:50:52 +02002242static PyObject*
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002243_PyUnicode_FromUCS1(const Py_UCS1* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00002244{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002245 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002246 unsigned char max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002247
Serhiy Storchaka678db842013-01-26 12:16:36 +02002248 if (size == 0)
2249 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002250 assert(size > 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02002251 if (size == 1)
2252 return get_latin1_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002253
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002254 max_char = ucs1lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002255 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002256 if (!res)
2257 return NULL;
2258 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002259 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002260 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00002261}
2262
Victor Stinnere57b1c02011-09-28 22:20:48 +02002263static PyObject*
2264_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002265{
2266 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002267 Py_UCS2 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002268
Serhiy Storchaka678db842013-01-26 12:16:36 +02002269 if (size == 0)
2270 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002271 assert(size > 0);
Victor Stinner985a82a2014-01-03 12:53:47 +01002272 if (size == 1)
2273 return unicode_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002274
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002275 max_char = ucs2lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002276 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002277 if (!res)
2278 return NULL;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002279 if (max_char >= 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002280 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002281 else {
2282 _PyUnicode_CONVERT_BYTES(
2283 Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
2284 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002285 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002286 return res;
2287}
2288
Victor Stinnere57b1c02011-09-28 22:20:48 +02002289static PyObject*
2290_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002291{
2292 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002293 Py_UCS4 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002294
Serhiy Storchaka678db842013-01-26 12:16:36 +02002295 if (size == 0)
2296 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002297 assert(size > 0);
Victor Stinner985a82a2014-01-03 12:53:47 +01002298 if (size == 1)
2299 return unicode_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002300
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002301 max_char = ucs4lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002302 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002303 if (!res)
2304 return NULL;
Antoine Pitrou950468e2011-10-11 22:45:48 +02002305 if (max_char < 256)
2306 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
2307 PyUnicode_1BYTE_DATA(res));
2308 else if (max_char < 0x10000)
2309 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
2310 PyUnicode_2BYTE_DATA(res));
2311 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002312 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002313 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002314 return res;
2315}
2316
2317PyObject*
2318PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
2319{
Victor Stinnercfed46e2011-11-22 01:29:14 +01002320 if (size < 0) {
2321 PyErr_SetString(PyExc_ValueError, "size must be positive");
2322 return NULL;
2323 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002324 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002325 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002326 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002327 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002328 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002329 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002330 return _PyUnicode_FromUCS4(buffer, size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002331 default:
Victor Stinnerb9275c12011-10-05 14:01:42 +02002332 PyErr_SetString(PyExc_SystemError, "invalid kind");
2333 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002334 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002335}
2336
Victor Stinnerece58de2012-04-23 23:36:38 +02002337Py_UCS4
2338_PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end)
2339{
2340 enum PyUnicode_Kind kind;
2341 void *startptr, *endptr;
2342
2343 assert(PyUnicode_IS_READY(unicode));
2344 assert(0 <= start);
2345 assert(end <= PyUnicode_GET_LENGTH(unicode));
2346 assert(start <= end);
2347
2348 if (start == 0 && end == PyUnicode_GET_LENGTH(unicode))
2349 return PyUnicode_MAX_CHAR_VALUE(unicode);
2350
2351 if (start == end)
2352 return 127;
2353
Victor Stinner94d558b2012-04-27 22:26:58 +02002354 if (PyUnicode_IS_ASCII(unicode))
2355 return 127;
2356
Victor Stinnerece58de2012-04-23 23:36:38 +02002357 kind = PyUnicode_KIND(unicode);
Benjamin Petersonf3b7d862012-04-23 18:07:01 -04002358 startptr = PyUnicode_DATA(unicode);
Benjamin Petersonb9f4c9d2012-04-23 21:45:40 -04002359 endptr = (char *)startptr + end * kind;
2360 startptr = (char *)startptr + start * kind;
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002361 switch(kind) {
2362 case PyUnicode_1BYTE_KIND:
2363 return ucs1lib_find_max_char(startptr, endptr);
2364 case PyUnicode_2BYTE_KIND:
2365 return ucs2lib_find_max_char(startptr, endptr);
2366 case PyUnicode_4BYTE_KIND:
2367 return ucs4lib_find_max_char(startptr, endptr);
Victor Stinnerece58de2012-04-23 23:36:38 +02002368 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07002369 Py_UNREACHABLE();
Victor Stinnerece58de2012-04-23 23:36:38 +02002370 }
2371}
2372
Victor Stinner25a4b292011-10-06 12:31:55 +02002373/* Ensure that a string uses the most efficient storage, if it is not the
2374 case: create a new string with of the right kind. Write NULL into *p_unicode
2375 on error. */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02002376static void
Victor Stinner25a4b292011-10-06 12:31:55 +02002377unicode_adjust_maxchar(PyObject **p_unicode)
2378{
2379 PyObject *unicode, *copy;
2380 Py_UCS4 max_char;
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002381 Py_ssize_t len;
Victor Stinner25a4b292011-10-06 12:31:55 +02002382 unsigned int kind;
2383
2384 assert(p_unicode != NULL);
2385 unicode = *p_unicode;
2386 assert(PyUnicode_IS_READY(unicode));
2387 if (PyUnicode_IS_ASCII(unicode))
2388 return;
2389
2390 len = PyUnicode_GET_LENGTH(unicode);
2391 kind = PyUnicode_KIND(unicode);
2392 if (kind == PyUnicode_1BYTE_KIND) {
2393 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002394 max_char = ucs1lib_find_max_char(u, u + len);
2395 if (max_char >= 128)
2396 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002397 }
2398 else if (kind == PyUnicode_2BYTE_KIND) {
2399 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002400 max_char = ucs2lib_find_max_char(u, u + len);
2401 if (max_char >= 256)
2402 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002403 }
2404 else {
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002405 const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
Victor Stinner25a4b292011-10-06 12:31:55 +02002406 assert(kind == PyUnicode_4BYTE_KIND);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002407 max_char = ucs4lib_find_max_char(u, u + len);
2408 if (max_char >= 0x10000)
2409 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002410 }
Victor Stinner25a4b292011-10-06 12:31:55 +02002411 copy = PyUnicode_New(len, max_char);
Victor Stinnerca439ee2012-06-16 03:17:34 +02002412 if (copy != NULL)
2413 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, len);
Victor Stinner25a4b292011-10-06 12:31:55 +02002414 Py_DECREF(unicode);
2415 *p_unicode = copy;
2416}
2417
Victor Stinner034f6cf2011-09-30 02:26:44 +02002418PyObject*
Victor Stinnerbf6e5602011-12-12 01:53:47 +01002419_PyUnicode_Copy(PyObject *unicode)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002420{
Victor Stinner87af4f22011-11-21 23:03:47 +01002421 Py_ssize_t length;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002422 PyObject *copy;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002423
Victor Stinner034f6cf2011-09-30 02:26:44 +02002424 if (!PyUnicode_Check(unicode)) {
2425 PyErr_BadInternalCall();
2426 return NULL;
2427 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05002428 if (PyUnicode_READY(unicode) == -1)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002429 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002430
Victor Stinner87af4f22011-11-21 23:03:47 +01002431 length = PyUnicode_GET_LENGTH(unicode);
2432 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002433 if (!copy)
2434 return NULL;
2435 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
2436
Christian Heimesf051e432016-09-13 20:22:02 +02002437 memcpy(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
Victor Stinner87af4f22011-11-21 23:03:47 +01002438 length * PyUnicode_KIND(unicode));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002439 assert(_PyUnicode_CheckConsistency(copy, 1));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002440 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02002441}
2442
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002443
Victor Stinnerbc603d12011-10-02 01:00:40 +02002444/* Widen Unicode objects to larger buffers. Don't write terminating null
2445 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002446
2447void*
2448_PyUnicode_AsKind(PyObject *s, unsigned int kind)
2449{
Victor Stinnerbc603d12011-10-02 01:00:40 +02002450 Py_ssize_t len;
2451 void *result;
2452 unsigned int skind;
2453
Benjamin Petersonbac79492012-01-14 13:34:47 -05002454 if (PyUnicode_READY(s) == -1)
Victor Stinnerbc603d12011-10-02 01:00:40 +02002455 return NULL;
2456
2457 len = PyUnicode_GET_LENGTH(s);
2458 skind = PyUnicode_KIND(s);
2459 if (skind >= kind) {
Victor Stinner01698042011-10-04 00:04:26 +02002460 PyErr_SetString(PyExc_SystemError, "invalid widening attempt");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002461 return NULL;
2462 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002463 switch (kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02002464 case PyUnicode_2BYTE_KIND:
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002465 result = PyMem_New(Py_UCS2, len);
Victor Stinnerbc603d12011-10-02 01:00:40 +02002466 if (!result)
2467 return PyErr_NoMemory();
2468 assert(skind == PyUnicode_1BYTE_KIND);
2469 _PyUnicode_CONVERT_BYTES(
2470 Py_UCS1, Py_UCS2,
2471 PyUnicode_1BYTE_DATA(s),
2472 PyUnicode_1BYTE_DATA(s) + len,
2473 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002474 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002475 case PyUnicode_4BYTE_KIND:
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002476 result = PyMem_New(Py_UCS4, len);
Victor Stinnerbc603d12011-10-02 01:00:40 +02002477 if (!result)
2478 return PyErr_NoMemory();
2479 if (skind == PyUnicode_2BYTE_KIND) {
2480 _PyUnicode_CONVERT_BYTES(
2481 Py_UCS2, Py_UCS4,
2482 PyUnicode_2BYTE_DATA(s),
2483 PyUnicode_2BYTE_DATA(s) + len,
2484 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002485 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02002486 else {
2487 assert(skind == PyUnicode_1BYTE_KIND);
2488 _PyUnicode_CONVERT_BYTES(
2489 Py_UCS1, Py_UCS4,
2490 PyUnicode_1BYTE_DATA(s),
2491 PyUnicode_1BYTE_DATA(s) + len,
2492 result);
2493 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002494 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002495 default:
2496 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002497 }
Victor Stinner01698042011-10-04 00:04:26 +02002498 PyErr_SetString(PyExc_SystemError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002499 return NULL;
2500}
2501
2502static Py_UCS4*
2503as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2504 int copy_null)
2505{
2506 int kind;
2507 void *data;
2508 Py_ssize_t len, targetlen;
2509 if (PyUnicode_READY(string) == -1)
2510 return NULL;
2511 kind = PyUnicode_KIND(string);
2512 data = PyUnicode_DATA(string);
2513 len = PyUnicode_GET_LENGTH(string);
2514 targetlen = len;
2515 if (copy_null)
2516 targetlen++;
2517 if (!target) {
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002518 target = PyMem_New(Py_UCS4, targetlen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002519 if (!target) {
2520 PyErr_NoMemory();
2521 return NULL;
2522 }
2523 }
2524 else {
2525 if (targetsize < targetlen) {
2526 PyErr_Format(PyExc_SystemError,
2527 "string is longer than the buffer");
2528 if (copy_null && 0 < targetsize)
2529 target[0] = 0;
2530 return NULL;
2531 }
2532 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002533 if (kind == PyUnicode_1BYTE_KIND) {
2534 Py_UCS1 *start = (Py_UCS1 *) data;
2535 _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002536 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002537 else if (kind == PyUnicode_2BYTE_KIND) {
2538 Py_UCS2 *start = (Py_UCS2 *) data;
2539 _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2540 }
2541 else {
2542 assert(kind == PyUnicode_4BYTE_KIND);
Christian Heimesf051e432016-09-13 20:22:02 +02002543 memcpy(target, data, len * sizeof(Py_UCS4));
Antoine Pitrou950468e2011-10-11 22:45:48 +02002544 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002545 if (copy_null)
2546 target[len] = 0;
2547 return target;
2548}
2549
2550Py_UCS4*
2551PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2552 int copy_null)
2553{
Antoine Pitroude20b0b2011-11-10 21:47:38 +01002554 if (target == NULL || targetsize < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002555 PyErr_BadInternalCall();
2556 return NULL;
2557 }
2558 return as_ucs4(string, target, targetsize, copy_null);
2559}
2560
2561Py_UCS4*
2562PyUnicode_AsUCS4Copy(PyObject *string)
2563{
2564 return as_ucs4(string, NULL, 0, 1);
2565}
2566
/* Maximum number of characters required for the output of %lld or %p.
   At most ceil(log10(256) * SIZEOF_LONG_LONG) digits are needed, plus 1
   for the sign; 53/22 is used as an upper bound for log10(256). */
#define MAX_LONG_LONG_CHARS (2 + ((SIZEOF_LONG_LONG * 53) - 1) / 22)
Victor Stinner96865452011-03-01 23:44:09 +00002571
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002572static int
2573unicode_fromformat_write_str(_PyUnicodeWriter *writer, PyObject *str,
2574 Py_ssize_t width, Py_ssize_t precision)
2575{
2576 Py_ssize_t length, fill, arglen;
2577 Py_UCS4 maxchar;
2578
2579 if (PyUnicode_READY(str) == -1)
2580 return -1;
2581
2582 length = PyUnicode_GET_LENGTH(str);
2583 if ((precision == -1 || precision >= length)
2584 && width <= length)
2585 return _PyUnicodeWriter_WriteStr(writer, str);
2586
2587 if (precision != -1)
2588 length = Py_MIN(precision, length);
2589
2590 arglen = Py_MAX(length, width);
2591 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
2592 maxchar = _PyUnicode_FindMaxChar(str, 0, length);
2593 else
2594 maxchar = writer->maxchar;
2595
2596 if (_PyUnicodeWriter_Prepare(writer, arglen, maxchar) == -1)
2597 return -1;
2598
2599 if (width > length) {
2600 fill = width - length;
2601 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, ' ') == -1)
2602 return -1;
2603 writer->pos += fill;
2604 }
2605
2606 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
2607 str, 0, length);
2608 writer->pos += length;
2609 return 0;
2610}
2611
2612static int
Victor Stinner998b8062018-09-12 00:23:25 +02002613unicode_fromformat_write_cstr(_PyUnicodeWriter *writer, const char *str,
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002614 Py_ssize_t width, Py_ssize_t precision)
2615{
2616 /* UTF-8 */
2617 Py_ssize_t length;
2618 PyObject *unicode;
2619 int res;
2620
Serhiy Storchakad586ccb2019-01-12 10:30:35 +02002621 if (precision == -1) {
2622 length = strlen(str);
2623 }
2624 else {
2625 length = 0;
2626 while (length < precision && str[length]) {
2627 length++;
2628 }
2629 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002630 unicode = PyUnicode_DecodeUTF8Stateful(str, length, "replace", NULL);
2631 if (unicode == NULL)
2632 return -1;
2633
2634 res = unicode_fromformat_write_str(writer, unicode, width, -1);
2635 Py_DECREF(unicode);
2636 return res;
2637}
2638
Victor Stinner96865452011-03-01 23:44:09 +00002639static const char*
Victor Stinnere215d962012-10-06 23:03:36 +02002640unicode_fromformat_arg(_PyUnicodeWriter *writer,
2641 const char *f, va_list *vargs)
Victor Stinner96865452011-03-01 23:44:09 +00002642{
Victor Stinnere215d962012-10-06 23:03:36 +02002643 const char *p;
2644 Py_ssize_t len;
2645 int zeropad;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002646 Py_ssize_t width;
2647 Py_ssize_t precision;
Victor Stinnere215d962012-10-06 23:03:36 +02002648 int longflag;
2649 int longlongflag;
2650 int size_tflag;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002651 Py_ssize_t fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002652
2653 p = f;
2654 f++;
Victor Stinner4c63a972012-10-06 23:55:33 +02002655 zeropad = 0;
2656 if (*f == '0') {
2657 zeropad = 1;
2658 f++;
2659 }
Victor Stinner96865452011-03-01 23:44:09 +00002660
2661 /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002662 width = -1;
2663 if (Py_ISDIGIT((unsigned)*f)) {
2664 width = *f - '0';
Victor Stinner96865452011-03-01 23:44:09 +00002665 f++;
Victor Stinnere215d962012-10-06 23:03:36 +02002666 while (Py_ISDIGIT((unsigned)*f)) {
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002667 if (width > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
Victor Stinner3921e902012-10-06 23:05:00 +02002668 PyErr_SetString(PyExc_ValueError,
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002669 "width too big");
Victor Stinner3921e902012-10-06 23:05:00 +02002670 return NULL;
2671 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002672 width = (width * 10) + (*f - '0');
Victor Stinnere215d962012-10-06 23:03:36 +02002673 f++;
2674 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002675 }
2676 precision = -1;
2677 if (*f == '.') {
2678 f++;
2679 if (Py_ISDIGIT((unsigned)*f)) {
2680 precision = (*f - '0');
2681 f++;
2682 while (Py_ISDIGIT((unsigned)*f)) {
2683 if (precision > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
2684 PyErr_SetString(PyExc_ValueError,
2685 "precision too big");
2686 return NULL;
2687 }
2688 precision = (precision * 10) + (*f - '0');
2689 f++;
2690 }
2691 }
Victor Stinner96865452011-03-01 23:44:09 +00002692 if (*f == '%') {
2693 /* "%.3%s" => f points to "3" */
2694 f--;
2695 }
2696 }
2697 if (*f == '\0') {
Victor Stinnere215d962012-10-06 23:03:36 +02002698 /* bogus format "%.123" => go backward, f points to "3" */
Victor Stinner96865452011-03-01 23:44:09 +00002699 f--;
2700 }
Victor Stinner96865452011-03-01 23:44:09 +00002701
2702 /* Handle %ld, %lu, %lld and %llu. */
2703 longflag = 0;
2704 longlongflag = 0;
Victor Stinnere7faec12011-03-02 00:01:53 +00002705 size_tflag = 0;
Victor Stinner96865452011-03-01 23:44:09 +00002706 if (*f == 'l') {
Victor Stinner6d970f42011-03-02 00:04:25 +00002707 if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
Victor Stinner96865452011-03-01 23:44:09 +00002708 longflag = 1;
2709 ++f;
2710 }
Victor Stinner96865452011-03-01 23:44:09 +00002711 else if (f[1] == 'l' &&
Victor Stinner6d970f42011-03-02 00:04:25 +00002712 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002713 longlongflag = 1;
2714 f += 2;
2715 }
Victor Stinner96865452011-03-01 23:44:09 +00002716 }
2717 /* handle the size_t flag. */
Victor Stinner6d970f42011-03-02 00:04:25 +00002718 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002719 size_tflag = 1;
2720 ++f;
2721 }
Victor Stinnere215d962012-10-06 23:03:36 +02002722
2723 if (f[1] == '\0')
2724 writer->overallocate = 0;
2725
2726 switch (*f) {
2727 case 'c':
2728 {
2729 int ordinal = va_arg(*vargs, int);
Victor Stinnerff5a8482012-10-06 23:05:45 +02002730 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Serhiy Storchakac89533f2013-06-23 20:21:16 +03002731 PyErr_SetString(PyExc_OverflowError,
Victor Stinnerff5a8482012-10-06 23:05:45 +02002732 "character argument not in range(0x110000)");
2733 return NULL;
2734 }
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02002735 if (_PyUnicodeWriter_WriteCharInline(writer, ordinal) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002736 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002737 break;
2738 }
2739
2740 case 'i':
2741 case 'd':
2742 case 'u':
2743 case 'x':
2744 {
2745 /* used by sprintf */
Victor Stinner15a11362012-10-06 23:48:20 +02002746 char buffer[MAX_LONG_LONG_CHARS];
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002747 Py_ssize_t arglen;
Victor Stinnere215d962012-10-06 23:03:36 +02002748
2749 if (*f == 'u') {
Victor Stinnere215d962012-10-06 23:03:36 +02002750 if (longflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002751 len = sprintf(buffer, "%lu",
Victor Stinnere215d962012-10-06 23:03:36 +02002752 va_arg(*vargs, unsigned long));
Victor Stinnere215d962012-10-06 23:03:36 +02002753 else if (longlongflag)
Benjamin Peterson47ff0732016-09-08 09:15:54 -07002754 len = sprintf(buffer, "%llu",
Benjamin Petersonaf580df2016-09-06 10:46:49 -07002755 va_arg(*vargs, unsigned long long));
Victor Stinnere215d962012-10-06 23:03:36 +02002756 else if (size_tflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002757 len = sprintf(buffer, "%" PY_FORMAT_SIZE_T "u",
Victor Stinnere215d962012-10-06 23:03:36 +02002758 va_arg(*vargs, size_t));
2759 else
Victor Stinner3aa979e2014-11-18 21:40:51 +01002760 len = sprintf(buffer, "%u",
Victor Stinnere215d962012-10-06 23:03:36 +02002761 va_arg(*vargs, unsigned int));
2762 }
2763 else if (*f == 'x') {
Victor Stinner3aa979e2014-11-18 21:40:51 +01002764 len = sprintf(buffer, "%x", va_arg(*vargs, int));
Victor Stinnere215d962012-10-06 23:03:36 +02002765 }
2766 else {
Victor Stinnere215d962012-10-06 23:03:36 +02002767 if (longflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002768 len = sprintf(buffer, "%li",
Victor Stinnere215d962012-10-06 23:03:36 +02002769 va_arg(*vargs, long));
Victor Stinnere215d962012-10-06 23:03:36 +02002770 else if (longlongflag)
Benjamin Peterson47ff0732016-09-08 09:15:54 -07002771 len = sprintf(buffer, "%lli",
Benjamin Petersonaf580df2016-09-06 10:46:49 -07002772 va_arg(*vargs, long long));
Victor Stinnere215d962012-10-06 23:03:36 +02002773 else if (size_tflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002774 len = sprintf(buffer, "%" PY_FORMAT_SIZE_T "i",
Victor Stinnere215d962012-10-06 23:03:36 +02002775 va_arg(*vargs, Py_ssize_t));
2776 else
Victor Stinner3aa979e2014-11-18 21:40:51 +01002777 len = sprintf(buffer, "%i",
Victor Stinnere215d962012-10-06 23:03:36 +02002778 va_arg(*vargs, int));
2779 }
2780 assert(len >= 0);
2781
Victor Stinnere215d962012-10-06 23:03:36 +02002782 if (precision < len)
2783 precision = len;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002784
2785 arglen = Py_MAX(precision, width);
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002786 if (_PyUnicodeWriter_Prepare(writer, arglen, 127) == -1)
2787 return NULL;
2788
Victor Stinnere215d962012-10-06 23:03:36 +02002789 if (width > precision) {
2790 Py_UCS4 fillchar;
2791 fill = width - precision;
2792 fillchar = zeropad?'0':' ';
Victor Stinner15a11362012-10-06 23:48:20 +02002793 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, fillchar) == -1)
2794 return NULL;
2795 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002796 }
Victor Stinner15a11362012-10-06 23:48:20 +02002797 if (precision > len) {
Victor Stinnere215d962012-10-06 23:03:36 +02002798 fill = precision - len;
Victor Stinner15a11362012-10-06 23:48:20 +02002799 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, '0') == -1)
2800 return NULL;
2801 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002802 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002803
Victor Stinner4a587072013-11-19 12:54:53 +01002804 if (_PyUnicodeWriter_WriteASCIIString(writer, buffer, len) < 0)
2805 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002806 break;
2807 }
2808
2809 case 'p':
2810 {
2811 char number[MAX_LONG_LONG_CHARS];
2812
2813 len = sprintf(number, "%p", va_arg(*vargs, void*));
2814 assert(len >= 0);
2815
2816 /* %p is ill-defined: ensure leading 0x. */
2817 if (number[1] == 'X')
2818 number[1] = 'x';
2819 else if (number[1] != 'x') {
2820 memmove(number + 2, number,
2821 strlen(number) + 1);
2822 number[0] = '0';
2823 number[1] = 'x';
2824 len += 2;
2825 }
2826
Victor Stinner4a587072013-11-19 12:54:53 +01002827 if (_PyUnicodeWriter_WriteASCIIString(writer, number, len) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002828 return NULL;
2829 break;
2830 }
2831
2832 case 's':
2833 {
2834 /* UTF-8 */
2835 const char *s = va_arg(*vargs, const char*);
Victor Stinner998b8062018-09-12 00:23:25 +02002836 if (unicode_fromformat_write_cstr(writer, s, width, precision) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002837 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002838 break;
2839 }
2840
2841 case 'U':
2842 {
2843 PyObject *obj = va_arg(*vargs, PyObject *);
2844 assert(obj && _PyUnicode_CHECK(obj));
2845
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002846 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002847 return NULL;
2848 break;
2849 }
2850
2851 case 'V':
2852 {
2853 PyObject *obj = va_arg(*vargs, PyObject *);
2854 const char *str = va_arg(*vargs, const char *);
Victor Stinnere215d962012-10-06 23:03:36 +02002855 if (obj) {
2856 assert(_PyUnicode_CHECK(obj));
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002857 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002858 return NULL;
2859 }
2860 else {
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002861 assert(str != NULL);
Victor Stinner998b8062018-09-12 00:23:25 +02002862 if (unicode_fromformat_write_cstr(writer, str, width, precision) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002863 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002864 }
2865 break;
2866 }
2867
2868 case 'S':
2869 {
2870 PyObject *obj = va_arg(*vargs, PyObject *);
2871 PyObject *str;
2872 assert(obj);
2873 str = PyObject_Str(obj);
2874 if (!str)
2875 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002876 if (unicode_fromformat_write_str(writer, str, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002877 Py_DECREF(str);
2878 return NULL;
2879 }
2880 Py_DECREF(str);
2881 break;
2882 }
2883
2884 case 'R':
2885 {
2886 PyObject *obj = va_arg(*vargs, PyObject *);
2887 PyObject *repr;
2888 assert(obj);
2889 repr = PyObject_Repr(obj);
2890 if (!repr)
2891 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002892 if (unicode_fromformat_write_str(writer, repr, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002893 Py_DECREF(repr);
2894 return NULL;
2895 }
2896 Py_DECREF(repr);
2897 break;
2898 }
2899
2900 case 'A':
2901 {
2902 PyObject *obj = va_arg(*vargs, PyObject *);
2903 PyObject *ascii;
2904 assert(obj);
2905 ascii = PyObject_ASCII(obj);
2906 if (!ascii)
2907 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002908 if (unicode_fromformat_write_str(writer, ascii, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002909 Py_DECREF(ascii);
2910 return NULL;
2911 }
2912 Py_DECREF(ascii);
2913 break;
2914 }
2915
2916 case '%':
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02002917 if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002918 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002919 break;
2920
2921 default:
2922 /* if we stumble upon an unknown formatting code, copy the rest
2923 of the format string to the output string. (we cannot just
2924 skip the code, since there's no way to know what's in the
2925 argument list) */
2926 len = strlen(p);
Victor Stinner4a587072013-11-19 12:54:53 +01002927 if (_PyUnicodeWriter_WriteLatin1String(writer, p, len) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002928 return NULL;
2929 f = p+len;
2930 return f;
2931 }
2932
2933 f++;
Victor Stinner96865452011-03-01 23:44:09 +00002934 return f;
2935}
2936
Walter Dörwaldd2034312007-05-18 16:29:38 +00002937PyObject *
2938PyUnicode_FromFormatV(const char *format, va_list vargs)
2939{
Victor Stinnere215d962012-10-06 23:03:36 +02002940 va_list vargs2;
2941 const char *f;
2942 _PyUnicodeWriter writer;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002943
Victor Stinner8f674cc2013-04-17 23:02:17 +02002944 _PyUnicodeWriter_Init(&writer);
2945 writer.min_length = strlen(format) + 100;
2946 writer.overallocate = 1;
Victor Stinnere215d962012-10-06 23:03:36 +02002947
Benjamin Peterson0c212142016-09-20 20:39:33 -07002948 // Copy varags to be able to pass a reference to a subfunction.
2949 va_copy(vargs2, vargs);
Victor Stinnere215d962012-10-06 23:03:36 +02002950
2951 for (f = format; *f; ) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00002952 if (*f == '%') {
Victor Stinnere215d962012-10-06 23:03:36 +02002953 f = unicode_fromformat_arg(&writer, f, &vargs2);
2954 if (f == NULL)
2955 goto fail;
Victor Stinner1205f272010-09-11 00:54:47 +00002956 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002957 else {
Victor Stinnere215d962012-10-06 23:03:36 +02002958 const char *p;
2959 Py_ssize_t len;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002960
Victor Stinnere215d962012-10-06 23:03:36 +02002961 p = f;
2962 do
2963 {
2964 if ((unsigned char)*p > 127) {
2965 PyErr_Format(PyExc_ValueError,
2966 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
2967 "string, got a non-ASCII byte: 0x%02x",
2968 (unsigned char)*p);
Victor Stinner1ddf53d2016-09-21 14:13:14 +02002969 goto fail;
Victor Stinnere215d962012-10-06 23:03:36 +02002970 }
2971 p++;
2972 }
2973 while (*p != '\0' && *p != '%');
2974 len = p - f;
2975
2976 if (*p == '\0')
2977 writer.overallocate = 0;
Victor Stinner4a587072013-11-19 12:54:53 +01002978
2979 if (_PyUnicodeWriter_WriteASCIIString(&writer, f, len) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002980 goto fail;
Victor Stinnere215d962012-10-06 23:03:36 +02002981
2982 f = p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002983 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002984 }
Christian Heimes2f2fee12016-09-21 11:37:27 +02002985 va_end(vargs2);
Victor Stinnere215d962012-10-06 23:03:36 +02002986 return _PyUnicodeWriter_Finish(&writer);
2987
2988 fail:
Christian Heimes2f2fee12016-09-21 11:37:27 +02002989 va_end(vargs2);
Victor Stinnere215d962012-10-06 23:03:36 +02002990 _PyUnicodeWriter_Dealloc(&writer);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002991 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002992}
2993
Walter Dörwaldd2034312007-05-18 16:29:38 +00002994PyObject *
2995PyUnicode_FromFormat(const char *format, ...)
2996{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002997 PyObject* ret;
2998 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002999
3000#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00003001 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00003002#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00003003 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00003004#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00003005 ret = PyUnicode_FromFormatV(format, vargs);
3006 va_end(vargs);
3007 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00003008}
3009
Serhiy Storchakac46db922018-10-23 22:58:24 +03003010static Py_ssize_t
3011unicode_get_widechar_size(PyObject *unicode)
3012{
3013 Py_ssize_t res;
3014
3015 assert(unicode != NULL);
3016 assert(_PyUnicode_CHECK(unicode));
3017
3018 if (_PyUnicode_WSTR(unicode) != NULL) {
3019 return PyUnicode_WSTR_LENGTH(unicode);
3020 }
3021 assert(PyUnicode_IS_READY(unicode));
3022
3023 res = _PyUnicode_LENGTH(unicode);
3024#if SIZEOF_WCHAR_T == 2
3025 if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
3026 const Py_UCS4 *s = PyUnicode_4BYTE_DATA(unicode);
3027 const Py_UCS4 *end = s + res;
3028 for (; s < end; ++s) {
3029 if (*s > 0xFFFF) {
3030 ++res;
3031 }
3032 }
3033 }
3034#endif
3035 return res;
3036}
3037
3038static void
3039unicode_copy_as_widechar(PyObject *unicode, wchar_t *w, Py_ssize_t size)
3040{
3041 const wchar_t *wstr;
3042
3043 assert(unicode != NULL);
3044 assert(_PyUnicode_CHECK(unicode));
3045
3046 wstr = _PyUnicode_WSTR(unicode);
3047 if (wstr != NULL) {
3048 memcpy(w, wstr, size * sizeof(wchar_t));
3049 return;
3050 }
3051 assert(PyUnicode_IS_READY(unicode));
3052
3053 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
3054 const Py_UCS1 *s = PyUnicode_1BYTE_DATA(unicode);
3055 for (; size--; ++s, ++w) {
3056 *w = *s;
3057 }
3058 }
3059 else {
3060#if SIZEOF_WCHAR_T == 4
3061 assert(PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND);
3062 const Py_UCS2 *s = PyUnicode_2BYTE_DATA(unicode);
3063 for (; size--; ++s, ++w) {
3064 *w = *s;
3065 }
3066#else
3067 assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
3068 const Py_UCS4 *s = PyUnicode_4BYTE_DATA(unicode);
3069 for (; size--; ++s, ++w) {
3070 Py_UCS4 ch = *s;
3071 if (ch > 0xFFFF) {
3072 assert(ch <= MAX_UNICODE);
3073 /* encode surrogate pair in this case */
3074 *w++ = Py_UNICODE_HIGH_SURROGATE(ch);
3075 if (!size--)
3076 break;
3077 *w = Py_UNICODE_LOW_SURROGATE(ch);
3078 }
3079 else {
3080 *w = ch;
3081 }
3082 }
3083#endif
3084 }
3085}
3086
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003087#ifdef HAVE_WCHAR_H
3088
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003089/* Convert a Unicode object to a wide character string.
Victor Stinner5593d8a2010-10-02 11:11:27 +00003090
Victor Stinnerd88d9832011-09-06 02:00:05 +02003091 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00003092 character) required to convert the unicode object. Ignore size argument.
3093
Victor Stinnerd88d9832011-09-06 02:00:05 +02003094 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00003095 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02003096 the null character). */
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003097Py_ssize_t
3098PyUnicode_AsWideChar(PyObject *unicode,
3099 wchar_t *w,
3100 Py_ssize_t size)
Victor Stinner137c34c2010-09-29 10:25:54 +00003101{
Victor Stinner5593d8a2010-10-02 11:11:27 +00003102 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003103
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003104 if (unicode == NULL) {
3105 PyErr_BadInternalCall();
3106 return -1;
3107 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003108 if (!PyUnicode_Check(unicode)) {
3109 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003110 return -1;
Victor Stinner5593d8a2010-10-02 11:11:27 +00003111 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003112
3113 res = unicode_get_widechar_size(unicode);
3114 if (w == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003115 return res + 1;
Serhiy Storchakac46db922018-10-23 22:58:24 +03003116 }
3117
3118 if (size > res) {
3119 size = res + 1;
3120 }
3121 else {
3122 res = size;
3123 }
3124 unicode_copy_as_widechar(unicode, w, size);
3125 return res;
Victor Stinner137c34c2010-09-29 10:25:54 +00003126}
3127
Victor Stinner137c34c2010-09-29 10:25:54 +00003128wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00003129PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00003130 Py_ssize_t *size)
3131{
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003132 wchar_t *buffer;
Victor Stinner137c34c2010-09-29 10:25:54 +00003133 Py_ssize_t buflen;
3134
3135 if (unicode == NULL) {
3136 PyErr_BadInternalCall();
3137 return NULL;
3138 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003139 if (!PyUnicode_Check(unicode)) {
3140 PyErr_BadArgument();
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003141 return NULL;
3142 }
3143
Serhiy Storchakac46db922018-10-23 22:58:24 +03003144 buflen = unicode_get_widechar_size(unicode);
3145 buffer = (wchar_t *) PyMem_NEW(wchar_t, (buflen + 1));
Victor Stinner137c34c2010-09-29 10:25:54 +00003146 if (buffer == NULL) {
3147 PyErr_NoMemory();
3148 return NULL;
3149 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003150 unicode_copy_as_widechar(unicode, buffer, buflen + 1);
3151 if (size != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00003152 *size = buflen;
Serhiy Storchakac46db922018-10-23 22:58:24 +03003153 }
3154 else if (wcslen(buffer) != (size_t)buflen) {
3155 PyMem_FREE(buffer);
3156 PyErr_SetString(PyExc_ValueError,
3157 "embedded null character");
3158 return NULL;
3159 }
Victor Stinner137c34c2010-09-29 10:25:54 +00003160 return buffer;
3161}
3162
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003163#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003164
Alexander Belopolsky40018472011-02-26 01:02:56 +00003165PyObject *
3166PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003167{
Victor Stinner8faf8212011-12-08 22:14:11 +01003168 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003169 PyErr_SetString(PyExc_ValueError,
3170 "chr() arg not in range(0x110000)");
3171 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003172 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00003173
Victor Stinner985a82a2014-01-03 12:53:47 +01003174 return unicode_char((Py_UCS4)ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003175}
3176
Alexander Belopolsky40018472011-02-26 01:02:56 +00003177PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02003178PyUnicode_FromObject(PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003179{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003180 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00003181 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003182 if (PyUnicode_CheckExact(obj)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05003183 if (PyUnicode_READY(obj) == -1)
Victor Stinnerd3a83d52011-10-01 03:09:33 +02003184 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00003185 Py_INCREF(obj);
3186 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003187 }
3188 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003189 /* For a Unicode subtype that's not a Unicode object,
3190 return a true Unicode object with the same data. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +01003191 return _PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003192 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003193 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003194 "Can't convert '%.100s' object to str implicitly",
3195 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00003196 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003197}
3198
Alexander Belopolsky40018472011-02-26 01:02:56 +00003199PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02003200PyUnicode_FromEncodedObject(PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003201 const char *encoding,
3202 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003203{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003204 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003205 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00003206
Guido van Rossumd57fd912000-03-10 22:53:23 +00003207 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003208 PyErr_BadInternalCall();
3209 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003210 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003211
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003212 /* Decoding bytes objects is the most common case and should be fast */
3213 if (PyBytes_Check(obj)) {
Serhiy Storchaka05997252013-01-26 12:14:02 +02003214 if (PyBytes_GET_SIZE(obj) == 0)
3215 _Py_RETURN_UNICODE_EMPTY();
3216 v = PyUnicode_Decode(
3217 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
3218 encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003219 return v;
3220 }
3221
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003222 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003223 PyErr_SetString(PyExc_TypeError,
3224 "decoding str is not supported");
3225 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00003226 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003227
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003228 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
3229 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
3230 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003231 "decoding to str: need a bytes-like object, %.80s found",
3232 Py_TYPE(obj)->tp_name);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003233 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00003234 }
Tim Petersced69f82003-09-16 20:30:58 +00003235
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003236 if (buffer.len == 0) {
Serhiy Storchaka05997252013-01-26 12:14:02 +02003237 PyBuffer_Release(&buffer);
3238 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +00003239 }
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00003240
Serhiy Storchaka05997252013-01-26 12:14:02 +02003241 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003242 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003243 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003244}
3245
/* Normalize an encoding name into `lower` (a buffer of `lower_len` bytes):
   similar to encodings.normalize_encoding(), but also convert to lowercase.

   Alphanumeric characters and '.' are copied in lowercase.  Every run of
   other characters between two copied characters becomes a single '_';
   such runs at the start or end of the name are dropped.

   Return 1 on success, or 0 on error (the normalized name is longer than
   lower_len-1). */
int
_Py_normalize_encoding(const char *encoding,
                       char *lower,
                       size_t lower_len)
{
    assert(encoding != NULL);

    const char *src;
    char *dst = lower;
    /* Last writable slot; it is reserved for the terminating null. */
    char *const dst_end = &lower[lower_len - 1];
    int pending_sep = 0;

    for (src = encoding; *src != 0; src++) {
        char c = *src;

        if (!(Py_ISALNUM(c) || c == '.')) {
            /* Remember that a separator is due before the next kept char. */
            pending_sep = 1;
            continue;
        }

        if (pending_sep && dst != lower) {
            if (dst == dst_end) {
                return 0;
            }
            *dst++ = '_';
        }
        pending_sep = 0;

        if (dst == dst_end) {
            return 0;
        }
        *dst++ = Py_TOLOWER(c);
    }

    *dst = '\0';
    return 1;
}
3294
Alexander Belopolsky40018472011-02-26 01:02:56 +00003295PyObject *
3296PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003297 Py_ssize_t size,
3298 const char *encoding,
3299 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00003300{
3301 PyObject *buffer = NULL, *unicode;
3302 Py_buffer info;
Victor Stinner942889a2016-09-05 15:40:10 -07003303 char buflower[11]; /* strlen("iso-8859-1\0") == 11, longest shortcut */
3304
3305 if (encoding == NULL) {
3306 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3307 }
Victor Stinner600d3be2010-06-10 12:00:55 +00003308
Fred Drakee4315f52000-05-09 19:53:39 +00003309 /* Shortcuts for common default encodings */
Victor Stinner942889a2016-09-05 15:40:10 -07003310 if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3311 char *lower = buflower;
3312
3313 /* Fast paths */
3314 if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3315 lower += 3;
3316 if (*lower == '_') {
3317 /* Match "utf8" and "utf_8" */
3318 lower++;
3319 }
3320
3321 if (lower[0] == '8' && lower[1] == 0) {
3322 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3323 }
3324 else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3325 return PyUnicode_DecodeUTF16(s, size, errors, 0);
3326 }
3327 else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3328 return PyUnicode_DecodeUTF32(s, size, errors, 0);
3329 }
3330 }
3331 else {
3332 if (strcmp(lower, "ascii") == 0
3333 || strcmp(lower, "us_ascii") == 0) {
3334 return PyUnicode_DecodeASCII(s, size, errors);
3335 }
Steve Dowercc16be82016-09-08 10:35:16 -07003336 #ifdef MS_WINDOWS
Victor Stinner942889a2016-09-05 15:40:10 -07003337 else if (strcmp(lower, "mbcs") == 0) {
3338 return PyUnicode_DecodeMBCS(s, size, errors);
3339 }
3340 #endif
3341 else if (strcmp(lower, "latin1") == 0
3342 || strcmp(lower, "latin_1") == 0
3343 || strcmp(lower, "iso_8859_1") == 0
3344 || strcmp(lower, "iso8859_1") == 0) {
3345 return PyUnicode_DecodeLatin1(s, size, errors);
3346 }
3347 }
Victor Stinner37296e82010-06-10 13:36:23 +00003348 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003349
3350 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003351 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00003352 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003353 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00003354 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003355 if (buffer == NULL)
3356 goto onError;
Nick Coghlanc72e4e62013-11-22 22:39:36 +10003357 unicode = _PyCodec_DecodeText(buffer, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003358 if (unicode == NULL)
3359 goto onError;
3360 if (!PyUnicode_Check(unicode)) {
3361 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003362 "'%.400s' decoder returned '%.400s' instead of 'str'; "
Nick Coghlan8b097b42013-11-13 23:49:21 +10003363 "use codecs.decode() to decode to arbitrary types",
Victor Stinner998b8062018-09-12 00:23:25 +02003364 encoding,
3365 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003366 Py_DECREF(unicode);
3367 goto onError;
3368 }
3369 Py_DECREF(buffer);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003370 return unicode_result(unicode);
Tim Petersced69f82003-09-16 20:30:58 +00003371
Benjamin Peterson29060642009-01-31 22:14:21 +00003372 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003373 Py_XDECREF(buffer);
3374 return NULL;
3375}
3376
Alexander Belopolsky40018472011-02-26 01:02:56 +00003377PyObject *
3378PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003379 const char *encoding,
3380 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003381{
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003382 if (!PyUnicode_Check(unicode)) {
3383 PyErr_BadArgument();
Serhiy Storchaka77eede32016-10-25 10:07:51 +03003384 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003385 }
3386
Serhiy Storchaka00939072016-10-27 21:05:49 +03003387 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3388 "PyUnicode_AsDecodedObject() is deprecated; "
3389 "use PyCodec_Decode() to decode from str", 1) < 0)
3390 return NULL;
3391
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003392 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003393 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003394
3395 /* Decode via the codec registry */
Serhiy Storchaka77eede32016-10-25 10:07:51 +03003396 return PyCodec_Decode(unicode, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003397}
3398
Alexander Belopolsky40018472011-02-26 01:02:56 +00003399PyObject *
3400PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003401 const char *encoding,
3402 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003403{
3404 PyObject *v;
3405
3406 if (!PyUnicode_Check(unicode)) {
3407 PyErr_BadArgument();
3408 goto onError;
3409 }
3410
Serhiy Storchaka00939072016-10-27 21:05:49 +03003411 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3412 "PyUnicode_AsDecodedUnicode() is deprecated; "
3413 "use PyCodec_Decode() to decode from str to str", 1) < 0)
3414 return NULL;
3415
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003416 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003417 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003418
3419 /* Decode via the codec registry */
3420 v = PyCodec_Decode(unicode, encoding, errors);
3421 if (v == NULL)
3422 goto onError;
3423 if (!PyUnicode_Check(v)) {
3424 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003425 "'%.400s' decoder returned '%.400s' instead of 'str'; "
Nick Coghlan8b097b42013-11-13 23:49:21 +10003426 "use codecs.decode() to decode to arbitrary types",
Victor Stinner998b8062018-09-12 00:23:25 +02003427 encoding,
3428 Py_TYPE(unicode)->tp_name);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003429 Py_DECREF(v);
3430 goto onError;
3431 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003432 return unicode_result(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003433
Benjamin Peterson29060642009-01-31 22:14:21 +00003434 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003435 return NULL;
3436}
3437
Alexander Belopolsky40018472011-02-26 01:02:56 +00003438PyObject *
3439PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003440 Py_ssize_t size,
3441 const char *encoding,
3442 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003443{
3444 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00003445
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02003446 unicode = PyUnicode_FromWideChar(s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003447 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003448 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003449 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
3450 Py_DECREF(unicode);
3451 return v;
3452}
3453
Alexander Belopolsky40018472011-02-26 01:02:56 +00003454PyObject *
3455PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003456 const char *encoding,
3457 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003458{
3459 PyObject *v;
3460
3461 if (!PyUnicode_Check(unicode)) {
3462 PyErr_BadArgument();
3463 goto onError;
3464 }
3465
Serhiy Storchaka00939072016-10-27 21:05:49 +03003466 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3467 "PyUnicode_AsEncodedObject() is deprecated; "
3468 "use PyUnicode_AsEncodedString() to encode from str to bytes "
3469 "or PyCodec_Encode() for generic encoding", 1) < 0)
3470 return NULL;
3471
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003472 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003473 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003474
3475 /* Encode via the codec registry */
3476 v = PyCodec_Encode(unicode, encoding, errors);
3477 if (v == NULL)
3478 goto onError;
3479 return v;
3480
Benjamin Peterson29060642009-01-31 22:14:21 +00003481 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003482 return NULL;
3483}
3484
Victor Stinner1b579672011-12-17 05:47:23 +01003485
Victor Stinner2cba6b82018-01-10 22:46:15 +01003486static PyObject *
Victor Stinner709d23d2019-05-02 14:56:30 -04003487unicode_encode_locale(PyObject *unicode, _Py_error_handler error_handler,
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003488 int current_locale)
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003489{
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003490 Py_ssize_t wlen;
3491 wchar_t *wstr = PyUnicode_AsWideCharString(unicode, &wlen);
3492 if (wstr == NULL) {
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003493 return NULL;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003494 }
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003495
Victor Stinnerbde9d6b2018-11-28 10:26:20 +01003496 if ((size_t)wlen != wcslen(wstr)) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003497 PyErr_SetString(PyExc_ValueError, "embedded null character");
Victor Stinnerbde9d6b2018-11-28 10:26:20 +01003498 PyMem_Free(wstr);
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003499 return NULL;
3500 }
3501
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003502 char *str;
3503 size_t error_pos;
3504 const char *reason;
3505 int res = _Py_EncodeLocaleEx(wstr, &str, &error_pos, &reason,
Victor Stinner3d4226a2018-08-29 22:21:32 +02003506 current_locale, error_handler);
Victor Stinnerbde9d6b2018-11-28 10:26:20 +01003507 PyMem_Free(wstr);
3508
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003509 if (res != 0) {
3510 if (res == -2) {
3511 PyObject *exc;
3512 exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnns",
3513 "locale", unicode,
3514 (Py_ssize_t)error_pos,
3515 (Py_ssize_t)(error_pos+1),
3516 reason);
3517 if (exc != NULL) {
3518 PyCodec_StrictErrors(exc);
3519 Py_DECREF(exc);
3520 }
Victor Stinner2cba6b82018-01-10 22:46:15 +01003521 }
Victor Stinner3d4226a2018-08-29 22:21:32 +02003522 else if (res == -3) {
3523 PyErr_SetString(PyExc_ValueError, "unsupported error handler");
3524 }
Victor Stinner2cba6b82018-01-10 22:46:15 +01003525 else {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003526 PyErr_NoMemory();
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003527 }
Victor Stinnerbde9d6b2018-11-28 10:26:20 +01003528 return NULL;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003529 }
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003530
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003531 PyObject *bytes = PyBytes_FromString(str);
3532 PyMem_RawFree(str);
3533 return bytes;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003534}
3535
Victor Stinnerad158722010-10-27 00:25:46 +00003536PyObject *
Victor Stinner2cba6b82018-01-10 22:46:15 +01003537PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
3538{
Victor Stinner709d23d2019-05-02 14:56:30 -04003539 _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
3540 return unicode_encode_locale(unicode, error_handler, 1);
Victor Stinner2cba6b82018-01-10 22:46:15 +01003541}
3542
3543PyObject *
Victor Stinnerad158722010-10-27 00:25:46 +00003544PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00003545{
Victor Stinnercaba55b2018-08-03 15:33:52 +02003546 PyInterpreterState *interp = _PyInterpreterState_GET_UNSAFE();
Victor Stinnere2510952019-05-02 11:28:57 -04003547#ifdef _Py_FORCE_UTF8_FS_ENCODING
Victor Stinner709d23d2019-05-02 14:56:30 -04003548 if (interp->fs_codec.encoding) {
3549 return unicode_encode_utf8(unicode,
3550 interp->fs_codec.error_handler,
3551 interp->fs_codec.errors);
3552 }
3553 else {
Victor Stinner331a6a52019-05-27 16:39:22 +02003554 const wchar_t *filesystem_errors = interp->config.filesystem_errors;
Victor Stinner709d23d2019-05-02 14:56:30 -04003555 _Py_error_handler errors;
Victor Stinner331a6a52019-05-27 16:39:22 +02003556 errors = get_error_handler_wide(filesystem_errors);
Victor Stinner709d23d2019-05-02 14:56:30 -04003557 assert(errors != _Py_ERROR_UNKNOWN);
3558 return unicode_encode_utf8(unicode, errors, NULL);
3559 }
Victor Stinnerb2457ef2018-08-29 13:25:36 +02003560#else
Victor Stinner793b5312011-04-27 00:24:21 +02003561 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3562 cannot use it to encode and decode filenames before it is loaded. Load
3563 the Python codec requires to encode at least its own filename. Use the C
Victor Stinnerb2457ef2018-08-29 13:25:36 +02003564 implementation of the locale codec until the codec registry is
3565 initialized and the Python codec is loaded. See initfsencoding(). */
Victor Stinner709d23d2019-05-02 14:56:30 -04003566 if (interp->fs_codec.encoding) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00003567 return PyUnicode_AsEncodedString(unicode,
Victor Stinner709d23d2019-05-02 14:56:30 -04003568 interp->fs_codec.encoding,
3569 interp->fs_codec.errors);
Victor Stinnerc39211f2010-09-29 16:35:47 +00003570 }
3571 else {
Victor Stinner331a6a52019-05-27 16:39:22 +02003572 const wchar_t *filesystem_errors = interp->config.filesystem_errors;
Victor Stinner709d23d2019-05-02 14:56:30 -04003573 _Py_error_handler errors;
Victor Stinner331a6a52019-05-27 16:39:22 +02003574 errors = get_error_handler_wide(filesystem_errors);
Victor Stinner709d23d2019-05-02 14:56:30 -04003575 assert(errors != _Py_ERROR_UNKNOWN);
3576 return unicode_encode_locale(unicode, errors, 0);
Victor Stinnerc39211f2010-09-29 16:35:47 +00003577 }
Victor Stinnerad158722010-10-27 00:25:46 +00003578#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00003579}
3580
Alexander Belopolsky40018472011-02-26 01:02:56 +00003581PyObject *
3582PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003583 const char *encoding,
3584 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003585{
3586 PyObject *v;
Victor Stinner942889a2016-09-05 15:40:10 -07003587 char buflower[11]; /* strlen("iso_8859_1\0") == 11, longest shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00003588
Guido van Rossumd57fd912000-03-10 22:53:23 +00003589 if (!PyUnicode_Check(unicode)) {
3590 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003591 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003592 }
Fred Drakee4315f52000-05-09 19:53:39 +00003593
Victor Stinner942889a2016-09-05 15:40:10 -07003594 if (encoding == NULL) {
3595 return _PyUnicode_AsUTF8String(unicode, errors);
3596 }
3597
Fred Drakee4315f52000-05-09 19:53:39 +00003598 /* Shortcuts for common default encodings */
Victor Stinner942889a2016-09-05 15:40:10 -07003599 if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3600 char *lower = buflower;
3601
3602 /* Fast paths */
3603 if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3604 lower += 3;
3605 if (*lower == '_') {
3606 /* Match "utf8" and "utf_8" */
3607 lower++;
3608 }
3609
3610 if (lower[0] == '8' && lower[1] == 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003611 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinner942889a2016-09-05 15:40:10 -07003612 }
3613 else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3614 return _PyUnicode_EncodeUTF16(unicode, errors, 0);
3615 }
3616 else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3617 return _PyUnicode_EncodeUTF32(unicode, errors, 0);
3618 }
Victor Stinnera5c68c32011-03-02 01:03:14 +00003619 }
Victor Stinner942889a2016-09-05 15:40:10 -07003620 else {
3621 if (strcmp(lower, "ascii") == 0
3622 || strcmp(lower, "us_ascii") == 0) {
3623 return _PyUnicode_AsASCIIString(unicode, errors);
3624 }
Steve Dowercc16be82016-09-08 10:35:16 -07003625#ifdef MS_WINDOWS
Victor Stinner942889a2016-09-05 15:40:10 -07003626 else if (strcmp(lower, "mbcs") == 0) {
3627 return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
3628 }
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003629#endif
Victor Stinner942889a2016-09-05 15:40:10 -07003630 else if (strcmp(lower, "latin1") == 0 ||
3631 strcmp(lower, "latin_1") == 0 ||
3632 strcmp(lower, "iso_8859_1") == 0 ||
3633 strcmp(lower, "iso8859_1") == 0) {
3634 return _PyUnicode_AsLatin1String(unicode, errors);
3635 }
3636 }
Victor Stinner37296e82010-06-10 13:36:23 +00003637 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003638
3639 /* Encode via the codec registry */
Nick Coghlanc72e4e62013-11-22 22:39:36 +10003640 v = _PyCodec_EncodeText(unicode, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003641 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003642 return NULL;
3643
3644 /* The normal path */
3645 if (PyBytes_Check(v))
3646 return v;
3647
3648 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003649 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003650 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003651 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003652
3653 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003654 "encoder %s returned bytearray instead of bytes; "
3655 "use codecs.encode() to encode to arbitrary types",
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003656 encoding);
3657 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003658 Py_DECREF(v);
3659 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003660 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003661
Serhiy Storchakafff9a312017-03-21 08:53:25 +02003662 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v),
3663 PyByteArray_GET_SIZE(v));
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003664 Py_DECREF(v);
3665 return b;
3666 }
3667
3668 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003669 "'%.400s' encoder returned '%.400s' instead of 'bytes'; "
Nick Coghlan8b097b42013-11-13 23:49:21 +10003670 "use codecs.encode() to encode to arbitrary types",
Victor Stinner998b8062018-09-12 00:23:25 +02003671 encoding,
3672 Py_TYPE(v)->tp_name);
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003673 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003674 return NULL;
3675}
3676
Alexander Belopolsky40018472011-02-26 01:02:56 +00003677PyObject *
3678PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003679 const char *encoding,
3680 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003681{
3682 PyObject *v;
3683
3684 if (!PyUnicode_Check(unicode)) {
3685 PyErr_BadArgument();
3686 goto onError;
3687 }
3688
Serhiy Storchaka00939072016-10-27 21:05:49 +03003689 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3690 "PyUnicode_AsEncodedUnicode() is deprecated; "
3691 "use PyCodec_Encode() to encode from str to str", 1) < 0)
3692 return NULL;
3693
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003694 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003695 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003696
3697 /* Encode via the codec registry */
3698 v = PyCodec_Encode(unicode, encoding, errors);
3699 if (v == NULL)
3700 goto onError;
3701 if (!PyUnicode_Check(v)) {
3702 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003703 "'%.400s' encoder returned '%.400s' instead of 'str'; "
Nick Coghlan8b097b42013-11-13 23:49:21 +10003704 "use codecs.encode() to encode to arbitrary types",
Victor Stinner998b8062018-09-12 00:23:25 +02003705 encoding,
3706 Py_TYPE(v)->tp_name);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003707 Py_DECREF(v);
3708 goto onError;
3709 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003710 return v;
Tim Petersced69f82003-09-16 20:30:58 +00003711
Benjamin Peterson29060642009-01-31 22:14:21 +00003712 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003713 return NULL;
3714}
3715
Victor Stinner2cba6b82018-01-10 22:46:15 +01003716static PyObject*
Victor Stinner709d23d2019-05-02 14:56:30 -04003717unicode_decode_locale(const char *str, Py_ssize_t len,
3718 _Py_error_handler errors, int current_locale)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003719{
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003720 if (str[len] != '\0' || (size_t)len != strlen(str)) {
3721 PyErr_SetString(PyExc_ValueError, "embedded null byte");
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003722 return NULL;
3723 }
3724
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003725 wchar_t *wstr;
3726 size_t wlen;
3727 const char *reason;
3728 int res = _Py_DecodeLocaleEx(str, &wstr, &wlen, &reason,
Victor Stinner709d23d2019-05-02 14:56:30 -04003729 current_locale, errors);
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003730 if (res != 0) {
3731 if (res == -2) {
3732 PyObject *exc;
3733 exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nns",
3734 "locale", str, len,
3735 (Py_ssize_t)wlen,
3736 (Py_ssize_t)(wlen + 1),
3737 reason);
3738 if (exc != NULL) {
3739 PyCodec_StrictErrors(exc);
3740 Py_DECREF(exc);
3741 }
Victor Stinner2cba6b82018-01-10 22:46:15 +01003742 }
Victor Stinner3d4226a2018-08-29 22:21:32 +02003743 else if (res == -3) {
3744 PyErr_SetString(PyExc_ValueError, "unsupported error handler");
3745 }
Victor Stinner2cba6b82018-01-10 22:46:15 +01003746 else {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003747 PyErr_NoMemory();
Victor Stinner2cba6b82018-01-10 22:46:15 +01003748 }
Victor Stinner2f197072011-12-17 07:08:30 +01003749 return NULL;
Victor Stinner2f197072011-12-17 07:08:30 +01003750 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003751
3752 PyObject *unicode = PyUnicode_FromWideChar(wstr, wlen);
3753 PyMem_RawFree(wstr);
3754 return unicode;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003755}
3756
3757PyObject*
Victor Stinner2cba6b82018-01-10 22:46:15 +01003758PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
3759 const char *errors)
3760{
Victor Stinner709d23d2019-05-02 14:56:30 -04003761 _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
3762 return unicode_decode_locale(str, len, error_handler, 1);
Victor Stinner2cba6b82018-01-10 22:46:15 +01003763}
3764
3765PyObject*
Victor Stinner1b579672011-12-17 05:47:23 +01003766PyUnicode_DecodeLocale(const char *str, const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003767{
3768 Py_ssize_t size = (Py_ssize_t)strlen(str);
Victor Stinner709d23d2019-05-02 14:56:30 -04003769 _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
3770 return unicode_decode_locale(str, size, error_handler, 1);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003771}
3772
3773
3774PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00003775PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003776 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00003777 return PyUnicode_DecodeFSDefaultAndSize(s, size);
3778}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003779
Christian Heimes5894ba72007-11-04 11:43:14 +00003780PyObject*
3781PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3782{
Victor Stinnercaba55b2018-08-03 15:33:52 +02003783 PyInterpreterState *interp = _PyInterpreterState_GET_UNSAFE();
Victor Stinnere2510952019-05-02 11:28:57 -04003784#ifdef _Py_FORCE_UTF8_FS_ENCODING
Victor Stinner709d23d2019-05-02 14:56:30 -04003785 if (interp->fs_codec.encoding) {
3786 return unicode_decode_utf8(s, size,
3787 interp->fs_codec.error_handler,
3788 interp->fs_codec.errors,
3789 NULL);
3790 }
3791 else {
Victor Stinner331a6a52019-05-27 16:39:22 +02003792 const wchar_t *filesystem_errors = interp->config.filesystem_errors;
Victor Stinner709d23d2019-05-02 14:56:30 -04003793 _Py_error_handler errors;
Victor Stinner331a6a52019-05-27 16:39:22 +02003794 errors = get_error_handler_wide(filesystem_errors);
Victor Stinner709d23d2019-05-02 14:56:30 -04003795 assert(errors != _Py_ERROR_UNKNOWN);
3796 return unicode_decode_utf8(s, size, errors, NULL, NULL);
3797 }
Victor Stinnerb2457ef2018-08-29 13:25:36 +02003798#else
Victor Stinner793b5312011-04-27 00:24:21 +02003799 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3800 cannot use it to encode and decode filenames before it is loaded. Load
3801 the Python codec requires to encode at least its own filename. Use the C
Victor Stinnerb2457ef2018-08-29 13:25:36 +02003802 implementation of the locale codec until the codec registry is
3803 initialized and the Python codec is loaded. See initfsencoding(). */
Victor Stinner709d23d2019-05-02 14:56:30 -04003804 if (interp->fs_codec.encoding) {
Steve Dower78057b42016-11-06 19:35:08 -08003805 return PyUnicode_Decode(s, size,
Victor Stinner709d23d2019-05-02 14:56:30 -04003806 interp->fs_codec.encoding,
3807 interp->fs_codec.errors);
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003808 }
3809 else {
Victor Stinner331a6a52019-05-27 16:39:22 +02003810 const wchar_t *filesystem_errors = interp->config.filesystem_errors;
Victor Stinner709d23d2019-05-02 14:56:30 -04003811 _Py_error_handler errors;
Victor Stinner331a6a52019-05-27 16:39:22 +02003812 errors = get_error_handler_wide(filesystem_errors);
Victor Stinner709d23d2019-05-02 14:56:30 -04003813 return unicode_decode_locale(s, size, errors, 0);
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003814 }
Victor Stinnerad158722010-10-27 00:25:46 +00003815#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003816}
3817
Martin v. Löwis011e8422009-05-05 04:43:17 +00003818
3819int
3820PyUnicode_FSConverter(PyObject* arg, void* addr)
3821{
Brett Cannonec6ce872016-09-06 15:50:29 -07003822 PyObject *path = NULL;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003823 PyObject *output = NULL;
3824 Py_ssize_t size;
3825 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003826 if (arg == NULL) {
3827 Py_DECREF(*(PyObject**)addr);
Benjamin Petersona4d33b32015-11-15 21:57:39 -08003828 *(PyObject**)addr = NULL;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003829 return 1;
3830 }
Brett Cannonec6ce872016-09-06 15:50:29 -07003831 path = PyOS_FSPath(arg);
3832 if (path == NULL) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03003833 return 0;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003834 }
Brett Cannonec6ce872016-09-06 15:50:29 -07003835 if (PyBytes_Check(path)) {
3836 output = path;
3837 }
3838 else { // PyOS_FSPath() guarantees its returned value is bytes or str.
3839 output = PyUnicode_EncodeFSDefault(path);
3840 Py_DECREF(path);
3841 if (!output) {
3842 return 0;
3843 }
3844 assert(PyBytes_Check(output));
3845 }
3846
Victor Stinner0ea2a462010-04-30 00:22:08 +00003847 size = PyBytes_GET_SIZE(output);
3848 data = PyBytes_AS_STRING(output);
Victor Stinner12174a52014-08-15 23:17:38 +02003849 if ((size_t)size != strlen(data)) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003850 PyErr_SetString(PyExc_ValueError, "embedded null byte");
Martin v. Löwis011e8422009-05-05 04:43:17 +00003851 Py_DECREF(output);
3852 return 0;
3853 }
3854 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003855 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003856}
3857
3858
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003859int
3860PyUnicode_FSDecoder(PyObject* arg, void* addr)
3861{
Brett Cannona5711202016-09-06 19:36:01 -07003862 int is_buffer = 0;
3863 PyObject *path = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003864 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003865 if (arg == NULL) {
3866 Py_DECREF(*(PyObject**)addr);
Serhiy Storchaka40db90c2017-04-20 21:19:31 +03003867 *(PyObject**)addr = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003868 return 1;
3869 }
Brett Cannona5711202016-09-06 19:36:01 -07003870
3871 is_buffer = PyObject_CheckBuffer(arg);
3872 if (!is_buffer) {
3873 path = PyOS_FSPath(arg);
3874 if (path == NULL) {
Serhiy Storchakafebc3322016-08-06 23:29:29 +03003875 return 0;
3876 }
Brett Cannona5711202016-09-06 19:36:01 -07003877 }
3878 else {
3879 path = arg;
3880 Py_INCREF(arg);
3881 }
3882
3883 if (PyUnicode_Check(path)) {
Brett Cannona5711202016-09-06 19:36:01 -07003884 output = path;
3885 }
3886 else if (PyBytes_Check(path) || is_buffer) {
3887 PyObject *path_bytes = NULL;
3888
3889 if (!PyBytes_Check(path) &&
3890 PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
Victor Stinner998b8062018-09-12 00:23:25 +02003891 "path should be string, bytes, or os.PathLike, not %.200s",
3892 Py_TYPE(arg)->tp_name)) {
3893 Py_DECREF(path);
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003894 return 0;
Brett Cannona5711202016-09-06 19:36:01 -07003895 }
3896 path_bytes = PyBytes_FromObject(path);
3897 Py_DECREF(path);
3898 if (!path_bytes) {
3899 return 0;
3900 }
3901 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(path_bytes),
3902 PyBytes_GET_SIZE(path_bytes));
3903 Py_DECREF(path_bytes);
3904 if (!output) {
3905 return 0;
3906 }
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003907 }
Serhiy Storchaka9305d832016-06-18 13:53:36 +03003908 else {
3909 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003910 "path should be string, bytes, or os.PathLike, not %.200s",
3911 Py_TYPE(arg)->tp_name);
Brett Cannona5711202016-09-06 19:36:01 -07003912 Py_DECREF(path);
Serhiy Storchaka9305d832016-06-18 13:53:36 +03003913 return 0;
3914 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05003915 if (PyUnicode_READY(output) == -1) {
Victor Stinner065836e2011-10-27 01:56:33 +02003916 Py_DECREF(output);
3917 return 0;
3918 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003919 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02003920 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003921 PyErr_SetString(PyExc_ValueError, "embedded null character");
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003922 Py_DECREF(output);
3923 return 0;
3924 }
3925 *(PyObject**)addr = output;
3926 return Py_CLEANUP_SUPPORTED;
3927}
3928
3929
Serhiy Storchaka2a404b62017-01-22 23:07:07 +02003930const char *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003931PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003932{
Christian Heimesf3863112007-11-22 07:46:41 +00003933 PyObject *bytes;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003934
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00003935 if (!PyUnicode_Check(unicode)) {
3936 PyErr_BadArgument();
3937 return NULL;
3938 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003939 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003940 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003941
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003942 if (PyUnicode_UTF8(unicode) == NULL) {
3943 assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03003944 bytes = _PyUnicode_AsUTF8String(unicode, NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003945 if (bytes == NULL)
3946 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003947 _PyUnicode_UTF8(unicode) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
3948 if (_PyUnicode_UTF8(unicode) == NULL) {
Victor Stinnera5afb582013-10-29 01:28:23 +01003949 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003950 Py_DECREF(bytes);
3951 return NULL;
3952 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003953 _PyUnicode_UTF8_LENGTH(unicode) = PyBytes_GET_SIZE(bytes);
Christian Heimesf051e432016-09-13 20:22:02 +02003954 memcpy(_PyUnicode_UTF8(unicode),
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003955 PyBytes_AS_STRING(bytes),
3956 _PyUnicode_UTF8_LENGTH(unicode) + 1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003957 Py_DECREF(bytes);
3958 }
3959
3960 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003961 *psize = PyUnicode_UTF8_LENGTH(unicode);
3962 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003963}
3964
Serhiy Storchaka2a404b62017-01-22 23:07:07 +02003965const char *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003966PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003967{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003968 return PyUnicode_AsUTF8AndSize(unicode, NULL);
3969}
3970
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003971Py_UNICODE *
3972PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
3973{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003974 if (!PyUnicode_Check(unicode)) {
3975 PyErr_BadArgument();
3976 return NULL;
3977 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003978 Py_UNICODE *w = _PyUnicode_WSTR(unicode);
3979 if (w == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003980 /* Non-ASCII compact unicode object */
Serhiy Storchakac46db922018-10-23 22:58:24 +03003981 assert(_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND);
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003982 assert(PyUnicode_IS_READY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003983
Serhiy Storchakac46db922018-10-23 22:58:24 +03003984 Py_ssize_t wlen = unicode_get_widechar_size(unicode);
3985 if ((size_t)wlen > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
3986 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003987 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003988 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003989 w = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) * (wlen + 1));
3990 if (w == NULL) {
3991 PyErr_NoMemory();
3992 return NULL;
3993 }
3994 unicode_copy_as_widechar(unicode, w, wlen + 1);
3995 _PyUnicode_WSTR(unicode) = w;
3996 if (!PyUnicode_IS_COMPACT_ASCII(unicode)) {
3997 _PyUnicode_WSTR_LENGTH(unicode) = wlen;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003998 }
3999 }
4000 if (size != NULL)
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004001 *size = PyUnicode_WSTR_LENGTH(unicode);
Serhiy Storchakac46db922018-10-23 22:58:24 +03004002 return w;
Martin v. Löwis5b222132007-06-10 09:51:05 +00004003}
4004
Alexander Belopolsky40018472011-02-26 01:02:56 +00004005Py_UNICODE *
4006PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004007{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004008 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004009}
4010
Serhiy Storchakaf7eae0a2017-06-28 08:30:06 +03004011const Py_UNICODE *
4012_PyUnicode_AsUnicode(PyObject *unicode)
4013{
4014 Py_ssize_t size;
4015 const Py_UNICODE *wstr;
4016
4017 wstr = PyUnicode_AsUnicodeAndSize(unicode, &size);
4018 if (wstr && wcslen(wstr) != (size_t)size) {
4019 PyErr_SetString(PyExc_ValueError, "embedded null character");
4020 return NULL;
4021 }
4022 return wstr;
4023}
4024
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004025
Alexander Belopolsky40018472011-02-26 01:02:56 +00004026Py_ssize_t
4027PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004028{
4029 if (!PyUnicode_Check(unicode)) {
4030 PyErr_BadArgument();
4031 goto onError;
4032 }
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02004033 if (_PyUnicode_WSTR(unicode) == NULL) {
4034 if (PyUnicode_AsUnicode(unicode) == NULL)
4035 goto onError;
4036 }
4037 return PyUnicode_WSTR_LENGTH(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004038
Benjamin Peterson29060642009-01-31 22:14:21 +00004039 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004040 return -1;
4041}
4042
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004043Py_ssize_t
4044PyUnicode_GetLength(PyObject *unicode)
4045{
Victor Stinner07621332012-06-16 04:53:46 +02004046 if (!PyUnicode_Check(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004047 PyErr_BadArgument();
4048 return -1;
4049 }
Victor Stinner07621332012-06-16 04:53:46 +02004050 if (PyUnicode_READY(unicode) == -1)
4051 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004052 return PyUnicode_GET_LENGTH(unicode);
4053}
4054
4055Py_UCS4
4056PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
4057{
Victor Stinner69ed0f42013-04-09 21:48:24 +02004058 void *data;
4059 int kind;
4060
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +03004061 if (!PyUnicode_Check(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02004062 PyErr_BadArgument();
4063 return (Py_UCS4)-1;
4064 }
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +03004065 if (PyUnicode_READY(unicode) == -1) {
4066 return (Py_UCS4)-1;
4067 }
Victor Stinnerc4b49542011-12-11 22:44:26 +01004068 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02004069 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004070 return (Py_UCS4)-1;
4071 }
Victor Stinner69ed0f42013-04-09 21:48:24 +02004072 data = PyUnicode_DATA(unicode);
4073 kind = PyUnicode_KIND(unicode);
4074 return PyUnicode_READ(kind, data, index);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004075}
4076
4077int
4078PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
4079{
4080 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02004081 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004082 return -1;
4083 }
Victor Stinner488fa492011-12-12 00:01:39 +01004084 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerc4b49542011-12-11 22:44:26 +01004085 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02004086 PyErr_SetString(PyExc_IndexError, "string index out of range");
4087 return -1;
4088 }
Victor Stinner488fa492011-12-12 00:01:39 +01004089 if (unicode_check_modifiable(unicode))
Victor Stinnercd9950f2011-10-02 00:34:53 +02004090 return -1;
Victor Stinnerc9590ad2012-03-04 01:34:37 +01004091 if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
4092 PyErr_SetString(PyExc_ValueError, "character out of range");
4093 return -1;
4094 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004095 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
4096 index, ch);
4097 return 0;
4098}
4099
/* Return the name of the default encoding, which is always "utf-8".
   The returned string has static storage and must not be modified. */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";
    return default_encoding;
}
4105
Victor Stinner554f3f02010-06-16 23:33:54 +00004106/* create or adjust a UnicodeDecodeError */
4107static void
4108make_decode_exception(PyObject **exceptionObject,
4109 const char *encoding,
4110 const char *input, Py_ssize_t length,
4111 Py_ssize_t startpos, Py_ssize_t endpos,
4112 const char *reason)
4113{
4114 if (*exceptionObject == NULL) {
4115 *exceptionObject = PyUnicodeDecodeError_Create(
4116 encoding, input, length, startpos, endpos, reason);
4117 }
4118 else {
4119 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
4120 goto onError;
4121 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
4122 goto onError;
4123 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
4124 goto onError;
4125 }
4126 return;
4127
4128onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02004129 Py_CLEAR(*exceptionObject);
Victor Stinner554f3f02010-06-16 23:33:54 +00004130}
4131
Steve Dowercc16be82016-09-08 10:35:16 -07004132#ifdef MS_WINDOWS
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004133static int
4134widechar_resize(wchar_t **buf, Py_ssize_t *size, Py_ssize_t newsize)
4135{
4136 if (newsize > *size) {
4137 wchar_t *newbuf = *buf;
4138 if (PyMem_Resize(newbuf, wchar_t, newsize) == NULL) {
4139 PyErr_NoMemory();
4140 return -1;
4141 }
4142 *buf = newbuf;
4143 }
4144 *size = newsize;
4145 return 0;
4146}
4147
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004148/* error handling callback helper:
4149 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00004150 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004151 and adjust various state variables.
4152 return 0 on success, -1 on error
4153*/
4154
Alexander Belopolsky40018472011-02-26 01:02:56 +00004155static int
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004156unicode_decode_call_errorhandler_wchar(
4157 const char *errors, PyObject **errorHandler,
4158 const char *encoding, const char *reason,
4159 const char **input, const char **inend, Py_ssize_t *startinpos,
4160 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004161 wchar_t **buf, Py_ssize_t *bufsize, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004162{
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004163 static const char *argparse = "Un;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004164
4165 PyObject *restuple = NULL;
4166 PyObject *repunicode = NULL;
Victor Stinner596a6c42011-11-09 00:02:18 +01004167 Py_ssize_t outsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004168 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004169 Py_ssize_t requiredsize;
4170 Py_ssize_t newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004171 PyObject *inputobj = NULL;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004172 wchar_t *repwstr;
4173 Py_ssize_t repwlen;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004174
4175 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004176 *errorHandler = PyCodec_LookupError(errors);
4177 if (*errorHandler == NULL)
4178 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004179 }
4180
Victor Stinner554f3f02010-06-16 23:33:54 +00004181 make_decode_exception(exceptionObject,
4182 encoding,
4183 *input, *inend - *input,
4184 *startinpos, *endinpos,
4185 reason);
4186 if (*exceptionObject == NULL)
4187 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004188
Victor Stinnerde4ae3d2016-12-04 22:59:09 +01004189 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004190 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004191 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004192 if (!PyTuple_Check(restuple)) {
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004193 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00004194 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004195 }
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004196 if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00004197 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004198
4199 /* Copy back the bytes variables, which might have been modified by the
4200 callback */
4201 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4202 if (!inputobj)
4203 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004204 *input = PyBytes_AS_STRING(inputobj);
4205 insize = PyBytes_GET_SIZE(inputobj);
4206 *inend = *input + insize;
4207 /* we can DECREF safely, as the exception has another reference,
4208 so the object won't go away. */
4209 Py_DECREF(inputobj);
4210
4211 if (newpos<0)
4212 newpos = insize+newpos;
4213 if (newpos<0 || newpos>insize) {
Victor Stinnera33bce02014-07-04 22:47:46 +02004214 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004215 goto onError;
4216 }
4217
4218 repwstr = PyUnicode_AsUnicodeAndSize(repunicode, &repwlen);
4219 if (repwstr == NULL)
4220 goto onError;
4221 /* need more space? (at least enough for what we
4222 have+the replacement+the rest of the string (starting
4223 at the new input position), so we won't have to check space
4224 when there are no errors in the rest of the string) */
Benjamin Peterson2b76ce62014-09-29 18:50:06 -04004225 requiredsize = *outpos;
4226 if (requiredsize > PY_SSIZE_T_MAX - repwlen)
4227 goto overflow;
4228 requiredsize += repwlen;
4229 if (requiredsize > PY_SSIZE_T_MAX - (insize - newpos))
4230 goto overflow;
4231 requiredsize += insize - newpos;
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004232 outsize = *bufsize;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004233 if (requiredsize > outsize) {
Benjamin Peterson2b76ce62014-09-29 18:50:06 -04004234 if (outsize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*outsize)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004235 requiredsize = 2*outsize;
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004236 if (widechar_resize(buf, bufsize, requiredsize) < 0) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004237 goto onError;
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004238 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004239 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004240 wcsncpy(*buf + *outpos, repwstr, repwlen);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004241 *outpos += repwlen;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004242 *endinpos = newpos;
4243 *inptr = *input + newpos;
4244
4245 /* we made it! */
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004246 Py_DECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004247 return 0;
4248
Benjamin Peterson2b76ce62014-09-29 18:50:06 -04004249 overflow:
4250 PyErr_SetString(PyExc_OverflowError,
4251 "decoded result is too long for a Python string");
4252
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004253 onError:
4254 Py_XDECREF(restuple);
4255 return -1;
4256}
Steve Dowercc16be82016-09-08 10:35:16 -07004257#endif /* MS_WINDOWS */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004258
4259static int
4260unicode_decode_call_errorhandler_writer(
4261 const char *errors, PyObject **errorHandler,
4262 const char *encoding, const char *reason,
4263 const char **input, const char **inend, Py_ssize_t *startinpos,
4264 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
4265 _PyUnicodeWriter *writer /* PyObject **output, Py_ssize_t *outpos */)
4266{
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004267 static const char *argparse = "Un;decoding error handler must return (str, int) tuple";
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004268
4269 PyObject *restuple = NULL;
4270 PyObject *repunicode = NULL;
4271 Py_ssize_t insize;
4272 Py_ssize_t newpos;
Victor Stinner170ca6f2013-04-18 00:25:28 +02004273 Py_ssize_t replen;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004274 Py_ssize_t remain;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004275 PyObject *inputobj = NULL;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004276 int need_to_grow = 0;
4277 const char *new_inptr;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004278
4279 if (*errorHandler == NULL) {
4280 *errorHandler = PyCodec_LookupError(errors);
4281 if (*errorHandler == NULL)
4282 goto onError;
4283 }
4284
4285 make_decode_exception(exceptionObject,
4286 encoding,
4287 *input, *inend - *input,
4288 *startinpos, *endinpos,
4289 reason);
4290 if (*exceptionObject == NULL)
4291 goto onError;
4292
Victor Stinnerde4ae3d2016-12-04 22:59:09 +01004293 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004294 if (restuple == NULL)
4295 goto onError;
4296 if (!PyTuple_Check(restuple)) {
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004297 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004298 goto onError;
4299 }
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004300 if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004301 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004302
4303 /* Copy back the bytes variables, which might have been modified by the
4304 callback */
4305 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4306 if (!inputobj)
4307 goto onError;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004308 remain = *inend - *input - *endinpos;
Christian Heimes72b710a2008-05-26 13:28:38 +00004309 *input = PyBytes_AS_STRING(inputobj);
4310 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004311 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00004312 /* we can DECREF safely, as the exception has another reference,
4313 so the object won't go away. */
4314 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004315
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004316 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004317 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004318 if (newpos<0 || newpos>insize) {
Victor Stinnera33bce02014-07-04 22:47:46 +02004319 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00004320 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004321 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004322
Victor Stinner170ca6f2013-04-18 00:25:28 +02004323 replen = PyUnicode_GET_LENGTH(repunicode);
Serhiy Storchaka7e4b9052015-01-26 01:22:54 +02004324 if (replen > 1) {
4325 writer->min_length += replen - 1;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004326 need_to_grow = 1;
4327 }
4328 new_inptr = *input + newpos;
4329 if (*inend - new_inptr > remain) {
4330 /* We don't know the decoding algorithm here so we make the worst
4331 assumption that one byte decodes to one unicode character.
4332 If unfortunately one byte could decode to more unicode characters,
4333 the decoder may write out-of-bound then. Is it possible for the
4334 algorithms using this function? */
4335 writer->min_length += *inend - new_inptr - remain;
4336 need_to_grow = 1;
4337 }
4338 if (need_to_grow) {
Victor Stinner8f674cc2013-04-17 23:02:17 +02004339 writer->overallocate = 1;
Serhiy Storchakab7e2d672018-02-13 08:27:33 +02004340 if (_PyUnicodeWriter_Prepare(writer, writer->min_length - writer->pos,
Serhiy Storchaka7e4b9052015-01-26 01:22:54 +02004341 PyUnicode_MAX_CHAR_VALUE(repunicode)) == -1)
4342 goto onError;
4343 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004344 if (_PyUnicodeWriter_WriteStr(writer, repunicode) == -1)
Victor Stinner376cfa12013-04-17 23:58:16 +02004345 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004346
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004347 *endinpos = newpos;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004348 *inptr = new_inptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004349
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004350 /* we made it! */
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004351 Py_DECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004352 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004353
Benjamin Peterson29060642009-01-31 22:14:21 +00004354 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004355 Py_XDECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004356 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004357}
4358
/* --- UTF-7 Codec -------------------------------------------------------- */

/* See RFC2152 for details.  We encode conservatively and decode liberally. */

/* Three simple macros defining base-64.  Note that they evaluate their
   argument more than once, so it must not have side effects. */

/* Is c a base-64 character? */

#define IS_BASE64(c) \
    (((c) >= 'A' && (c) <= 'Z') ||     \
     ((c) >= 'a' && (c) <= 'z') ||     \
     ((c) >= '0' && (c) <= '9') ||     \
     (c) == '+' || (c) == '/')

/* given that c is a base-64 character, what is its base-64 value? */

#define FROM_BASE64(c)                                                  \
    (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :                           \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :                      \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :                      \
     (c) == '+' ? 62 : 63)

/* What is the base-64 character of the bottom 6 bits of n? */

#define TO_BASE64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
 * decoded as itself.  We are permissive on decoding; the only ASCII
 * byte not decoding to itself is the + which begins a base64
 * string. */

#define DECODE_DIRECT(c)                                \
    ((c) <= 127 && (c) != '+')

/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is read-only, hence const.
 */

static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.  The range check
 * comes first so that utf7_category is only indexed with 1..127. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      (directWS && (utf7_category[(c)] == 2)) ||        \
      (directO && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004439
Alexander Belopolsky40018472011-02-26 01:02:56 +00004440PyObject *
4441PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004442 Py_ssize_t size,
4443 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004444{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004445 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
4446}
4447
Antoine Pitrou244651a2009-05-04 18:56:13 +00004448/* The decoder. The only state we preserve is our read position,
4449 * i.e. how many characters we have consumed. So if we end in the
4450 * middle of a shift sequence we have to back off the read position
4451 * and the output to the beginning of the sequence, otherwise we lose
4452 * all the shift state (seen bits, number of bits seen, high
4453 * surrogate). */
4454
Alexander Belopolsky40018472011-02-26 01:02:56 +00004455PyObject *
4456PyUnicode_DecodeUTF7Stateful(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004457 Py_ssize_t size,
4458 const char *errors,
4459 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004460{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004461 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004462 Py_ssize_t startinpos;
4463 Py_ssize_t endinpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004464 const char *e;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004465 _PyUnicodeWriter writer;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004466 const char *errmsg = "";
4467 int inShift = 0;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004468 Py_ssize_t shiftOutStart;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004469 unsigned int base64bits = 0;
4470 unsigned long base64buffer = 0;
Victor Stinner24729f32011-11-10 20:31:37 +01004471 Py_UCS4 surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004472 PyObject *errorHandler = NULL;
4473 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004474
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004475 if (size == 0) {
4476 if (consumed)
4477 *consumed = 0;
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02004478 _Py_RETURN_UNICODE_EMPTY();
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004479 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004480
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004481 /* Start off assuming it's all ASCII. Widen later as necessary. */
Victor Stinner8f674cc2013-04-17 23:02:17 +02004482 _PyUnicodeWriter_Init(&writer);
4483 writer.min_length = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004484
4485 shiftOutStart = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004486 e = s + size;
4487
4488 while (s < e) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004489 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00004490 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00004491 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004492
Antoine Pitrou244651a2009-05-04 18:56:13 +00004493 if (inShift) { /* in a base-64 section */
4494 if (IS_BASE64(ch)) { /* consume a base-64 character */
4495 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
4496 base64bits += 6;
4497 s++;
4498 if (base64bits >= 16) {
4499 /* we have enough bits for a UTF-16 value */
Victor Stinner24729f32011-11-10 20:31:37 +01004500 Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
Antoine Pitrou244651a2009-05-04 18:56:13 +00004501 base64bits -= 16;
4502 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
Serhiy Storchaka35804e42013-10-19 20:38:19 +03004503 assert(outCh <= 0xffff);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004504 if (surrogate) {
4505 /* expecting a second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004506 if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
4507 Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004508 if (_PyUnicodeWriter_WriteCharInline(&writer, ch2) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004509 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004510 surrogate = 0;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01004511 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004512 }
4513 else {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004514 if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
Antoine Pitrou78edf752011-11-15 01:44:16 +01004515 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004516 surrogate = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004517 }
4518 }
Victor Stinner551ac952011-11-29 22:58:13 +01004519 if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004520 /* first surrogate */
4521 surrogate = outCh;
4522 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004523 else {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004524 if (_PyUnicodeWriter_WriteCharInline(&writer, outCh) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004525 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004526 }
4527 }
4528 }
4529 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004530 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004531 if (base64bits > 0) { /* left-over bits */
4532 if (base64bits >= 6) {
4533 /* We've seen at least one base-64 character */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004534 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004535 errmsg = "partial character in shift sequence";
4536 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004537 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004538 else {
4539 /* Some bits remain; they should be zero */
4540 if (base64buffer != 0) {
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004541 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004542 errmsg = "non-zero padding bits in shift sequence";
4543 goto utf7Error;
4544 }
4545 }
4546 }
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004547 if (surrogate && DECODE_DIRECT(ch)) {
4548 if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
4549 goto onError;
4550 }
4551 surrogate = 0;
4552 if (ch == '-') {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004553 /* '-' is absorbed; other terminating
4554 characters are preserved */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004555 s++;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004556 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004557 }
4558 }
4559 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004560 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004561 s++; /* consume '+' */
4562 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004563 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004564 if (_PyUnicodeWriter_WriteCharInline(&writer, '+') < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004565 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004566 }
Zackery Spytze349bf22018-08-18 22:43:38 -06004567 else if (s < e && !IS_BASE64(*s)) {
4568 s++;
4569 errmsg = "ill-formed sequence";
4570 goto utf7Error;
4571 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004572 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004573 inShift = 1;
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004574 surrogate = 0;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004575 shiftOutStart = writer.pos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004576 base64bits = 0;
Serhiy Storchaka35804e42013-10-19 20:38:19 +03004577 base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004578 }
4579 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004580 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004581 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004582 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004583 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004584 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004585 else {
4586 startinpos = s-starts;
4587 s++;
4588 errmsg = "unexpected special character";
4589 goto utf7Error;
4590 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004591 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004592utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004593 endinpos = s-starts;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004594 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00004595 errors, &errorHandler,
4596 "utf7", errmsg,
4597 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004598 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00004599 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004600 }
4601
Antoine Pitrou244651a2009-05-04 18:56:13 +00004602 /* end of string */
4603
4604 if (inShift && !consumed) { /* in shift sequence, no more to follow */
4605 /* if we're in an inconsistent state, that's an error */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004606 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004607 if (surrogate ||
4608 (base64bits >= 6) ||
4609 (base64bits > 0 && base64buffer != 0)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004610 endinpos = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004611 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrou244651a2009-05-04 18:56:13 +00004612 errors, &errorHandler,
4613 "utf7", "unterminated shift sequence",
4614 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004615 &writer))
Antoine Pitrou244651a2009-05-04 18:56:13 +00004616 goto onError;
4617 if (s < e)
4618 goto restart;
4619 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004620 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004621
4622 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004623 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004624 if (inShift) {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004625 *consumed = startinpos;
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004626 if (writer.pos != shiftOutStart && writer.maxchar > 127) {
Serhiy Storchaka016a3f32014-02-08 14:01:29 +02004627 PyObject *result = PyUnicode_FromKindAndData(
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004628 writer.kind, writer.data, shiftOutStart);
4629 Py_XDECREF(errorHandler);
4630 Py_XDECREF(exc);
4631 _PyUnicodeWriter_Dealloc(&writer);
4632 return result;
Serhiy Storchaka016a3f32014-02-08 14:01:29 +02004633 }
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004634 writer.pos = shiftOutStart; /* back off output */
Antoine Pitrou244651a2009-05-04 18:56:13 +00004635 }
4636 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004637 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004638 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004639 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004640
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004641 Py_XDECREF(errorHandler);
4642 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004643 return _PyUnicodeWriter_Finish(&writer);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004644
Benjamin Peterson29060642009-01-31 22:14:21 +00004645 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004646 Py_XDECREF(errorHandler);
4647 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004648 _PyUnicodeWriter_Dealloc(&writer);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004649 return NULL;
4650}
4651
4652
Alexander Belopolsky40018472011-02-26 01:02:56 +00004653PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004654_PyUnicode_EncodeUTF7(PyObject *str,
4655 int base64SetO,
4656 int base64WhiteSpace,
4657 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004658{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004659 int kind;
4660 void *data;
4661 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004662 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004663 int inShift = 0;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004664 Py_ssize_t i;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004665 unsigned int base64bits = 0;
4666 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004667 char * out;
4668 char * start;
4669
Benjamin Petersonbac79492012-01-14 13:34:47 -05004670 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004671 return NULL;
4672 kind = PyUnicode_KIND(str);
4673 data = PyUnicode_DATA(str);
4674 len = PyUnicode_GET_LENGTH(str);
4675
4676 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004677 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004678
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004679 /* It might be possible to tighten this worst case */
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004680 if (len > PY_SSIZE_T_MAX / 8)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004681 return PyErr_NoMemory();
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004682 v = PyBytes_FromStringAndSize(NULL, len * 8);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004683 if (v == NULL)
4684 return NULL;
4685
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004686 start = out = PyBytes_AS_STRING(v);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004687 for (i = 0; i < len; ++i) {
Victor Stinner0e368262011-11-10 20:12:49 +01004688 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004689
Antoine Pitrou244651a2009-05-04 18:56:13 +00004690 if (inShift) {
4691 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4692 /* shifting out */
4693 if (base64bits) { /* output remaining bits */
4694 *out++ = TO_BASE64(base64buffer << (6-base64bits));
4695 base64buffer = 0;
4696 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004697 }
4698 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004699 /* Characters not in the BASE64 set implicitly unshift the sequence
4700 so no '-' is required, except if the character is itself a '-' */
4701 if (IS_BASE64(ch) || ch == '-') {
4702 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004703 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004704 *out++ = (char) ch;
4705 }
4706 else {
4707 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00004708 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004709 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004710 else { /* not in a shift sequence */
4711 if (ch == '+') {
4712 *out++ = '+';
4713 *out++ = '-';
4714 }
4715 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4716 *out++ = (char) ch;
4717 }
4718 else {
4719 *out++ = '+';
4720 inShift = 1;
4721 goto encode_char;
4722 }
4723 }
4724 continue;
4725encode_char:
Antoine Pitrou244651a2009-05-04 18:56:13 +00004726 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01004727 assert(ch <= MAX_UNICODE);
Victor Stinner0d3721d2011-11-22 03:27:53 +01004728
Antoine Pitrou244651a2009-05-04 18:56:13 +00004729 /* code first surrogate */
4730 base64bits += 16;
Victor Stinner76df43d2012-10-30 01:42:39 +01004731 base64buffer = (base64buffer << 16) | Py_UNICODE_HIGH_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004732 while (base64bits >= 6) {
4733 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4734 base64bits -= 6;
4735 }
4736 /* prepare second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004737 ch = Py_UNICODE_LOW_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004738 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004739 base64bits += 16;
4740 base64buffer = (base64buffer << 16) | ch;
4741 while (base64bits >= 6) {
4742 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4743 base64bits -= 6;
4744 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00004745 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004746 if (base64bits)
4747 *out++= TO_BASE64(base64buffer << (6-base64bits) );
4748 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004749 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004750 if (_PyBytes_Resize(&v, out - start) < 0)
4751 return NULL;
4752 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004753}
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004754PyObject *
4755PyUnicode_EncodeUTF7(const Py_UNICODE *s,
4756 Py_ssize_t size,
4757 int base64SetO,
4758 int base64WhiteSpace,
4759 const char *errors)
4760{
4761 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02004762 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004763 if (tmp == NULL)
4764 return NULL;
Victor Stinner0e368262011-11-10 20:12:49 +01004765 result = _PyUnicode_EncodeUTF7(tmp, base64SetO,
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004766 base64WhiteSpace, errors);
4767 Py_DECREF(tmp);
4768 return result;
4769}
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004770
Antoine Pitrou244651a2009-05-04 18:56:13 +00004771#undef IS_BASE64
4772#undef FROM_BASE64
4773#undef TO_BASE64
4774#undef DECODE_DIRECT
4775#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004776
Guido van Rossumd57fd912000-03-10 22:53:23 +00004777/* --- UTF-8 Codec -------------------------------------------------------- */
4778
Alexander Belopolsky40018472011-02-26 01:02:56 +00004779PyObject *
4780PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004781 Py_ssize_t size,
4782 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004783{
Walter Dörwald69652032004-09-07 20:24:22 +00004784 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4785}
4786
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004787#include "stringlib/asciilib.h"
4788#include "stringlib/codecs.h"
4789#include "stringlib/undef.h"
4790
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004791#include "stringlib/ucs1lib.h"
4792#include "stringlib/codecs.h"
4793#include "stringlib/undef.h"
4794
4795#include "stringlib/ucs2lib.h"
4796#include "stringlib/codecs.h"
4797#include "stringlib/undef.h"
4798
4799#include "stringlib/ucs4lib.h"
4800#include "stringlib/codecs.h"
4801#include "stringlib/undef.h"
4802
Antoine Pitrouab868312009-01-10 15:40:25 +00004803/* Mask to quickly check whether a C 'long' contains a
4804 non-ASCII, UTF8-encoded char. */
4805#if (SIZEOF_LONG == 8)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004806# define ASCII_CHAR_MASK 0x8080808080808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004807#elif (SIZEOF_LONG == 4)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004808# define ASCII_CHAR_MASK 0x80808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004809#else
4810# error C 'long' size should be either 4 or 8!
4811#endif
4812
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004813static Py_ssize_t
4814ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004815{
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004816 const char *p = start;
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004817 const char *aligned_end = (const char *) _Py_ALIGN_DOWN(end, SIZEOF_LONG);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004818
Antoine Pitrou8b0e9842013-05-11 15:58:34 +02004819 /*
4820 * Issue #17237: m68k is a bit different from most architectures in
4821 * that objects do not use "natural alignment" - for example, int and
4822 * long are only aligned at 2-byte boundaries. Therefore the assert()
4823 * won't work; also, tests have shown that skipping the "optimised
4824 * version" will even speed up m68k.
4825 */
4826#if !defined(__m68k__)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004827#if SIZEOF_LONG <= SIZEOF_VOID_P
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004828 assert(_Py_IS_ALIGNED(dest, SIZEOF_LONG));
4829 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004830 /* Fast path, see in STRINGLIB(utf8_decode) for
4831 an explanation. */
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02004832 /* Help allocation */
4833 const char *_p = p;
4834 Py_UCS1 * q = dest;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004835 while (_p < aligned_end) {
4836 unsigned long value = *(const unsigned long *) _p;
4837 if (value & ASCII_CHAR_MASK)
Benjamin Peterson29060642009-01-31 22:14:21 +00004838 break;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004839 *((unsigned long *)q) = value;
4840 _p += SIZEOF_LONG;
4841 q += SIZEOF_LONG;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004842 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004843 p = _p;
4844 while (p < end) {
4845 if ((unsigned char)*p & 0x80)
4846 break;
4847 *q++ = *p++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004848 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004849 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004850 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004851#endif
Antoine Pitrou8b0e9842013-05-11 15:58:34 +02004852#endif
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004853 while (p < end) {
4854 /* Fast path, see in STRINGLIB(utf8_decode) in stringlib/codecs.h
4855 for an explanation. */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004856 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02004857 /* Help allocation */
4858 const char *_p = p;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004859 while (_p < aligned_end) {
4860 unsigned long value = *(unsigned long *) _p;
4861 if (value & ASCII_CHAR_MASK)
4862 break;
4863 _p += SIZEOF_LONG;
4864 }
4865 p = _p;
4866 if (_p == end)
4867 break;
4868 }
4869 if ((unsigned char)*p & 0x80)
4870 break;
4871 ++p;
4872 }
4873 memcpy(dest, start, p - start);
4874 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004875}
Antoine Pitrouab868312009-01-10 15:40:25 +00004876
Victor Stinner709d23d2019-05-02 14:56:30 -04004877static PyObject *
4878unicode_decode_utf8(const char *s, Py_ssize_t size,
4879 _Py_error_handler error_handler, const char *errors,
4880 Py_ssize_t *consumed)
Victor Stinner785938e2011-12-11 20:09:03 +01004881{
Victor Stinner785938e2011-12-11 20:09:03 +01004882 if (size == 0) {
4883 if (consumed)
4884 *consumed = 0;
Serhiy Storchaka678db842013-01-26 12:16:36 +02004885 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner785938e2011-12-11 20:09:03 +01004886 }
4887
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004888 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
4889 if (size == 1 && (unsigned char)s[0] < 128) {
Victor Stinner785938e2011-12-11 20:09:03 +01004890 if (consumed)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004891 *consumed = 1;
4892 return get_latin1_char((unsigned char)s[0]);
Victor Stinner785938e2011-12-11 20:09:03 +01004893 }
4894
Inada Naoki770847a2019-06-24 12:30:24 +09004895 const char *starts = s;
4896 const char *end = s + size;
Victor Stinner785938e2011-12-11 20:09:03 +01004897
Inada Naoki770847a2019-06-24 12:30:24 +09004898 // fast path: try ASCII string.
4899 PyObject *u = PyUnicode_New(size, 127);
4900 if (u == NULL) {
4901 return NULL;
4902 }
4903 s += ascii_decode(s, end, PyUnicode_DATA(u));
4904 if (s == end) {
4905 return u;
4906 }
4907
4908 // Use _PyUnicodeWriter after fast path is failed.
4909 _PyUnicodeWriter writer;
4910 _PyUnicodeWriter_InitWithBuffer(&writer, u);
4911 writer.pos = s - starts;
4912
4913 Py_ssize_t startinpos, endinpos;
4914 const char *errmsg = "";
4915 PyObject *error_handler_obj = NULL;
4916 PyObject *exc = NULL;
4917
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004918 while (s < end) {
4919 Py_UCS4 ch;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004920 int kind = writer.kind;
Victor Stinner1d65d912015-10-05 13:43:50 +02004921
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004922 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004923 if (PyUnicode_IS_ASCII(writer.buffer))
4924 ch = asciilib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004925 else
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004926 ch = ucs1lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004927 } else if (kind == PyUnicode_2BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004928 ch = ucs2lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004929 } else {
4930 assert(kind == PyUnicode_4BYTE_KIND);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004931 ch = ucs4lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004932 }
4933
4934 switch (ch) {
4935 case 0:
4936 if (s == end || consumed)
4937 goto End;
4938 errmsg = "unexpected end of data";
4939 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02004940 endinpos = end - starts;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004941 break;
4942 case 1:
4943 errmsg = "invalid start byte";
4944 startinpos = s - starts;
4945 endinpos = startinpos + 1;
4946 break;
4947 case 2:
Ezio Melottif7ed5d12012-11-04 23:21:38 +02004948 case 3:
4949 case 4:
Serhiy Storchaka7a465cb2019-03-30 08:23:38 +02004950 if (s == end || consumed) {
4951 goto End;
4952 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004953 errmsg = "invalid continuation byte";
4954 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02004955 endinpos = startinpos + ch - 1;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004956 break;
4957 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004958 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004959 goto onError;
4960 continue;
4961 }
4962
Victor Stinner1d65d912015-10-05 13:43:50 +02004963 if (error_handler == _Py_ERROR_UNKNOWN)
Victor Stinner3d4226a2018-08-29 22:21:32 +02004964 error_handler = _Py_GetErrorHandler(errors);
Victor Stinner1d65d912015-10-05 13:43:50 +02004965
4966 switch (error_handler) {
4967 case _Py_ERROR_IGNORE:
4968 s += (endinpos - startinpos);
4969 break;
4970
4971 case _Py_ERROR_REPLACE:
4972 if (_PyUnicodeWriter_WriteCharInline(&writer, 0xfffd) < 0)
4973 goto onError;
4974 s += (endinpos - startinpos);
4975 break;
4976
4977 case _Py_ERROR_SURROGATEESCAPE:
Victor Stinner74e8fac2015-10-05 13:49:26 +02004978 {
4979 Py_ssize_t i;
4980
Victor Stinner1d65d912015-10-05 13:43:50 +02004981 if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
4982 goto onError;
Victor Stinner74e8fac2015-10-05 13:49:26 +02004983 for (i=startinpos; i<endinpos; i++) {
Victor Stinner1d65d912015-10-05 13:43:50 +02004984 ch = (Py_UCS4)(unsigned char)(starts[i]);
4985 PyUnicode_WRITE(writer.kind, writer.data, writer.pos,
4986 ch + 0xdc00);
4987 writer.pos++;
4988 }
4989 s += (endinpos - startinpos);
4990 break;
Victor Stinner74e8fac2015-10-05 13:49:26 +02004991 }
Victor Stinner1d65d912015-10-05 13:43:50 +02004992
4993 default:
4994 if (unicode_decode_call_errorhandler_writer(
4995 errors, &error_handler_obj,
4996 "utf-8", errmsg,
4997 &starts, &end, &startinpos, &endinpos, &exc, &s,
4998 &writer))
4999 goto onError;
5000 }
Victor Stinner785938e2011-12-11 20:09:03 +01005001 }
5002
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005003End:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005004 if (consumed)
5005 *consumed = s - starts;
5006
Victor Stinner1d65d912015-10-05 13:43:50 +02005007 Py_XDECREF(error_handler_obj);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005008 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005009 return _PyUnicodeWriter_Finish(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005010
5011onError:
Victor Stinner1d65d912015-10-05 13:43:50 +02005012 Py_XDECREF(error_handler_obj);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005013 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005014 _PyUnicodeWriter_Dealloc(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005015 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01005016}
5017
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005018
Victor Stinner709d23d2019-05-02 14:56:30 -04005019PyObject *
5020PyUnicode_DecodeUTF8Stateful(const char *s,
5021 Py_ssize_t size,
5022 const char *errors,
5023 Py_ssize_t *consumed)
5024{
5025 return unicode_decode_utf8(s, size, _Py_ERROR_UNKNOWN, errors, consumed);
5026}
5027
5028
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005029/* UTF-8 decoder: use surrogateescape error handler if 'surrogateescape' is
5030 non-zero, use strict error handler otherwise.
Victor Stinner0d92c4f2012-11-12 23:32:21 +01005031
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005032 On success, write a pointer to a newly allocated wide character string into
5033 *wstr (use PyMem_RawFree() to free the memory) and write the output length
5034 (in number of wchar_t units) into *wlen (if wlen is set).
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005035
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005036 On memory allocation failure, return -1.
5037
5038 On decoding error (if surrogateescape is zero), return -2. If wlen is
5039 non-NULL, write the start of the illegal byte sequence into *wlen. If reason
5040 is not NULL, write the decoding error message into *reason. */
5041int
5042_Py_DecodeUTF8Ex(const char *s, Py_ssize_t size, wchar_t **wstr, size_t *wlen,
Victor Stinner3d4226a2018-08-29 22:21:32 +02005043 const char **reason, _Py_error_handler errors)
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005044{
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005045 const char *orig_s = s;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005046 const char *e;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005047 wchar_t *unicode;
5048 Py_ssize_t outpos;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005049
Victor Stinner3d4226a2018-08-29 22:21:32 +02005050 int surrogateescape = 0;
5051 int surrogatepass = 0;
5052 switch (errors)
5053 {
5054 case _Py_ERROR_STRICT:
5055 break;
5056 case _Py_ERROR_SURROGATEESCAPE:
5057 surrogateescape = 1;
5058 break;
5059 case _Py_ERROR_SURROGATEPASS:
5060 surrogatepass = 1;
5061 break;
5062 default:
5063 return -3;
5064 }
5065
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005066 /* Note: size will always be longer than the resulting Unicode
5067 character count */
Victor Stinner91106cd2017-12-13 12:29:09 +01005068 if (PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) < (size + 1)) {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005069 return -1;
Victor Stinner91106cd2017-12-13 12:29:09 +01005070 }
5071
Victor Stinner6f8eeee2013-07-07 22:57:45 +02005072 unicode = PyMem_RawMalloc((size + 1) * sizeof(wchar_t));
Victor Stinner91106cd2017-12-13 12:29:09 +01005073 if (!unicode) {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005074 return -1;
Victor Stinner91106cd2017-12-13 12:29:09 +01005075 }
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005076
5077 /* Unpack UTF-8 encoded data */
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005078 e = s + size;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005079 outpos = 0;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005080 while (s < e) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005081 Py_UCS4 ch;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005082#if SIZEOF_WCHAR_T == 4
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005083 ch = ucs4lib_utf8_decode(&s, e, (Py_UCS4 *)unicode, &outpos);
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005084#else
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005085 ch = ucs2lib_utf8_decode(&s, e, (Py_UCS2 *)unicode, &outpos);
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005086#endif
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005087 if (ch > 0xFF) {
5088#if SIZEOF_WCHAR_T == 4
Barry Warsawb2e57942017-09-14 18:13:16 -07005089 Py_UNREACHABLE();
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005090#else
Serhiy Storchakab6266432016-11-12 14:28:06 +02005091 assert(ch > 0xFFFF && ch <= MAX_UNICODE);
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005092 /* write a surrogate pair */
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005093 unicode[outpos++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
5094 unicode[outpos++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
5095#endif
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005096 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005097 else {
Victor Stinner3d4226a2018-08-29 22:21:32 +02005098 if (!ch && s == e) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005099 break;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005100 }
Victor Stinner3d4226a2018-08-29 22:21:32 +02005101
5102 if (surrogateescape) {
5103 unicode[outpos++] = 0xDC00 + (unsigned char)*s++;
5104 }
5105 else {
5106 /* Is it a valid three-byte code? */
5107 if (surrogatepass
5108 && (e - s) >= 3
5109 && (s[0] & 0xf0) == 0xe0
5110 && (s[1] & 0xc0) == 0x80
5111 && (s[2] & 0xc0) == 0x80)
5112 {
5113 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
5114 s += 3;
5115 unicode[outpos++] = ch;
5116 }
5117 else {
5118 PyMem_RawFree(unicode );
5119 if (reason != NULL) {
5120 switch (ch) {
5121 case 0:
5122 *reason = "unexpected end of data";
5123 break;
5124 case 1:
5125 *reason = "invalid start byte";
5126 break;
5127 /* 2, 3, 4 */
5128 default:
5129 *reason = "invalid continuation byte";
5130 break;
5131 }
5132 }
5133 if (wlen != NULL) {
5134 *wlen = s - orig_s;
5135 }
5136 return -2;
5137 }
5138 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005139 }
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005140 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005141 unicode[outpos] = L'\0';
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005142 if (wlen) {
5143 *wlen = outpos;
Victor Stinner91106cd2017-12-13 12:29:09 +01005144 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005145 *wstr = unicode;
5146 return 0;
5147}
5148
Victor Stinner5f9cf232019-03-19 01:46:25 +01005149
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005150wchar_t*
Victor Stinner5f9cf232019-03-19 01:46:25 +01005151_Py_DecodeUTF8_surrogateescape(const char *arg, Py_ssize_t arglen,
5152 size_t *wlen)
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005153{
5154 wchar_t *wstr;
Victor Stinner5f9cf232019-03-19 01:46:25 +01005155 int res = _Py_DecodeUTF8Ex(arg, arglen,
5156 &wstr, wlen,
5157 NULL, _Py_ERROR_SURROGATEESCAPE);
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005158 if (res != 0) {
Victor Stinner5f9cf232019-03-19 01:46:25 +01005159 /* _Py_DecodeUTF8Ex() must support _Py_ERROR_SURROGATEESCAPE */
5160 assert(res != -3);
5161 if (wlen) {
5162 *wlen = (size_t)res;
5163 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005164 return NULL;
5165 }
5166 return wstr;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005167}
5168
Antoine Pitrouab868312009-01-10 15:40:25 +00005169
Victor Stinnere47e6982017-12-21 15:45:16 +01005170/* UTF-8 encoder using the surrogateescape error handler .
5171
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005172 On success, return 0 and write the newly allocated character string (use
5173 PyMem_Free() to free the memory) into *str.
Victor Stinnere47e6982017-12-21 15:45:16 +01005174
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005175 On encoding failure, return -2 and write the position of the invalid
5176 surrogate character into *error_pos (if error_pos is set) and the decoding
5177 error message into *reason (if reason is set).
Victor Stinnere47e6982017-12-21 15:45:16 +01005178
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005179 On memory allocation failure, return -1. */
5180int
5181_Py_EncodeUTF8Ex(const wchar_t *text, char **str, size_t *error_pos,
Victor Stinner3d4226a2018-08-29 22:21:32 +02005182 const char **reason, int raw_malloc, _Py_error_handler errors)
Victor Stinnere47e6982017-12-21 15:45:16 +01005183{
5184 const Py_ssize_t max_char_size = 4;
5185 Py_ssize_t len = wcslen(text);
5186
5187 assert(len >= 0);
5188
Victor Stinner3d4226a2018-08-29 22:21:32 +02005189 int surrogateescape = 0;
5190 int surrogatepass = 0;
5191 switch (errors)
5192 {
5193 case _Py_ERROR_STRICT:
5194 break;
5195 case _Py_ERROR_SURROGATEESCAPE:
5196 surrogateescape = 1;
5197 break;
5198 case _Py_ERROR_SURROGATEPASS:
5199 surrogatepass = 1;
5200 break;
5201 default:
5202 return -3;
5203 }
5204
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005205 if (len > PY_SSIZE_T_MAX / max_char_size - 1) {
5206 return -1;
5207 }
Victor Stinnere47e6982017-12-21 15:45:16 +01005208 char *bytes;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005209 if (raw_malloc) {
5210 bytes = PyMem_RawMalloc((len + 1) * max_char_size);
Victor Stinnere47e6982017-12-21 15:45:16 +01005211 }
5212 else {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005213 bytes = PyMem_Malloc((len + 1) * max_char_size);
Victor Stinnere47e6982017-12-21 15:45:16 +01005214 }
5215 if (bytes == NULL) {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005216 return -1;
Victor Stinnere47e6982017-12-21 15:45:16 +01005217 }
5218
5219 char *p = bytes;
5220 Py_ssize_t i;
Victor Stinner3d4226a2018-08-29 22:21:32 +02005221 for (i = 0; i < len; ) {
5222 Py_ssize_t ch_pos = i;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005223 Py_UCS4 ch = text[i];
Victor Stinner3d4226a2018-08-29 22:21:32 +02005224 i++;
5225#if Py_UNICODE_SIZE == 2
5226 if (Py_UNICODE_IS_HIGH_SURROGATE(ch)
5227 && i < len
5228 && Py_UNICODE_IS_LOW_SURROGATE(text[i]))
5229 {
5230 ch = Py_UNICODE_JOIN_SURROGATES(ch, text[i]);
5231 i++;
5232 }
5233#endif
Victor Stinnere47e6982017-12-21 15:45:16 +01005234
5235 if (ch < 0x80) {
5236 /* Encode ASCII */
5237 *p++ = (char) ch;
5238
5239 }
5240 else if (ch < 0x0800) {
5241 /* Encode Latin-1 */
5242 *p++ = (char)(0xc0 | (ch >> 6));
5243 *p++ = (char)(0x80 | (ch & 0x3f));
5244 }
Victor Stinner3d4226a2018-08-29 22:21:32 +02005245 else if (Py_UNICODE_IS_SURROGATE(ch) && !surrogatepass) {
Victor Stinnere47e6982017-12-21 15:45:16 +01005246 /* surrogateescape error handler */
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005247 if (!surrogateescape || !(0xDC80 <= ch && ch <= 0xDCFF)) {
Victor Stinnere47e6982017-12-21 15:45:16 +01005248 if (error_pos != NULL) {
Victor Stinner3d4226a2018-08-29 22:21:32 +02005249 *error_pos = (size_t)ch_pos;
Victor Stinnere47e6982017-12-21 15:45:16 +01005250 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005251 if (reason != NULL) {
5252 *reason = "encoding error";
5253 }
5254 if (raw_malloc) {
5255 PyMem_RawFree(bytes);
5256 }
5257 else {
5258 PyMem_Free(bytes);
5259 }
5260 return -2;
Victor Stinnere47e6982017-12-21 15:45:16 +01005261 }
5262 *p++ = (char)(ch & 0xff);
5263 }
5264 else if (ch < 0x10000) {
5265 *p++ = (char)(0xe0 | (ch >> 12));
5266 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
5267 *p++ = (char)(0x80 | (ch & 0x3f));
5268 }
5269 else { /* ch >= 0x10000 */
5270 assert(ch <= MAX_UNICODE);
5271 /* Encode UCS4 Unicode ordinals */
5272 *p++ = (char)(0xf0 | (ch >> 18));
5273 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
5274 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
5275 *p++ = (char)(0x80 | (ch & 0x3f));
5276 }
5277 }
5278 *p++ = '\0';
5279
5280 size_t final_size = (p - bytes);
Victor Stinner9dd76202017-12-21 16:20:32 +01005281 char *bytes2;
5282 if (raw_malloc) {
5283 bytes2 = PyMem_RawRealloc(bytes, final_size);
5284 }
5285 else {
5286 bytes2 = PyMem_Realloc(bytes, final_size);
5287 }
Victor Stinnere47e6982017-12-21 15:45:16 +01005288 if (bytes2 == NULL) {
5289 if (error_pos != NULL) {
5290 *error_pos = (size_t)-1;
5291 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005292 if (raw_malloc) {
5293 PyMem_RawFree(bytes);
5294 }
5295 else {
5296 PyMem_Free(bytes);
5297 }
5298 return -1;
Victor Stinnere47e6982017-12-21 15:45:16 +01005299 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005300 *str = bytes2;
5301 return 0;
Victor Stinnere47e6982017-12-21 15:45:16 +01005302}
5303
5304
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005305/* Primary internal function which creates utf8 encoded bytes objects.
5306
5307 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00005308 and allocate exactly as much space needed at the end. Else allocate the
5309 maximum possible needed (4 result bytes per Unicode character), and return
5310 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00005311*/
Victor Stinner709d23d2019-05-02 14:56:30 -04005312static PyObject *
5313unicode_encode_utf8(PyObject *unicode, _Py_error_handler error_handler,
5314 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005315{
Victor Stinner6099a032011-12-18 14:22:26 +01005316 enum PyUnicode_Kind kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005317 void *data;
5318 Py_ssize_t size;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00005319
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005320 if (!PyUnicode_Check(unicode)) {
5321 PyErr_BadArgument();
5322 return NULL;
5323 }
5324
5325 if (PyUnicode_READY(unicode) == -1)
5326 return NULL;
5327
Victor Stinnere90fe6a2011-10-01 16:48:13 +02005328 if (PyUnicode_UTF8(unicode))
5329 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
5330 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005331
5332 kind = PyUnicode_KIND(unicode);
5333 data = PyUnicode_DATA(unicode);
5334 size = PyUnicode_GET_LENGTH(unicode);
5335
Benjamin Petersonead6b532011-12-20 17:23:42 -06005336 switch (kind) {
Victor Stinner6099a032011-12-18 14:22:26 +01005337 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07005338 Py_UNREACHABLE();
Victor Stinner6099a032011-12-18 14:22:26 +01005339 case PyUnicode_1BYTE_KIND:
5340 /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
5341 assert(!PyUnicode_IS_ASCII(unicode));
Victor Stinner709d23d2019-05-02 14:56:30 -04005342 return ucs1lib_utf8_encoder(unicode, data, size, error_handler, errors);
Victor Stinner6099a032011-12-18 14:22:26 +01005343 case PyUnicode_2BYTE_KIND:
Victor Stinner709d23d2019-05-02 14:56:30 -04005344 return ucs2lib_utf8_encoder(unicode, data, size, error_handler, errors);
Victor Stinner6099a032011-12-18 14:22:26 +01005345 case PyUnicode_4BYTE_KIND:
Victor Stinner709d23d2019-05-02 14:56:30 -04005346 return ucs4lib_utf8_encoder(unicode, data, size, error_handler, errors);
Tim Peters602f7402002-04-27 18:03:26 +00005347 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005348}
5349
Alexander Belopolsky40018472011-02-26 01:02:56 +00005350PyObject *
Victor Stinner709d23d2019-05-02 14:56:30 -04005351_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
5352{
5353 return unicode_encode_utf8(unicode, _Py_ERROR_UNKNOWN, errors);
5354}
5355
5356
5357PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005358PyUnicode_EncodeUTF8(const Py_UNICODE *s,
5359 Py_ssize_t size,
5360 const char *errors)
5361{
5362 PyObject *v, *unicode;
5363
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02005364 unicode = PyUnicode_FromWideChar(s, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005365 if (unicode == NULL)
5366 return NULL;
5367 v = _PyUnicode_AsUTF8String(unicode, errors);
5368 Py_DECREF(unicode);
5369 return v;
5370}
5371
5372PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005373PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005374{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005375 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005376}
5377
Walter Dörwald41980ca2007-08-16 21:55:45 +00005378/* --- UTF-32 Codec ------------------------------------------------------- */
5379
5380PyObject *
5381PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005382 Py_ssize_t size,
5383 const char *errors,
5384 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005385{
5386 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
5387}
5388
5389PyObject *
5390PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005391 Py_ssize_t size,
5392 const char *errors,
5393 int *byteorder,
5394 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005395{
5396 const char *starts = s;
5397 Py_ssize_t startinpos;
5398 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005399 _PyUnicodeWriter writer;
Mark Dickinson7db923c2010-06-12 09:10:14 +00005400 const unsigned char *q, *e;
Victor Stinnere64322e2012-10-30 23:12:47 +01005401 int le, bo = 0; /* assume native ordering by default */
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005402 const char *encoding;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005403 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00005404 PyObject *errorHandler = NULL;
5405 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00005406
Walter Dörwald41980ca2007-08-16 21:55:45 +00005407 q = (unsigned char *)s;
5408 e = q + size;
5409
5410 if (byteorder)
5411 bo = *byteorder;
5412
5413 /* Check for BOM marks (U+FEFF) in the input and adjust current
5414 byte order setting accordingly. In native mode, the leading BOM
5415 mark is skipped, in all other modes, it is copied to the output
5416 stream as-is (giving a ZWNBSP character). */
Victor Stinnere64322e2012-10-30 23:12:47 +01005417 if (bo == 0 && size >= 4) {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005418 Py_UCS4 bom = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
Victor Stinnere64322e2012-10-30 23:12:47 +01005419 if (bom == 0x0000FEFF) {
5420 bo = -1;
5421 q += 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00005422 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005423 else if (bom == 0xFFFE0000) {
5424 bo = 1;
5425 q += 4;
5426 }
5427 if (byteorder)
5428 *byteorder = bo;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005429 }
5430
Victor Stinnere64322e2012-10-30 23:12:47 +01005431 if (q == e) {
5432 if (consumed)
5433 *consumed = size;
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02005434 _Py_RETURN_UNICODE_EMPTY();
Walter Dörwald41980ca2007-08-16 21:55:45 +00005435 }
5436
Victor Stinnere64322e2012-10-30 23:12:47 +01005437#ifdef WORDS_BIGENDIAN
5438 le = bo < 0;
5439#else
5440 le = bo <= 0;
5441#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005442 encoding = le ? "utf-32-le" : "utf-32-be";
Victor Stinnere64322e2012-10-30 23:12:47 +01005443
Victor Stinner8f674cc2013-04-17 23:02:17 +02005444 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02005445 writer.min_length = (e - q + 3) / 4;
5446 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005447 goto onError;
Victor Stinnere64322e2012-10-30 23:12:47 +01005448
Victor Stinnere64322e2012-10-30 23:12:47 +01005449 while (1) {
5450 Py_UCS4 ch = 0;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005451 Py_UCS4 maxch = PyUnicode_MAX_CHAR_VALUE(writer.buffer);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005452
Victor Stinnere64322e2012-10-30 23:12:47 +01005453 if (e - q >= 4) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005454 enum PyUnicode_Kind kind = writer.kind;
5455 void *data = writer.data;
Victor Stinnere64322e2012-10-30 23:12:47 +01005456 const unsigned char *last = e - 4;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005457 Py_ssize_t pos = writer.pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01005458 if (le) {
5459 do {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005460 ch = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
Victor Stinnere64322e2012-10-30 23:12:47 +01005461 if (ch > maxch)
5462 break;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005463 if (kind != PyUnicode_1BYTE_KIND &&
5464 Py_UNICODE_IS_SURROGATE(ch))
5465 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005466 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01005467 q += 4;
5468 } while (q <= last);
5469 }
5470 else {
5471 do {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005472 ch = ((unsigned int)q[0] << 24) | (q[1] << 16) | (q[2] << 8) | q[3];
Victor Stinnere64322e2012-10-30 23:12:47 +01005473 if (ch > maxch)
5474 break;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005475 if (kind != PyUnicode_1BYTE_KIND &&
5476 Py_UNICODE_IS_SURROGATE(ch))
5477 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005478 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01005479 q += 4;
5480 } while (q <= last);
5481 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005482 writer.pos = pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01005483 }
5484
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005485 if (Py_UNICODE_IS_SURROGATE(ch)) {
Serhiy Storchakad3faf432015-01-18 11:28:37 +02005486 errmsg = "code point in surrogate code point range(0xd800, 0xe000)";
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005487 startinpos = ((const char *)q) - starts;
5488 endinpos = startinpos + 4;
5489 }
5490 else if (ch <= maxch) {
Victor Stinnere64322e2012-10-30 23:12:47 +01005491 if (q == e || consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005492 break;
Victor Stinnere64322e2012-10-30 23:12:47 +01005493 /* remaining bytes at the end? (size should be divisible by 4) */
Benjamin Peterson29060642009-01-31 22:14:21 +00005494 errmsg = "truncated data";
Victor Stinnere64322e2012-10-30 23:12:47 +01005495 startinpos = ((const char *)q) - starts;
5496 endinpos = ((const char *)e) - starts;
Benjamin Peterson29060642009-01-31 22:14:21 +00005497 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005498 else {
5499 if (ch < 0x110000) {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005500 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinnere64322e2012-10-30 23:12:47 +01005501 goto onError;
5502 q += 4;
5503 continue;
5504 }
Serhiy Storchakad3faf432015-01-18 11:28:37 +02005505 errmsg = "code point not in range(0x110000)";
Victor Stinnere64322e2012-10-30 23:12:47 +01005506 startinpos = ((const char *)q) - starts;
5507 endinpos = startinpos + 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00005508 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005509
5510 /* The remaining input chars are ignored if the callback
5511 chooses to skip the input */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005512 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00005513 errors, &errorHandler,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005514 encoding, errmsg,
Benjamin Peterson29060642009-01-31 22:14:21 +00005515 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005516 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005517 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005518 }
5519
Walter Dörwald41980ca2007-08-16 21:55:45 +00005520 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005521 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005522
Walter Dörwald41980ca2007-08-16 21:55:45 +00005523 Py_XDECREF(errorHandler);
5524 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005525 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005526
Benjamin Peterson29060642009-01-31 22:14:21 +00005527 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005528 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005529 Py_XDECREF(errorHandler);
5530 Py_XDECREF(exc);
5531 return NULL;
5532}
5533
5534PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005535_PyUnicode_EncodeUTF32(PyObject *str,
5536 const char *errors,
5537 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005538{
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005539 enum PyUnicode_Kind kind;
5540 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005541 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005542 PyObject *v;
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005543 uint32_t *out;
Christian Heimes743e0cd2012-10-17 23:52:17 +02005544#if PY_LITTLE_ENDIAN
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005545 int native_ordering = byteorder <= 0;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005546#else
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005547 int native_ordering = byteorder >= 0;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005548#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005549 const char *encoding;
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005550 Py_ssize_t nsize, pos;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005551 PyObject *errorHandler = NULL;
5552 PyObject *exc = NULL;
5553 PyObject *rep = NULL;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005554
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005555 if (!PyUnicode_Check(str)) {
5556 PyErr_BadArgument();
5557 return NULL;
5558 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005559 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005560 return NULL;
5561 kind = PyUnicode_KIND(str);
5562 data = PyUnicode_DATA(str);
5563 len = PyUnicode_GET_LENGTH(str);
5564
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005565 if (len > PY_SSIZE_T_MAX / 4 - (byteorder == 0))
Serhiy Storchaka30793282014-01-04 22:44:01 +02005566 return PyErr_NoMemory();
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005567 nsize = len + (byteorder == 0);
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01005568 v = PyBytes_FromStringAndSize(NULL, nsize * 4);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005569 if (v == NULL)
5570 return NULL;
5571
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005572 /* output buffer is 4-bytes aligned */
5573 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 4));
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005574 out = (uint32_t *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005575 if (byteorder == 0)
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005576 *out++ = 0xFEFF;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005577 if (len == 0)
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005578 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005579
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005580 if (byteorder == -1)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005581 encoding = "utf-32-le";
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005582 else if (byteorder == 1)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005583 encoding = "utf-32-be";
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005584 else
5585 encoding = "utf-32";
5586
5587 if (kind == PyUnicode_1BYTE_KIND) {
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005588 ucs1lib_utf32_encode((const Py_UCS1 *)data, len, &out, native_ordering);
5589 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005590 }
5591
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005592 pos = 0;
5593 while (pos < len) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005594 Py_ssize_t repsize, moreunits;
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005595
5596 if (kind == PyUnicode_2BYTE_KIND) {
5597 pos += ucs2lib_utf32_encode((const Py_UCS2 *)data + pos, len - pos,
5598 &out, native_ordering);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005599 }
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005600 else {
5601 assert(kind == PyUnicode_4BYTE_KIND);
5602 pos += ucs4lib_utf32_encode((const Py_UCS4 *)data + pos, len - pos,
5603 &out, native_ordering);
5604 }
5605 if (pos == len)
5606 break;
Guido van Rossum98297ee2007-11-06 21:34:58 +00005607
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005608 rep = unicode_encode_call_errorhandler(
5609 errors, &errorHandler,
5610 encoding, "surrogates not allowed",
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005611 str, &exc, pos, pos + 1, &pos);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005612 if (!rep)
5613 goto error;
5614
5615 if (PyBytes_Check(rep)) {
5616 repsize = PyBytes_GET_SIZE(rep);
5617 if (repsize & 3) {
5618 raise_encode_exception(&exc, encoding,
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005619 str, pos - 1, pos,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005620 "surrogates not allowed");
5621 goto error;
5622 }
5623 moreunits = repsize / 4;
5624 }
5625 else {
5626 assert(PyUnicode_Check(rep));
5627 if (PyUnicode_READY(rep) < 0)
5628 goto error;
5629 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5630 if (!PyUnicode_IS_ASCII(rep)) {
5631 raise_encode_exception(&exc, encoding,
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005632 str, pos - 1, pos,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005633 "surrogates not allowed");
5634 goto error;
5635 }
5636 }
5637
5638 /* four bytes are reserved for each surrogate */
5639 if (moreunits > 1) {
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005640 Py_ssize_t outpos = out - (uint32_t*) PyBytes_AS_STRING(v);
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03005641 if (moreunits >= (PY_SSIZE_T_MAX - PyBytes_GET_SIZE(v)) / 4) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005642 /* integer overflow */
5643 PyErr_NoMemory();
5644 goto error;
5645 }
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03005646 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + 4 * (moreunits - 1)) < 0)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005647 goto error;
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005648 out = (uint32_t*) PyBytes_AS_STRING(v) + outpos;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005649 }
5650
5651 if (PyBytes_Check(rep)) {
Christian Heimesf051e432016-09-13 20:22:02 +02005652 memcpy(out, PyBytes_AS_STRING(rep), repsize);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005653 out += moreunits;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005654 } else /* rep is unicode */ {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005655 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005656 ucs1lib_utf32_encode(PyUnicode_1BYTE_DATA(rep), repsize,
5657 &out, native_ordering);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005658 }
5659
5660 Py_CLEAR(rep);
5661 }
5662
5663 /* Cut back to size actually needed. This is necessary for, for example,
5664 encoding of a string containing isolated surrogates and the 'ignore'
5665 handler is used. */
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005666 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005667 if (nsize != PyBytes_GET_SIZE(v))
5668 _PyBytes_Resize(&v, nsize);
5669 Py_XDECREF(errorHandler);
5670 Py_XDECREF(exc);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005671 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005672 return v;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005673 error:
5674 Py_XDECREF(rep);
5675 Py_XDECREF(errorHandler);
5676 Py_XDECREF(exc);
5677 Py_XDECREF(v);
5678 return NULL;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005679}
5680
Alexander Belopolsky40018472011-02-26 01:02:56 +00005681PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005682PyUnicode_EncodeUTF32(const Py_UNICODE *s,
5683 Py_ssize_t size,
5684 const char *errors,
5685 int byteorder)
5686{
5687 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02005688 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005689 if (tmp == NULL)
5690 return NULL;
5691 result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder);
5692 Py_DECREF(tmp);
5693 return result;
5694}
5695
5696PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005697PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005698{
Victor Stinnerb960b342011-11-20 19:12:52 +01005699 return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005700}
5701
Guido van Rossumd57fd912000-03-10 22:53:23 +00005702/* --- UTF-16 Codec ------------------------------------------------------- */
5703
Tim Peters772747b2001-08-09 22:21:55 +00005704PyObject *
5705PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005706 Py_ssize_t size,
5707 const char *errors,
5708 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005709{
Walter Dörwald69652032004-09-07 20:24:22 +00005710 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5711}
5712
5713PyObject *
5714PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005715 Py_ssize_t size,
5716 const char *errors,
5717 int *byteorder,
5718 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00005719{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005720 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005721 Py_ssize_t startinpos;
5722 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005723 _PyUnicodeWriter writer;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005724 const unsigned char *q, *e;
Tim Peters772747b2001-08-09 22:21:55 +00005725 int bo = 0; /* assume native ordering by default */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005726 int native_ordering;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00005727 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005728 PyObject *errorHandler = NULL;
5729 PyObject *exc = NULL;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005730 const char *encoding;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005731
Tim Peters772747b2001-08-09 22:21:55 +00005732 q = (unsigned char *)s;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005733 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005734
5735 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00005736 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005737
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005738 /* Check for BOM marks (U+FEFF) in the input and adjust current
5739 byte order setting accordingly. In native mode, the leading BOM
5740 mark is skipped, in all other modes, it is copied to the output
5741 stream as-is (giving a ZWNBSP character). */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005742 if (bo == 0 && size >= 2) {
5743 const Py_UCS4 bom = (q[1] << 8) | q[0];
5744 if (bom == 0xFEFF) {
5745 q += 2;
5746 bo = -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00005747 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005748 else if (bom == 0xFFFE) {
5749 q += 2;
5750 bo = 1;
5751 }
5752 if (byteorder)
5753 *byteorder = bo;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005754 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005755
Antoine Pitrou63065d72012-05-15 23:48:04 +02005756 if (q == e) {
5757 if (consumed)
5758 *consumed = size;
Serhiy Storchaka678db842013-01-26 12:16:36 +02005759 _Py_RETURN_UNICODE_EMPTY();
Tim Peters772747b2001-08-09 22:21:55 +00005760 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005761
Christian Heimes743e0cd2012-10-17 23:52:17 +02005762#if PY_LITTLE_ENDIAN
Antoine Pitrou63065d72012-05-15 23:48:04 +02005763 native_ordering = bo <= 0;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005764 encoding = bo <= 0 ? "utf-16-le" : "utf-16-be";
Antoine Pitrouab868312009-01-10 15:40:25 +00005765#else
Antoine Pitrou63065d72012-05-15 23:48:04 +02005766 native_ordering = bo >= 0;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005767 encoding = bo >= 0 ? "utf-16-be" : "utf-16-le";
Antoine Pitrouab868312009-01-10 15:40:25 +00005768#endif
Tim Peters772747b2001-08-09 22:21:55 +00005769
Antoine Pitrou63065d72012-05-15 23:48:04 +02005770 /* Note: size will always be longer than the resulting Unicode
Xiang Zhang2c7fd462018-01-31 20:48:05 +08005771 character count normally. Error handler will take care of
5772 resizing when needed. */
Victor Stinner8f674cc2013-04-17 23:02:17 +02005773 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02005774 writer.min_length = (e - q + 1) / 2;
5775 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005776 goto onError;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005777
Antoine Pitrou63065d72012-05-15 23:48:04 +02005778 while (1) {
5779 Py_UCS4 ch = 0;
5780 if (e - q >= 2) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005781 int kind = writer.kind;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005782 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005783 if (PyUnicode_IS_ASCII(writer.buffer))
Antoine Pitrou63065d72012-05-15 23:48:04 +02005784 ch = asciilib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005785 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005786 native_ordering);
5787 else
5788 ch = ucs1lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005789 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005790 native_ordering);
5791 } else if (kind == PyUnicode_2BYTE_KIND) {
5792 ch = ucs2lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005793 (Py_UCS2*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005794 native_ordering);
5795 } else {
5796 assert(kind == PyUnicode_4BYTE_KIND);
5797 ch = ucs4lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005798 (Py_UCS4*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005799 native_ordering);
Antoine Pitrouab868312009-01-10 15:40:25 +00005800 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005801 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005802
Antoine Pitrou63065d72012-05-15 23:48:04 +02005803 switch (ch)
5804 {
5805 case 0:
5806 /* remaining byte at the end? (size should be even) */
5807 if (q == e || consumed)
5808 goto End;
5809 errmsg = "truncated data";
5810 startinpos = ((const char *)q) - starts;
5811 endinpos = ((const char *)e) - starts;
5812 break;
5813 /* The remaining input chars are ignored if the callback
5814 chooses to skip the input */
5815 case 1:
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02005816 q -= 2;
5817 if (consumed)
Serhiy Storchakaae3b32a2013-01-08 23:40:52 +02005818 goto End;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005819 errmsg = "unexpected end of data";
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02005820 startinpos = ((const char *)q) - starts;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005821 endinpos = ((const char *)e) - starts;
5822 break;
5823 case 2:
5824 errmsg = "illegal encoding";
5825 startinpos = ((const char *)q) - 2 - starts;
5826 endinpos = startinpos + 2;
5827 break;
5828 case 3:
5829 errmsg = "illegal UTF-16 surrogate";
5830 startinpos = ((const char *)q) - 4 - starts;
5831 endinpos = startinpos + 2;
5832 break;
5833 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005834 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005835 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005836 continue;
5837 }
5838
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005839 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrouab868312009-01-10 15:40:25 +00005840 errors,
5841 &errorHandler,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005842 encoding, errmsg,
Antoine Pitrouab868312009-01-10 15:40:25 +00005843 &starts,
5844 (const char **)&e,
5845 &startinpos,
5846 &endinpos,
5847 &exc,
5848 (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005849 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005850 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005851 }
5852
Antoine Pitrou63065d72012-05-15 23:48:04 +02005853End:
Walter Dörwald69652032004-09-07 20:24:22 +00005854 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005855 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00005856
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005857 Py_XDECREF(errorHandler);
5858 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005859 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005860
Benjamin Peterson29060642009-01-31 22:14:21 +00005861 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005862 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005863 Py_XDECREF(errorHandler);
5864 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005865 return NULL;
5866}
5867
Tim Peters772747b2001-08-09 22:21:55 +00005868PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005869_PyUnicode_EncodeUTF16(PyObject *str,
5870 const char *errors,
5871 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005872{
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005873 enum PyUnicode_Kind kind;
5874 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005875 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005876 PyObject *v;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005877 unsigned short *out;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005878 Py_ssize_t pairs;
Christian Heimes743e0cd2012-10-17 23:52:17 +02005879#if PY_BIG_ENDIAN
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005880 int native_ordering = byteorder >= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005881#else
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005882 int native_ordering = byteorder <= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005883#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005884 const char *encoding;
5885 Py_ssize_t nsize, pos;
5886 PyObject *errorHandler = NULL;
5887 PyObject *exc = NULL;
5888 PyObject *rep = NULL;
Tim Peters772747b2001-08-09 22:21:55 +00005889
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005890 if (!PyUnicode_Check(str)) {
5891 PyErr_BadArgument();
5892 return NULL;
5893 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005894 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005895 return NULL;
5896 kind = PyUnicode_KIND(str);
5897 data = PyUnicode_DATA(str);
5898 len = PyUnicode_GET_LENGTH(str);
Victor Stinner0e368262011-11-10 20:12:49 +01005899
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005900 pairs = 0;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005901 if (kind == PyUnicode_4BYTE_KIND) {
5902 const Py_UCS4 *in = (const Py_UCS4 *)data;
5903 const Py_UCS4 *end = in + len;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005904 while (in < end) {
5905 if (*in++ >= 0x10000) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005906 pairs++;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005907 }
5908 }
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005909 }
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005910 if (len > PY_SSIZE_T_MAX / 2 - pairs - (byteorder == 0)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005911 return PyErr_NoMemory();
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005912 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005913 nsize = len + pairs + (byteorder == 0);
5914 v = PyBytes_FromStringAndSize(NULL, nsize * 2);
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005915 if (v == NULL) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005916 return NULL;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005917 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005918
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005919 /* output buffer is 2-bytes aligned */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02005920 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 2));
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005921 out = (unsigned short *)PyBytes_AS_STRING(v);
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005922 if (byteorder == 0) {
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005923 *out++ = 0xFEFF;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005924 }
5925 if (len == 0) {
Guido van Rossum98297ee2007-11-06 21:34:58 +00005926 goto done;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005927 }
Tim Peters772747b2001-08-09 22:21:55 +00005928
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005929 if (kind == PyUnicode_1BYTE_KIND) {
5930 ucs1lib_utf16_encode((const Py_UCS1 *)data, len, &out, native_ordering);
5931 goto done;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005932 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005933
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005934 if (byteorder < 0) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005935 encoding = "utf-16-le";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005936 }
5937 else if (byteorder > 0) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005938 encoding = "utf-16-be";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005939 }
5940 else {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005941 encoding = "utf-16";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005942 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005943
5944 pos = 0;
5945 while (pos < len) {
5946 Py_ssize_t repsize, moreunits;
5947
5948 if (kind == PyUnicode_2BYTE_KIND) {
5949 pos += ucs2lib_utf16_encode((const Py_UCS2 *)data + pos, len - pos,
5950 &out, native_ordering);
5951 }
5952 else {
5953 assert(kind == PyUnicode_4BYTE_KIND);
5954 pos += ucs4lib_utf16_encode((const Py_UCS4 *)data + pos, len - pos,
5955 &out, native_ordering);
5956 }
5957 if (pos == len)
5958 break;
5959
5960 rep = unicode_encode_call_errorhandler(
5961 errors, &errorHandler,
5962 encoding, "surrogates not allowed",
5963 str, &exc, pos, pos + 1, &pos);
5964 if (!rep)
5965 goto error;
5966
5967 if (PyBytes_Check(rep)) {
5968 repsize = PyBytes_GET_SIZE(rep);
5969 if (repsize & 1) {
5970 raise_encode_exception(&exc, encoding,
5971 str, pos - 1, pos,
5972 "surrogates not allowed");
5973 goto error;
5974 }
5975 moreunits = repsize / 2;
5976 }
5977 else {
5978 assert(PyUnicode_Check(rep));
5979 if (PyUnicode_READY(rep) < 0)
5980 goto error;
5981 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5982 if (!PyUnicode_IS_ASCII(rep)) {
5983 raise_encode_exception(&exc, encoding,
5984 str, pos - 1, pos,
5985 "surrogates not allowed");
5986 goto error;
5987 }
5988 }
5989
5990 /* two bytes are reserved for each surrogate */
5991 if (moreunits > 1) {
5992 Py_ssize_t outpos = out - (unsigned short*) PyBytes_AS_STRING(v);
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03005993 if (moreunits >= (PY_SSIZE_T_MAX - PyBytes_GET_SIZE(v)) / 2) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005994 /* integer overflow */
5995 PyErr_NoMemory();
5996 goto error;
5997 }
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03005998 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + 2 * (moreunits - 1)) < 0)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005999 goto error;
6000 out = (unsigned short*) PyBytes_AS_STRING(v) + outpos;
6001 }
6002
6003 if (PyBytes_Check(rep)) {
Christian Heimesf051e432016-09-13 20:22:02 +02006004 memcpy(out, PyBytes_AS_STRING(rep), repsize);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006005 out += moreunits;
6006 } else /* rep is unicode */ {
6007 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
6008 ucs1lib_utf16_encode(PyUnicode_1BYTE_DATA(rep), repsize,
6009 &out, native_ordering);
6010 }
6011
6012 Py_CLEAR(rep);
6013 }
6014
6015 /* Cut back to size actually needed. This is necessary for, for example,
6016 encoding of a string containing isolated surrogates and the 'ignore' handler
6017 is used. */
6018 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
6019 if (nsize != PyBytes_GET_SIZE(v))
6020 _PyBytes_Resize(&v, nsize);
6021 Py_XDECREF(errorHandler);
6022 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00006023 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006024 return v;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006025 error:
6026 Py_XDECREF(rep);
6027 Py_XDECREF(errorHandler);
6028 Py_XDECREF(exc);
6029 Py_XDECREF(v);
6030 return NULL;
6031#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00006032}
6033
Alexander Belopolsky40018472011-02-26 01:02:56 +00006034PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006035PyUnicode_EncodeUTF16(const Py_UNICODE *s,
6036 Py_ssize_t size,
6037 const char *errors,
6038 int byteorder)
6039{
6040 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02006041 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006042 if (tmp == NULL)
6043 return NULL;
6044 result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder);
6045 Py_DECREF(tmp);
6046 return result;
6047}
6048
6049PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00006050PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006051{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006052 return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006053}
6054
6055/* --- Unicode Escape Codec ----------------------------------------------- */
6056
/* Cached pointer to the character-name C API.  It starts out NULL and is
   filled in via PyCapsule_Import() the first time the unicode-escape
   decoder meets a \N{name} escape. */
Fredrik Lundh06d12682001-01-24 07:59:11 +00006057static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00006058
Alexander Belopolsky40018472011-02-26 01:02:56 +00006059PyObject *
Eric V. Smith42454af2016-10-31 09:22:08 -04006060_PyUnicode_DecodeUnicodeEscape(const char *s,
6061 Py_ssize_t size,
6062 const char *errors,
6063 const char **first_invalid_escape)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006064{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006065 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006066 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006067 const char *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006068 PyObject *errorHandler = NULL;
6069 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006070
Eric V. Smith42454af2016-10-31 09:22:08 -04006071 // so we can remember if we've seen an invalid escape char or not
6072 *first_invalid_escape = NULL;
6073
Victor Stinner62ec3312016-09-06 17:04:34 -07006074 if (size == 0) {
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02006075 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner62ec3312016-09-06 17:04:34 -07006076 }
6077 /* Escaped strings will always be longer than the resulting
6078 Unicode string, so we start with size here and then reduce the
6079 length after conversion to the true value.
6080 (but if the error callback returns a long replacement string
6081 we'll have to allocate more space) */
Victor Stinner8f674cc2013-04-17 23:02:17 +02006082 _PyUnicodeWriter_Init(&writer);
Victor Stinner62ec3312016-09-06 17:04:34 -07006083 writer.min_length = size;
6084 if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
6085 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006086 }
6087
Guido van Rossumd57fd912000-03-10 22:53:23 +00006088 end = s + size;
6089 while (s < end) {
Victor Stinner62ec3312016-09-06 17:04:34 -07006090 unsigned char c = (unsigned char) *s++;
6091 Py_UCS4 ch;
6092 int count;
6093 Py_ssize_t startinpos;
6094 Py_ssize_t endinpos;
6095 const char *message;
6096
6097#define WRITE_ASCII_CHAR(ch) \
6098 do { \
6099 assert(ch <= 127); \
6100 assert(writer.pos < writer.size); \
6101 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6102 } while(0)
6103
6104#define WRITE_CHAR(ch) \
6105 do { \
6106 if (ch <= writer.maxchar) { \
6107 assert(writer.pos < writer.size); \
6108 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6109 } \
6110 else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
6111 goto onError; \
6112 } \
6113 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006114
6115 /* Non-escape characters are interpreted as Unicode ordinals */
Victor Stinner62ec3312016-09-06 17:04:34 -07006116 if (c != '\\') {
6117 WRITE_CHAR(c);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006118 continue;
6119 }
6120
Victor Stinner62ec3312016-09-06 17:04:34 -07006121 startinpos = s - starts - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006122 /* \ - Escapes */
Victor Stinner62ec3312016-09-06 17:04:34 -07006123 if (s >= end) {
6124 message = "\\ at end of string";
6125 goto error;
6126 }
6127 c = (unsigned char) *s++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006128
Victor Stinner62ec3312016-09-06 17:04:34 -07006129 assert(writer.pos < writer.size);
Guido van Rossum8ce8a782007-11-01 19:42:39 +00006130 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006131
Benjamin Peterson29060642009-01-31 22:14:21 +00006132 /* \x escapes */
Victor Stinner62ec3312016-09-06 17:04:34 -07006133 case '\n': continue;
6134 case '\\': WRITE_ASCII_CHAR('\\'); continue;
6135 case '\'': WRITE_ASCII_CHAR('\''); continue;
6136 case '\"': WRITE_ASCII_CHAR('\"'); continue;
6137 case 'b': WRITE_ASCII_CHAR('\b'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006138 /* FF */
Victor Stinner62ec3312016-09-06 17:04:34 -07006139 case 'f': WRITE_ASCII_CHAR('\014'); continue;
6140 case 't': WRITE_ASCII_CHAR('\t'); continue;
6141 case 'n': WRITE_ASCII_CHAR('\n'); continue;
6142 case 'r': WRITE_ASCII_CHAR('\r'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006143 /* VT */
Victor Stinner62ec3312016-09-06 17:04:34 -07006144 case 'v': WRITE_ASCII_CHAR('\013'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006145 /* BEL, not classic C */
Victor Stinner62ec3312016-09-06 17:04:34 -07006146 case 'a': WRITE_ASCII_CHAR('\007'); continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006147
Benjamin Peterson29060642009-01-31 22:14:21 +00006148 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006149 case '0': case '1': case '2': case '3':
6150 case '4': case '5': case '6': case '7':
Victor Stinner62ec3312016-09-06 17:04:34 -07006151 ch = c - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00006152 if (s < end && '0' <= *s && *s <= '7') {
Victor Stinner62ec3312016-09-06 17:04:34 -07006153 ch = (ch<<3) + *s++ - '0';
6154 if (s < end && '0' <= *s && *s <= '7') {
6155 ch = (ch<<3) + *s++ - '0';
6156 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006157 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006158 WRITE_CHAR(ch);
6159 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006160
Benjamin Peterson29060642009-01-31 22:14:21 +00006161 /* hex escapes */
6162 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006163 case 'x':
Victor Stinner62ec3312016-09-06 17:04:34 -07006164 count = 2;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006165 message = "truncated \\xXX escape";
6166 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006167
Benjamin Peterson29060642009-01-31 22:14:21 +00006168 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006169 case 'u':
Victor Stinner62ec3312016-09-06 17:04:34 -07006170 count = 4;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006171 message = "truncated \\uXXXX escape";
6172 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006173
Benjamin Peterson29060642009-01-31 22:14:21 +00006174 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00006175 case 'U':
Victor Stinner62ec3312016-09-06 17:04:34 -07006176 count = 8;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006177 message = "truncated \\UXXXXXXXX escape";
6178 hexescape:
Victor Stinner62ec3312016-09-06 17:04:34 -07006179 for (ch = 0; count && s < end; ++s, --count) {
Serhiy Storchakad6793772013-01-29 10:20:44 +02006180 c = (unsigned char)*s;
Victor Stinner62ec3312016-09-06 17:04:34 -07006181 ch <<= 4;
6182 if (c >= '0' && c <= '9') {
6183 ch += c - '0';
6184 }
6185 else if (c >= 'a' && c <= 'f') {
6186 ch += c - ('a' - 10);
6187 }
6188 else if (c >= 'A' && c <= 'F') {
6189 ch += c - ('A' - 10);
6190 }
6191 else {
6192 break;
6193 }
Fredrik Lundhdf846752000-09-03 11:29:49 +00006194 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006195 if (count) {
Serhiy Storchakad6793772013-01-29 10:20:44 +02006196 goto error;
Victor Stinner62ec3312016-09-06 17:04:34 -07006197 }
6198
6199 /* when we get here, ch is a 32-bit unicode character */
6200 if (ch > MAX_UNICODE) {
6201 message = "illegal Unicode character";
6202 goto error;
6203 }
6204
6205 WRITE_CHAR(ch);
6206 continue;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006207
Benjamin Peterson29060642009-01-31 22:14:21 +00006208 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00006209 case 'N':
Fredrik Lundhccc74732001-02-18 22:13:49 +00006210 if (ucnhash_CAPI == NULL) {
6211 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006212 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
6213 PyUnicodeData_CAPSULE_NAME, 1);
Victor Stinner62ec3312016-09-06 17:04:34 -07006214 if (ucnhash_CAPI == NULL) {
6215 PyErr_SetString(
6216 PyExc_UnicodeError,
6217 "\\N escapes not supported (can't load unicodedata module)"
6218 );
6219 goto onError;
6220 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00006221 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006222
6223 message = "malformed \\N character escape";
Gregory P. Smith746b2d32018-11-13 13:16:54 -08006224 if (s < end && *s == '{') {
Victor Stinner62ec3312016-09-06 17:04:34 -07006225 const char *start = ++s;
6226 size_t namelen;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006227 /* look for the closing brace */
Victor Stinner62ec3312016-09-06 17:04:34 -07006228 while (s < end && *s != '}')
Fredrik Lundhccc74732001-02-18 22:13:49 +00006229 s++;
Victor Stinner62ec3312016-09-06 17:04:34 -07006230 namelen = s - start;
6231 if (namelen && s < end) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00006232 /* found a name. look it up in the unicode database */
Fredrik Lundhccc74732001-02-18 22:13:49 +00006233 s++;
Victor Stinner62ec3312016-09-06 17:04:34 -07006234 ch = 0xffffffff; /* in case 'getcode' messes up */
6235 if (namelen <= INT_MAX &&
6236 ucnhash_CAPI->getcode(NULL, start, (int)namelen,
6237 &ch, 0)) {
6238 assert(ch <= MAX_UNICODE);
6239 WRITE_CHAR(ch);
6240 continue;
6241 }
6242 message = "unknown Unicode character name";
Fredrik Lundhccc74732001-02-18 22:13:49 +00006243 }
6244 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02006245 goto error;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006246
6247 default:
Eric V. Smith42454af2016-10-31 09:22:08 -04006248 if (*first_invalid_escape == NULL) {
6249 *first_invalid_escape = s-1; /* Back up one char, since we've
6250 already incremented s. */
6251 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006252 WRITE_ASCII_CHAR('\\');
6253 WRITE_CHAR(c);
6254 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006255 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02006256
6257 error:
6258 endinpos = s-starts;
Victor Stinner62ec3312016-09-06 17:04:34 -07006259 writer.min_length = end - s + writer.pos;
Serhiy Storchaka8fe5a9f2013-01-29 10:37:39 +02006260 if (unicode_decode_call_errorhandler_writer(
Serhiy Storchakad6793772013-01-29 10:20:44 +02006261 errors, &errorHandler,
6262 "unicodeescape", message,
6263 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinner62ec3312016-09-06 17:04:34 -07006264 &writer)) {
Serhiy Storchakad6793772013-01-29 10:20:44 +02006265 goto onError;
Victor Stinner62ec3312016-09-06 17:04:34 -07006266 }
Serhiy Storchakab7e2d672018-02-13 08:27:33 +02006267 assert(end - s <= writer.size - writer.pos);
Victor Stinner62ec3312016-09-06 17:04:34 -07006268
6269#undef WRITE_ASCII_CHAR
6270#undef WRITE_CHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00006271 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006272
Walter Dörwaldd4ade082003-08-15 15:00:26 +00006273 Py_XDECREF(errorHandler);
6274 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006275 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald8c077222002-03-25 11:16:18 +00006276
Benjamin Peterson29060642009-01-31 22:14:21 +00006277 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006278 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006279 Py_XDECREF(errorHandler);
6280 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006281 return NULL;
6282}
6283
Eric V. Smith42454af2016-10-31 09:22:08 -04006284PyObject *
6285PyUnicode_DecodeUnicodeEscape(const char *s,
6286 Py_ssize_t size,
6287 const char *errors)
6288{
6289 const char *first_invalid_escape;
6290 PyObject *result = _PyUnicode_DecodeUnicodeEscape(s, size, errors,
6291 &first_invalid_escape);
6292 if (result == NULL)
6293 return NULL;
6294 if (first_invalid_escape != NULL) {
6295 if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
6296 "invalid escape sequence '\\%c'",
Serhiy Storchaka56cb4652017-10-20 17:08:15 +03006297 (unsigned char)*first_invalid_escape) < 0) {
Eric V. Smith42454af2016-10-31 09:22:08 -04006298 Py_DECREF(result);
6299 return NULL;
6300 }
6301 }
6302 return result;
6303}
6304
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006305/* Return a Unicode-Escape string version of the Unicode object. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006306
Alexander Belopolsky40018472011-02-26 01:02:56 +00006307PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006308PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006309{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006310 Py_ssize_t i, len;
Victor Stinner62ec3312016-09-06 17:04:34 -07006311 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006312 char *p;
Victor Stinner62ec3312016-09-06 17:04:34 -07006313 enum PyUnicode_Kind kind;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006314 void *data;
Victor Stinner62ec3312016-09-06 17:04:34 -07006315 Py_ssize_t expandsize;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006316
Ezio Melottie7f90372012-10-05 03:33:31 +03006317 /* Initial allocation is based on the longest-possible character
Thomas Wouters89f507f2006-12-13 04:49:30 +00006318 escape.
6319
Ezio Melottie7f90372012-10-05 03:33:31 +03006320 For UCS1 strings it's '\xxx', 4 bytes per source character.
6321 For UCS2 strings it's '\uxxxx', 6 bytes per source character.
6322 For UCS4 strings it's '\U00xxxxxx', 10 bytes per source character.
Thomas Wouters89f507f2006-12-13 04:49:30 +00006323 */
6324
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006325 if (!PyUnicode_Check(unicode)) {
6326 PyErr_BadArgument();
6327 return NULL;
6328 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006329 if (PyUnicode_READY(unicode) == -1) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006330 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006331 }
Victor Stinner358af132015-10-12 22:36:57 +02006332
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006333 len = PyUnicode_GET_LENGTH(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006334 if (len == 0) {
6335 return PyBytes_FromStringAndSize(NULL, 0);
6336 }
6337
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006338 kind = PyUnicode_KIND(unicode);
6339 data = PyUnicode_DATA(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006340 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6341 bytes, and 1 byte characters 4. */
6342 expandsize = kind * 2 + 2;
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006343 if (len > PY_SSIZE_T_MAX / expandsize) {
Victor Stinner62ec3312016-09-06 17:04:34 -07006344 return PyErr_NoMemory();
6345 }
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006346 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
Victor Stinner62ec3312016-09-06 17:04:34 -07006347 if (repr == NULL) {
6348 return NULL;
6349 }
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006350
Victor Stinner62ec3312016-09-06 17:04:34 -07006351 p = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006352 for (i = 0; i < len; i++) {
Victor Stinner3326cb62011-11-10 20:15:25 +01006353 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006354
Victor Stinner62ec3312016-09-06 17:04:34 -07006355 /* U+0000-U+00ff range */
6356 if (ch < 0x100) {
6357 if (ch >= ' ' && ch < 127) {
6358 if (ch != '\\') {
6359 /* Copy printable US ASCII as-is */
6360 *p++ = (char) ch;
6361 }
6362 /* Escape backslashes */
6363 else {
6364 *p++ = '\\';
6365 *p++ = '\\';
6366 }
6367 }
Victor Stinner358af132015-10-12 22:36:57 +02006368
Victor Stinner62ec3312016-09-06 17:04:34 -07006369 /* Map special whitespace to '\t', \n', '\r' */
6370 else if (ch == '\t') {
6371 *p++ = '\\';
6372 *p++ = 't';
6373 }
6374 else if (ch == '\n') {
6375 *p++ = '\\';
6376 *p++ = 'n';
6377 }
6378 else if (ch == '\r') {
6379 *p++ = '\\';
6380 *p++ = 'r';
6381 }
6382
6383 /* Map non-printable US ASCII and 8-bit characters to '\xHH' */
6384 else {
6385 *p++ = '\\';
6386 *p++ = 'x';
6387 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6388 *p++ = Py_hexdigits[ch & 0x000F];
6389 }
Tim Petersced69f82003-09-16 20:30:58 +00006390 }
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006391 /* U+0100-U+ffff range: Map 16-bit characters to '\uHHHH' */
Victor Stinner62ec3312016-09-06 17:04:34 -07006392 else if (ch < 0x10000) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006393 *p++ = '\\';
6394 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006395 *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
6396 *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
6397 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6398 *p++ = Py_hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006399 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006400 /* U+010000-U+10ffff range: Map 21-bit characters to '\U00HHHHHH' */
6401 else {
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006402
Victor Stinner62ec3312016-09-06 17:04:34 -07006403 /* Make sure that the first two digits are zero */
6404 assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006405 *p++ = '\\';
Victor Stinner62ec3312016-09-06 17:04:34 -07006406 *p++ = 'U';
6407 *p++ = '0';
6408 *p++ = '0';
6409 *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
6410 *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
6411 *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
6412 *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
6413 *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
6414 *p++ = Py_hexdigits[ch & 0x0000000F];
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006415 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006416 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006417
Victor Stinner62ec3312016-09-06 17:04:34 -07006418 assert(p - PyBytes_AS_STRING(repr) > 0);
6419 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) {
6420 return NULL;
6421 }
6422 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006423}
6424
Alexander Belopolsky40018472011-02-26 01:02:56 +00006425PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006426PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
6427 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006428{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006429 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02006430 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Victor Stinner62ec3312016-09-06 17:04:34 -07006431 if (tmp == NULL) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006432 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006433 }
6434
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006435 result = PyUnicode_AsUnicodeEscapeString(tmp);
6436 Py_DECREF(tmp);
6437 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006438}
6439
6440/* --- Raw Unicode Escape Codec ------------------------------------------- */
6441
Alexander Belopolsky40018472011-02-26 01:02:56 +00006442PyObject *
6443PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006444 Py_ssize_t size,
6445 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006446{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006447 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006448 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006449 const char *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006450 PyObject *errorHandler = NULL;
6451 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006452
Victor Stinner62ec3312016-09-06 17:04:34 -07006453 if (size == 0) {
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02006454 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner62ec3312016-09-06 17:04:34 -07006455 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006456
Guido van Rossumd57fd912000-03-10 22:53:23 +00006457 /* Escaped strings will always be longer than the resulting
6458 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006459 length after conversion to the true value. (But decoding error
6460 handler might have to resize the string) */
Victor Stinner8f674cc2013-04-17 23:02:17 +02006461 _PyUnicodeWriter_Init(&writer);
Inada Naoki770847a2019-06-24 12:30:24 +09006462 writer.min_length = size;
Victor Stinner62ec3312016-09-06 17:04:34 -07006463 if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
6464 goto onError;
6465 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006466
Guido van Rossumd57fd912000-03-10 22:53:23 +00006467 end = s + size;
6468 while (s < end) {
Victor Stinner62ec3312016-09-06 17:04:34 -07006469 unsigned char c = (unsigned char) *s++;
6470 Py_UCS4 ch;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006471 int count;
Victor Stinner62ec3312016-09-06 17:04:34 -07006472 Py_ssize_t startinpos;
6473 Py_ssize_t endinpos;
6474 const char *message;
6475
6476#define WRITE_CHAR(ch) \
6477 do { \
6478 if (ch <= writer.maxchar) { \
6479 assert(writer.pos < writer.size); \
6480 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6481 } \
6482 else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
6483 goto onError; \
6484 } \
6485 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006486
Benjamin Peterson29060642009-01-31 22:14:21 +00006487 /* Non-escape characters are interpreted as Unicode ordinals */
Victor Stinner62ec3312016-09-06 17:04:34 -07006488 if (c != '\\' || s >= end) {
6489 WRITE_CHAR(c);
Benjamin Peterson29060642009-01-31 22:14:21 +00006490 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006491 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006492
Victor Stinner62ec3312016-09-06 17:04:34 -07006493 c = (unsigned char) *s++;
6494 if (c == 'u') {
6495 count = 4;
6496 message = "truncated \\uXXXX escape";
Benjamin Peterson29060642009-01-31 22:14:21 +00006497 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006498 else if (c == 'U') {
6499 count = 8;
6500 message = "truncated \\UXXXXXXXX escape";
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006501 }
6502 else {
Victor Stinner62ec3312016-09-06 17:04:34 -07006503 assert(writer.pos < writer.size);
6504 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, '\\');
6505 WRITE_CHAR(c);
6506 continue;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006507 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006508 startinpos = s - starts - 2;
6509
6510 /* \uHHHH with 4 hex digits, \U00HHHHHH with 8 */
6511 for (ch = 0; count && s < end; ++s, --count) {
6512 c = (unsigned char)*s;
6513 ch <<= 4;
6514 if (c >= '0' && c <= '9') {
6515 ch += c - '0';
6516 }
6517 else if (c >= 'a' && c <= 'f') {
6518 ch += c - ('a' - 10);
6519 }
6520 else if (c >= 'A' && c <= 'F') {
6521 ch += c - ('A' - 10);
6522 }
6523 else {
6524 break;
6525 }
6526 }
6527 if (!count) {
6528 if (ch <= MAX_UNICODE) {
6529 WRITE_CHAR(ch);
6530 continue;
6531 }
6532 message = "\\Uxxxxxxxx out of range";
6533 }
6534
6535 endinpos = s-starts;
6536 writer.min_length = end - s + writer.pos;
6537 if (unicode_decode_call_errorhandler_writer(
6538 errors, &errorHandler,
6539 "rawunicodeescape", message,
6540 &starts, &end, &startinpos, &endinpos, &exc, &s,
6541 &writer)) {
6542 goto onError;
6543 }
Serhiy Storchakab7e2d672018-02-13 08:27:33 +02006544 assert(end - s <= writer.size - writer.pos);
Victor Stinner62ec3312016-09-06 17:04:34 -07006545
6546#undef WRITE_CHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00006547 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006548 Py_XDECREF(errorHandler);
6549 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006550 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00006551
Benjamin Peterson29060642009-01-31 22:14:21 +00006552 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006553 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006554 Py_XDECREF(errorHandler);
6555 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006556 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006557
Guido van Rossumd57fd912000-03-10 22:53:23 +00006558}
6559
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006560
Alexander Belopolsky40018472011-02-26 01:02:56 +00006561PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006562PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006563{
Victor Stinner62ec3312016-09-06 17:04:34 -07006564 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006565 char *p;
Victor Stinner62ec3312016-09-06 17:04:34 -07006566 Py_ssize_t expandsize, pos;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006567 int kind;
6568 void *data;
6569 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006570
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006571 if (!PyUnicode_Check(unicode)) {
6572 PyErr_BadArgument();
6573 return NULL;
6574 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006575 if (PyUnicode_READY(unicode) == -1) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006576 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006577 }
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006578 kind = PyUnicode_KIND(unicode);
6579 data = PyUnicode_DATA(unicode);
6580 len = PyUnicode_GET_LENGTH(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006581 if (kind == PyUnicode_1BYTE_KIND) {
6582 return PyBytes_FromStringAndSize(data, len);
6583 }
Victor Stinner0e368262011-11-10 20:12:49 +01006584
Victor Stinner62ec3312016-09-06 17:04:34 -07006585 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6586 bytes, and 1 byte characters 4. */
6587 expandsize = kind * 2 + 2;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006588
Victor Stinner62ec3312016-09-06 17:04:34 -07006589 if (len > PY_SSIZE_T_MAX / expandsize) {
6590 return PyErr_NoMemory();
6591 }
6592 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
6593 if (repr == NULL) {
6594 return NULL;
6595 }
6596 if (len == 0) {
6597 return repr;
6598 }
6599
6600 p = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006601 for (pos = 0; pos < len; pos++) {
6602 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Victor Stinner358af132015-10-12 22:36:57 +02006603
Victor Stinner62ec3312016-09-06 17:04:34 -07006604 /* U+0000-U+00ff range: Copy 8-bit characters as-is */
6605 if (ch < 0x100) {
6606 *p++ = (char) ch;
Tim Petersced69f82003-09-16 20:30:58 +00006607 }
Xiang Zhang2b77a922018-02-13 18:33:32 +08006608 /* U+0100-U+ffff range: Map 16-bit characters to '\uHHHH' */
Victor Stinner62ec3312016-09-06 17:04:34 -07006609 else if (ch < 0x10000) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006610 *p++ = '\\';
6611 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006612 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6613 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6614 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6615 *p++ = Py_hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006616 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006617 /* U+010000-U+10ffff range: Map 32-bit characters to '\U00HHHHHH' */
6618 else {
6619 assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
6620 *p++ = '\\';
6621 *p++ = 'U';
6622 *p++ = '0';
6623 *p++ = '0';
6624 *p++ = Py_hexdigits[(ch >> 20) & 0xf];
6625 *p++ = Py_hexdigits[(ch >> 16) & 0xf];
6626 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6627 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6628 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6629 *p++ = Py_hexdigits[ch & 15];
6630 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006631 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006632
Victor Stinner62ec3312016-09-06 17:04:34 -07006633 assert(p > PyBytes_AS_STRING(repr));
6634 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) {
6635 return NULL;
6636 }
6637 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006638}
6639
Alexander Belopolsky40018472011-02-26 01:02:56 +00006640PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006641PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
6642 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006643{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006644 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02006645 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006646 if (tmp == NULL)
Walter Dörwald711005d2007-05-12 12:03:26 +00006647 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006648 result = PyUnicode_AsRawUnicodeEscapeString(tmp);
6649 Py_DECREF(tmp);
6650 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006651}
6652
6653/* --- Latin-1 Codec ------------------------------------------------------ */
6654
Alexander Belopolsky40018472011-02-26 01:02:56 +00006655PyObject *
6656PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006657 Py_ssize_t size,
6658 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006659{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006660 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Victor Stinnere57b1c02011-09-28 22:20:48 +02006661 return _PyUnicode_FromUCS1((unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006662}
6663
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006664/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006665static void
6666make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006667 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006668 PyObject *unicode,
6669 Py_ssize_t startpos, Py_ssize_t endpos,
6670 const char *reason)
6671{
6672 if (*exceptionObject == NULL) {
6673 *exceptionObject = PyObject_CallFunction(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006674 PyExc_UnicodeEncodeError, "sOnns",
Martin v. Löwis9e816682011-11-02 12:45:42 +01006675 encoding, unicode, startpos, endpos, reason);
6676 }
6677 else {
6678 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6679 goto onError;
6680 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6681 goto onError;
6682 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6683 goto onError;
6684 return;
6685 onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02006686 Py_CLEAR(*exceptionObject);
Martin v. Löwis9e816682011-11-02 12:45:42 +01006687 }
6688}
6689
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006690/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006691static void
6692raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006693 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006694 PyObject *unicode,
6695 Py_ssize_t startpos, Py_ssize_t endpos,
6696 const char *reason)
6697{
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006698 make_encode_exception(exceptionObject,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006699 encoding, unicode, startpos, endpos, reason);
6700 if (*exceptionObject != NULL)
6701 PyCodec_StrictErrors(*exceptionObject);
6702}
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006703
6704/* error handling callback helper:
6705 build arguments, call the callback and check the arguments,
6706 put the result into newpos and return the replacement string, which
6707 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006708static PyObject *
6709unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006710 PyObject **errorHandler,
6711 const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006712 PyObject *unicode, PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006713 Py_ssize_t startpos, Py_ssize_t endpos,
6714 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006715{
Serhiy Storchaka2d06e842015-12-25 19:53:18 +02006716 static const char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006717 Py_ssize_t len;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006718 PyObject *restuple;
6719 PyObject *resunicode;
6720
6721 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006722 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006723 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006724 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006725 }
6726
Benjamin Petersonbac79492012-01-14 13:34:47 -05006727 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006728 return NULL;
6729 len = PyUnicode_GET_LENGTH(unicode);
6730
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006731 make_encode_exception(exceptionObject,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006732 encoding, unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006733 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006734 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006735
Victor Stinnerde4ae3d2016-12-04 22:59:09 +01006736 restuple = PyObject_CallFunctionObjArgs(
6737 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006738 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006739 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006740 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006741 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00006742 Py_DECREF(restuple);
6743 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006744 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006745 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00006746 &resunicode, newpos)) {
6747 Py_DECREF(restuple);
6748 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006749 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006750 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6751 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6752 Py_DECREF(restuple);
6753 return NULL;
6754 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006755 if (*newpos<0)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006756 *newpos = len + *newpos;
6757 if (*newpos<0 || *newpos>len) {
Victor Stinnera33bce02014-07-04 22:47:46 +02006758 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00006759 Py_DECREF(restuple);
6760 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006761 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006762 Py_INCREF(resunicode);
6763 Py_DECREF(restuple);
6764 return resunicode;
6765}
6766
Alexander Belopolsky40018472011-02-26 01:02:56 +00006767static PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006768unicode_encode_ucs1(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006769 const char *errors,
Victor Stinner0030cd52015-09-24 14:45:00 +02006770 const Py_UCS4 limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006771{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006772 /* input state */
6773 Py_ssize_t pos=0, size;
6774 int kind;
6775 void *data;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006776 /* pointer into the output */
6777 char *str;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006778 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6779 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Victor Stinner50149202015-09-22 00:26:54 +02006780 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006781 PyObject *exc = NULL;
Victor Stinner50149202015-09-22 00:26:54 +02006782 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Victor Stinner6bd525b2015-10-09 13:10:05 +02006783 PyObject *rep = NULL;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006784 /* output object */
6785 _PyBytesWriter writer;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006786
Benjamin Petersonbac79492012-01-14 13:34:47 -05006787 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006788 return NULL;
6789 size = PyUnicode_GET_LENGTH(unicode);
6790 kind = PyUnicode_KIND(unicode);
6791 data = PyUnicode_DATA(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006792 /* allocate enough for a simple encoding without
6793 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00006794 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00006795 return PyBytes_FromStringAndSize(NULL, 0);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006796
6797 _PyBytesWriter_Init(&writer);
6798 str = _PyBytesWriter_Alloc(&writer, size);
6799 if (str == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00006800 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006801
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006802 while (pos < size) {
Victor Stinner0030cd52015-09-24 14:45:00 +02006803 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006804
Benjamin Peterson29060642009-01-31 22:14:21 +00006805 /* can we encode this? */
Victor Stinner0030cd52015-09-24 14:45:00 +02006806 if (ch < limit) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006807 /* no overflow check, because we know that the space is enough */
Victor Stinner0030cd52015-09-24 14:45:00 +02006808 *str++ = (char)ch;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006809 ++pos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006810 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006811 else {
Victor Stinner6bd525b2015-10-09 13:10:05 +02006812 Py_ssize_t newpos, i;
Benjamin Peterson29060642009-01-31 22:14:21 +00006813 /* startpos for collecting unencodable chars */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006814 Py_ssize_t collstart = pos;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006815 Py_ssize_t collend = collstart + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006816 /* find all unecodable characters */
Victor Stinner50149202015-09-22 00:26:54 +02006817
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006818 while ((collend < size) && (PyUnicode_READ(kind, data, collend) >= limit))
Benjamin Peterson29060642009-01-31 22:14:21 +00006819 ++collend;
Victor Stinner50149202015-09-22 00:26:54 +02006820
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006821 /* Only overallocate the buffer if it's not the last write */
6822 writer.overallocate = (collend < size);
6823
Benjamin Peterson29060642009-01-31 22:14:21 +00006824 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
Victor Stinner50149202015-09-22 00:26:54 +02006825 if (error_handler == _Py_ERROR_UNKNOWN)
Victor Stinner3d4226a2018-08-29 22:21:32 +02006826 error_handler = _Py_GetErrorHandler(errors);
Victor Stinner50149202015-09-22 00:26:54 +02006827
6828 switch (error_handler) {
6829 case _Py_ERROR_STRICT:
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006830 raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006831 goto onError;
Victor Stinner50149202015-09-22 00:26:54 +02006832
6833 case _Py_ERROR_REPLACE:
Victor Stinner01ada392015-10-01 21:54:51 +02006834 memset(str, '?', collend - collstart);
6835 str += (collend - collstart);
Stefan Krahf432a322017-08-21 13:09:59 +02006836 /* fall through */
Victor Stinner50149202015-09-22 00:26:54 +02006837 case _Py_ERROR_IGNORE:
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006838 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006839 break;
Victor Stinner50149202015-09-22 00:26:54 +02006840
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006841 case _Py_ERROR_BACKSLASHREPLACE:
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07006842 /* subtract preallocated bytes */
Victor Stinnerad771582015-10-09 12:38:53 +02006843 writer.min_size -= (collend - collstart);
6844 str = backslashreplace(&writer, str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006845 unicode, collstart, collend);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006846 if (str == NULL)
6847 goto onError;
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006848 pos = collend;
6849 break;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006850
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006851 case _Py_ERROR_XMLCHARREFREPLACE:
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07006852 /* subtract preallocated bytes */
Victor Stinnerad771582015-10-09 12:38:53 +02006853 writer.min_size -= (collend - collstart);
6854 str = xmlcharrefreplace(&writer, str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006855 unicode, collstart, collend);
6856 if (str == NULL)
6857 goto onError;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006858 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006859 break;
Victor Stinner50149202015-09-22 00:26:54 +02006860
Victor Stinnerc3713e92015-09-29 12:32:13 +02006861 case _Py_ERROR_SURROGATEESCAPE:
6862 for (i = collstart; i < collend; ++i) {
6863 ch = PyUnicode_READ(kind, data, i);
6864 if (ch < 0xdc80 || 0xdcff < ch) {
6865 /* Not a UTF-8b surrogate */
6866 break;
6867 }
6868 *str++ = (char)(ch - 0xdc00);
6869 ++pos;
6870 }
6871 if (i >= collend)
6872 break;
6873 collstart = pos;
6874 assert(collstart != collend);
Stefan Krahf432a322017-08-21 13:09:59 +02006875 /* fall through */
Victor Stinnerc3713e92015-09-29 12:32:13 +02006876
Benjamin Peterson29060642009-01-31 22:14:21 +00006877 default:
Victor Stinner6bd525b2015-10-09 13:10:05 +02006878 rep = unicode_encode_call_errorhandler(errors, &error_handler_obj,
6879 encoding, reason, unicode, &exc,
6880 collstart, collend, &newpos);
6881 if (rep == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006882 goto onError;
Victor Stinner0030cd52015-09-24 14:45:00 +02006883
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07006884 /* subtract preallocated bytes */
Xiang Zhangd04d8472016-11-23 19:34:01 +08006885 writer.min_size -= newpos - collstart;
Victor Stinnerad771582015-10-09 12:38:53 +02006886
Victor Stinner6bd525b2015-10-09 13:10:05 +02006887 if (PyBytes_Check(rep)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00006888 /* Directly copy bytes result to output. */
Victor Stinnerce179bf2015-10-09 12:57:22 +02006889 str = _PyBytesWriter_WriteBytes(&writer, str,
Victor Stinner6bd525b2015-10-09 13:10:05 +02006890 PyBytes_AS_STRING(rep),
6891 PyBytes_GET_SIZE(rep));
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006892 }
Victor Stinner6bd525b2015-10-09 13:10:05 +02006893 else {
6894 assert(PyUnicode_Check(rep));
Victor Stinner0030cd52015-09-24 14:45:00 +02006895
Victor Stinner6bd525b2015-10-09 13:10:05 +02006896 if (PyUnicode_READY(rep) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006897 goto onError;
Victor Stinner6bd525b2015-10-09 13:10:05 +02006898
Serhiy Storchaka99250d52016-11-23 15:13:00 +02006899 if (limit == 256 ?
6900 PyUnicode_KIND(rep) != PyUnicode_1BYTE_KIND :
6901 !PyUnicode_IS_ASCII(rep))
6902 {
6903 /* Not all characters are smaller than limit */
6904 raise_encode_exception(&exc, encoding, unicode,
6905 collstart, collend, reason);
6906 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00006907 }
Serhiy Storchaka99250d52016-11-23 15:13:00 +02006908 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
6909 str = _PyBytesWriter_WriteBytes(&writer, str,
6910 PyUnicode_DATA(rep),
6911 PyUnicode_GET_LENGTH(rep));
Benjamin Peterson29060642009-01-31 22:14:21 +00006912 }
Alexey Izbyshev74a307d2018-08-19 21:52:04 +03006913 if (str == NULL)
6914 goto onError;
6915
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006916 pos = newpos;
Victor Stinner6bd525b2015-10-09 13:10:05 +02006917 Py_CLEAR(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006918 }
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006919
6920 /* If overallocation was disabled, ensure that it was the last
6921 write. Otherwise, we missed an optimization */
6922 assert(writer.overallocate || pos == size);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006923 }
6924 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006925
Victor Stinner50149202015-09-22 00:26:54 +02006926 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006927 Py_XDECREF(exc);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006928 return _PyBytesWriter_Finish(&writer, str);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006929
6930 onError:
Victor Stinner6bd525b2015-10-09 13:10:05 +02006931 Py_XDECREF(rep);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006932 _PyBytesWriter_Dealloc(&writer);
Victor Stinner50149202015-09-22 00:26:54 +02006933 Py_XDECREF(error_handler_obj);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006934 Py_XDECREF(exc);
6935 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006936}
6937
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006938/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006939PyObject *
6940PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006941 Py_ssize_t size,
6942 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006943{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006944 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02006945 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006946 if (unicode == NULL)
6947 return NULL;
6948 result = unicode_encode_ucs1(unicode, errors, 256);
6949 Py_DECREF(unicode);
6950 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006951}
6952
Alexander Belopolsky40018472011-02-26 01:02:56 +00006953PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006954_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006955{
6956 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006957 PyErr_BadArgument();
6958 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006959 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006960 if (PyUnicode_READY(unicode) == -1)
6961 return NULL;
6962 /* Fast path: if it is a one-byte string, construct
6963 bytes object directly. */
6964 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6965 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6966 PyUnicode_GET_LENGTH(unicode));
6967 /* Non-Latin-1 characters present. Defer to above function to
6968 raise the exception. */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006969 return unicode_encode_ucs1(unicode, errors, 256);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006970}
6971
6972PyObject*
6973PyUnicode_AsLatin1String(PyObject *unicode)
6974{
6975 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006976}
6977
6978/* --- 7-bit ASCII Codec -------------------------------------------------- */
6979
Alexander Belopolsky40018472011-02-26 01:02:56 +00006980PyObject *
6981PyUnicode_DecodeASCII(const char *s,
6982 Py_ssize_t size,
6983 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006984{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006985 const char *starts = s;
Inada Naoki770847a2019-06-24 12:30:24 +09006986 const char *e = s + size;
Victor Stinnerf96418d2015-09-21 23:06:27 +02006987 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006988 PyObject *exc = NULL;
Victor Stinnerf96418d2015-09-21 23:06:27 +02006989 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Tim Petersced69f82003-09-16 20:30:58 +00006990
Guido van Rossumd57fd912000-03-10 22:53:23 +00006991 if (size == 0)
Serhiy Storchaka678db842013-01-26 12:16:36 +02006992 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006993
Guido van Rossumd57fd912000-03-10 22:53:23 +00006994 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Victor Stinner702c7342011-10-05 13:50:52 +02006995 if (size == 1 && (unsigned char)s[0] < 128)
6996 return get_latin1_char((unsigned char)s[0]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006997
Inada Naoki770847a2019-06-24 12:30:24 +09006998 // Shortcut for simple case
6999 PyObject *u = PyUnicode_New(size, 127);
7000 if (u == NULL) {
Victor Stinner8f674cc2013-04-17 23:02:17 +02007001 return NULL;
Inada Naoki770847a2019-06-24 12:30:24 +09007002 }
7003 Py_ssize_t outpos = ascii_decode(s, e, PyUnicode_DATA(u));
7004 if (outpos == size) {
7005 return u;
7006 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02007007
Inada Naoki770847a2019-06-24 12:30:24 +09007008 _PyUnicodeWriter writer;
7009 _PyUnicodeWriter_InitWithBuffer(&writer, u);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007010 writer.pos = outpos;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02007011
Inada Naoki770847a2019-06-24 12:30:24 +09007012 s += outpos;
7013 int kind = writer.kind;
7014 void *data = writer.data;
7015 Py_ssize_t startinpos, endinpos;
7016
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007017 while (s < e) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02007018 unsigned char c = (unsigned char)*s;
Benjamin Peterson29060642009-01-31 22:14:21 +00007019 if (c < 128) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007020 PyUnicode_WRITE(kind, data, writer.pos, c);
7021 writer.pos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00007022 ++s;
Victor Stinnerf96418d2015-09-21 23:06:27 +02007023 continue;
Benjamin Peterson29060642009-01-31 22:14:21 +00007024 }
Victor Stinnerf96418d2015-09-21 23:06:27 +02007025
7026 /* byte outsize range 0x00..0x7f: call the error handler */
7027
7028 if (error_handler == _Py_ERROR_UNKNOWN)
Victor Stinner3d4226a2018-08-29 22:21:32 +02007029 error_handler = _Py_GetErrorHandler(errors);
Victor Stinnerf96418d2015-09-21 23:06:27 +02007030
7031 switch (error_handler)
7032 {
7033 case _Py_ERROR_REPLACE:
7034 case _Py_ERROR_SURROGATEESCAPE:
7035 /* Fast-path: the error handler only writes one character,
Victor Stinnerca9381e2015-09-22 00:58:32 +02007036 but we may switch to UCS2 at the first write */
7037 if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
7038 goto onError;
7039 kind = writer.kind;
7040 data = writer.data;
Victor Stinnerf96418d2015-09-21 23:06:27 +02007041
7042 if (error_handler == _Py_ERROR_REPLACE)
7043 PyUnicode_WRITE(kind, data, writer.pos, 0xfffd);
7044 else
7045 PyUnicode_WRITE(kind, data, writer.pos, c + 0xdc00);
7046 writer.pos++;
7047 ++s;
7048 break;
7049
7050 case _Py_ERROR_IGNORE:
7051 ++s;
7052 break;
7053
7054 default:
Benjamin Peterson29060642009-01-31 22:14:21 +00007055 startinpos = s-starts;
7056 endinpos = startinpos + 1;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007057 if (unicode_decode_call_errorhandler_writer(
Victor Stinnerf96418d2015-09-21 23:06:27 +02007058 errors, &error_handler_obj,
Benjamin Peterson29060642009-01-31 22:14:21 +00007059 "ascii", "ordinal not in range(128)",
7060 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007061 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00007062 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007063 kind = writer.kind;
7064 data = writer.data;
Benjamin Peterson29060642009-01-31 22:14:21 +00007065 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007066 }
Victor Stinnerf96418d2015-09-21 23:06:27 +02007067 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007068 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007069 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00007070
Benjamin Peterson29060642009-01-31 22:14:21 +00007071 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007072 _PyUnicodeWriter_Dealloc(&writer);
Victor Stinnerf96418d2015-09-21 23:06:27 +02007073 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007074 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007075 return NULL;
7076}
7077
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007078/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007079PyObject *
7080PyUnicode_EncodeASCII(const Py_UNICODE *p,
7081 Py_ssize_t size,
7082 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007083{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007084 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02007085 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007086 if (unicode == NULL)
7087 return NULL;
7088 result = unicode_encode_ucs1(unicode, errors, 128);
7089 Py_DECREF(unicode);
7090 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007091}
7092
Alexander Belopolsky40018472011-02-26 01:02:56 +00007093PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007094_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007095{
7096 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007097 PyErr_BadArgument();
7098 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007099 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007100 if (PyUnicode_READY(unicode) == -1)
7101 return NULL;
7102 /* Fast path: if it is an ASCII-only string, construct bytes object
7103 directly. Else defer to above function to raise the exception. */
Victor Stinneraf037572013-04-14 18:44:10 +02007104 if (PyUnicode_IS_ASCII(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007105 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
7106 PyUnicode_GET_LENGTH(unicode));
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007107 return unicode_encode_ucs1(unicode, errors, 128);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007108}
7109
7110PyObject *
7111PyUnicode_AsASCIIString(PyObject *unicode)
7112{
7113 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007114}
7115
Steve Dowercc16be82016-09-08 10:35:16 -07007116#ifdef MS_WINDOWS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007117
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007118/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007119
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00007120#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007121#define NEED_RETRY
7122#endif
7123
Victor Stinner3a50e702011-10-18 21:21:00 +02007124#ifndef WC_ERR_INVALID_CHARS
7125# define WC_ERR_INVALID_CHARS 0x0080
7126#endif
7127
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007128static const char*
Victor Stinner3a50e702011-10-18 21:21:00 +02007129code_page_name(UINT code_page, PyObject **obj)
7130{
7131 *obj = NULL;
7132 if (code_page == CP_ACP)
7133 return "mbcs";
7134 if (code_page == CP_UTF7)
7135 return "CP_UTF7";
7136 if (code_page == CP_UTF8)
7137 return "CP_UTF8";
7138
7139 *obj = PyBytes_FromFormat("cp%u", code_page);
7140 if (*obj == NULL)
7141 return NULL;
7142 return PyBytes_AS_STRING(*obj);
7143}
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007144
Victor Stinner3a50e702011-10-18 21:21:00 +02007145static DWORD
7146decode_code_page_flags(UINT code_page)
7147{
7148 if (code_page == CP_UTF7) {
7149 /* The CP_UTF7 decoder only supports flags=0 */
7150 return 0;
7151 }
7152 else
7153 return MB_ERR_INVALID_CHARS;
7154}
7155
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007156/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007157 * Decode a byte string from a Windows code page into unicode object in strict
7158 * mode.
7159 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007160 * Returns consumed size if succeed, returns -2 on decode error, or raise an
7161 * OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007162 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007163static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007164decode_code_page_strict(UINT code_page,
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007165 wchar_t **buf,
7166 Py_ssize_t *bufsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02007167 const char *in,
7168 int insize)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007169{
Serhiy Storchakac1e2c282019-03-20 21:45:18 +02007170 DWORD flags = MB_ERR_INVALID_CHARS;
Victor Stinner24729f32011-11-10 20:31:37 +01007171 wchar_t *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007172 DWORD outsize;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007173
7174 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007175 assert(insize > 0);
Serhiy Storchakac1e2c282019-03-20 21:45:18 +02007176 while ((outsize = MultiByteToWideChar(code_page, flags,
7177 in, insize, NULL, 0)) <= 0)
7178 {
7179 if (!flags || GetLastError() != ERROR_INVALID_FLAGS) {
7180 goto error;
7181 }
7182 /* For some code pages (e.g. UTF-7) flags must be set to 0. */
7183 flags = 0;
7184 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007185
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007186 /* Extend a wchar_t* buffer */
7187 Py_ssize_t n = *bufsize; /* Get the current length */
7188 if (widechar_resize(buf, bufsize, n + outsize) < 0) {
7189 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007190 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007191 out = *buf + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007192
7193 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007194 outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
7195 if (outsize <= 0)
7196 goto error;
7197 return insize;
Victor Stinner554f3f02010-06-16 23:33:54 +00007198
Victor Stinner3a50e702011-10-18 21:21:00 +02007199error:
7200 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7201 return -2;
7202 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007203 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007204}
7205
Victor Stinner3a50e702011-10-18 21:21:00 +02007206/*
7207 * Decode a byte string from a code page into unicode object with an error
7208 * handler.
7209 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007210 * Returns consumed size if succeed, or raise an OSError or
Victor Stinner3a50e702011-10-18 21:21:00 +02007211 * UnicodeDecodeError exception and returns -1 on error.
7212 */
7213static int
7214decode_code_page_errors(UINT code_page,
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007215 wchar_t **buf,
7216 Py_ssize_t *bufsize,
Victor Stinner76a31a62011-11-04 00:05:13 +01007217 const char *in, const int size,
Victor Stinner7d00cc12014-03-17 23:08:06 +01007218 const char *errors, int final)
Victor Stinner3a50e702011-10-18 21:21:00 +02007219{
7220 const char *startin = in;
7221 const char *endin = in + size;
Serhiy Storchakac1e2c282019-03-20 21:45:18 +02007222 DWORD flags = MB_ERR_INVALID_CHARS;
Victor Stinner3a50e702011-10-18 21:21:00 +02007223 /* Ideally, we should get reason from FormatMessage. This is the Windows
7224 2000 English version of the message. */
7225 const char *reason = "No mapping for the Unicode character exists "
7226 "in the target code page.";
7227 /* each step cannot decode more than 1 character, but a character can be
7228 represented as a surrogate pair */
Serhiy Storchaka4013c172018-12-03 10:36:45 +02007229 wchar_t buffer[2], *out;
Victor Stinner9f067f42013-06-05 00:21:31 +02007230 int insize;
7231 Py_ssize_t outsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007232 PyObject *errorHandler = NULL;
7233 PyObject *exc = NULL;
7234 PyObject *encoding_obj = NULL;
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007235 const char *encoding;
Victor Stinner3a50e702011-10-18 21:21:00 +02007236 DWORD err;
7237 int ret = -1;
7238
7239 assert(size > 0);
7240
7241 encoding = code_page_name(code_page, &encoding_obj);
7242 if (encoding == NULL)
7243 return -1;
7244
Victor Stinner7d00cc12014-03-17 23:08:06 +01007245 if ((errors == NULL || strcmp(errors, "strict") == 0) && final) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007246 /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
7247 UnicodeDecodeError. */
7248 make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
7249 if (exc != NULL) {
7250 PyCodec_StrictErrors(exc);
7251 Py_CLEAR(exc);
7252 }
7253 goto error;
7254 }
7255
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007256 /* Extend a wchar_t* buffer */
7257 Py_ssize_t n = *bufsize; /* Get the current length */
7258 if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7259 PyErr_NoMemory();
7260 goto error;
Victor Stinner3a50e702011-10-18 21:21:00 +02007261 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007262 if (widechar_resize(buf, bufsize, n + size * Py_ARRAY_LENGTH(buffer)) < 0) {
7263 goto error;
Victor Stinner3a50e702011-10-18 21:21:00 +02007264 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007265 out = *buf + n;
Victor Stinner3a50e702011-10-18 21:21:00 +02007266
7267 /* Decode the byte string character per character */
Victor Stinner3a50e702011-10-18 21:21:00 +02007268 while (in < endin)
7269 {
7270 /* Decode a character */
7271 insize = 1;
7272 do
7273 {
7274 outsize = MultiByteToWideChar(code_page, flags,
7275 in, insize,
7276 buffer, Py_ARRAY_LENGTH(buffer));
7277 if (outsize > 0)
7278 break;
7279 err = GetLastError();
Serhiy Storchakac1e2c282019-03-20 21:45:18 +02007280 if (err == ERROR_INVALID_FLAGS && flags) {
7281 /* For some code pages (e.g. UTF-7) flags must be set to 0. */
7282 flags = 0;
7283 continue;
7284 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007285 if (err != ERROR_NO_UNICODE_TRANSLATION
7286 && err != ERROR_INSUFFICIENT_BUFFER)
7287 {
7288 PyErr_SetFromWindowsErr(0);
7289 goto error;
7290 }
7291 insize++;
7292 }
7293 /* 4=maximum length of a UTF-8 sequence */
7294 while (insize <= 4 && (in + insize) <= endin);
7295
7296 if (outsize <= 0) {
7297 Py_ssize_t startinpos, endinpos, outpos;
7298
Victor Stinner7d00cc12014-03-17 23:08:06 +01007299 /* last character in partial decode? */
7300 if (in + insize >= endin && !final)
7301 break;
7302
Victor Stinner3a50e702011-10-18 21:21:00 +02007303 startinpos = in - startin;
7304 endinpos = startinpos + 1;
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007305 outpos = out - *buf;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007306 if (unicode_decode_call_errorhandler_wchar(
Victor Stinner3a50e702011-10-18 21:21:00 +02007307 errors, &errorHandler,
7308 encoding, reason,
7309 &startin, &endin, &startinpos, &endinpos, &exc, &in,
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007310 buf, bufsize, &outpos))
Victor Stinner3a50e702011-10-18 21:21:00 +02007311 {
7312 goto error;
7313 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007314 out = *buf + outpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007315 }
7316 else {
7317 in += insize;
7318 memcpy(out, buffer, outsize * sizeof(wchar_t));
7319 out += outsize;
7320 }
7321 }
7322
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007323 /* Shrink the buffer */
7324 assert(out - *buf <= *bufsize);
7325 *bufsize = out - *buf;
Victor Stinnere1f17c62014-07-25 14:03:03 +02007326 /* (in - startin) <= size and size is an int */
7327 ret = Py_SAFE_DOWNCAST(in - startin, Py_ssize_t, int);
Victor Stinner3a50e702011-10-18 21:21:00 +02007328
7329error:
7330 Py_XDECREF(encoding_obj);
7331 Py_XDECREF(errorHandler);
7332 Py_XDECREF(exc);
7333 return ret;
7334}
7335
Victor Stinner3a50e702011-10-18 21:21:00 +02007336static PyObject *
7337decode_code_page_stateful(int code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007338 const char *s, Py_ssize_t size,
7339 const char *errors, Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007340{
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007341 wchar_t *buf = NULL;
7342 Py_ssize_t bufsize = 0;
Victor Stinner76a31a62011-11-04 00:05:13 +01007343 int chunk_size, final, converted, done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007344
Victor Stinner3a50e702011-10-18 21:21:00 +02007345 if (code_page < 0) {
7346 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7347 return NULL;
7348 }
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03007349 if (size < 0) {
7350 PyErr_BadInternalCall();
7351 return NULL;
7352 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007353
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007354 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00007355 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007356
Victor Stinner76a31a62011-11-04 00:05:13 +01007357 do
7358 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007359#ifdef NEED_RETRY
Victor Stinner76a31a62011-11-04 00:05:13 +01007360 if (size > INT_MAX) {
7361 chunk_size = INT_MAX;
7362 final = 0;
7363 done = 0;
7364 }
7365 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007366#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007367 {
7368 chunk_size = (int)size;
7369 final = (consumed == NULL);
7370 done = 1;
7371 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007372
Victor Stinner76a31a62011-11-04 00:05:13 +01007373 if (chunk_size == 0 && done) {
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007374 if (buf != NULL)
Victor Stinner76a31a62011-11-04 00:05:13 +01007375 break;
Serhiy Storchaka678db842013-01-26 12:16:36 +02007376 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner76a31a62011-11-04 00:05:13 +01007377 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007378
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007379 converted = decode_code_page_strict(code_page, &buf, &bufsize,
Victor Stinner76a31a62011-11-04 00:05:13 +01007380 s, chunk_size);
7381 if (converted == -2)
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007382 converted = decode_code_page_errors(code_page, &buf, &bufsize,
Victor Stinner76a31a62011-11-04 00:05:13 +01007383 s, chunk_size,
Victor Stinner7d00cc12014-03-17 23:08:06 +01007384 errors, final);
7385 assert(converted != 0 || done);
Victor Stinner76a31a62011-11-04 00:05:13 +01007386
7387 if (converted < 0) {
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007388 PyMem_Free(buf);
Victor Stinner76a31a62011-11-04 00:05:13 +01007389 return NULL;
7390 }
7391
7392 if (consumed)
7393 *consumed += converted;
7394
7395 s += converted;
7396 size -= converted;
7397 } while (!done);
Victor Stinner3a50e702011-10-18 21:21:00 +02007398
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007399 PyObject *v = PyUnicode_FromWideChar(buf, bufsize);
7400 PyMem_Free(buf);
7401 return v;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007402}
7403
Alexander Belopolsky40018472011-02-26 01:02:56 +00007404PyObject *
Victor Stinner3a50e702011-10-18 21:21:00 +02007405PyUnicode_DecodeCodePageStateful(int code_page,
7406 const char *s,
7407 Py_ssize_t size,
7408 const char *errors,
7409 Py_ssize_t *consumed)
7410{
7411 return decode_code_page_stateful(code_page, s, size, errors, consumed);
7412}
7413
7414PyObject *
7415PyUnicode_DecodeMBCSStateful(const char *s,
7416 Py_ssize_t size,
7417 const char *errors,
7418 Py_ssize_t *consumed)
7419{
7420 return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
7421}
7422
7423PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00007424PyUnicode_DecodeMBCS(const char *s,
7425 Py_ssize_t size,
7426 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007427{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007428 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
7429}
7430
Victor Stinner3a50e702011-10-18 21:21:00 +02007431static DWORD
7432encode_code_page_flags(UINT code_page, const char *errors)
7433{
7434 if (code_page == CP_UTF8) {
Steve Dower3e96f322015-03-02 08:01:10 -08007435 return WC_ERR_INVALID_CHARS;
Victor Stinner3a50e702011-10-18 21:21:00 +02007436 }
7437 else if (code_page == CP_UTF7) {
7438 /* CP_UTF7 only supports flags=0 */
7439 return 0;
7440 }
7441 else {
7442 if (errors != NULL && strcmp(errors, "replace") == 0)
7443 return 0;
7444 else
7445 return WC_NO_BEST_FIT_CHARS;
7446 }
7447}
7448
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007449/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007450 * Encode a Unicode string to a Windows code page into a byte string in strict
7451 * mode.
7452 *
7453 * Returns consumed characters if succeed, returns -2 on encode error, or raise
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007454 * an OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007455 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007456static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007457encode_code_page_strict(UINT code_page, PyObject **outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007458 PyObject *unicode, Py_ssize_t offset, int len,
Victor Stinner3a50e702011-10-18 21:21:00 +02007459 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007460{
Victor Stinner554f3f02010-06-16 23:33:54 +00007461 BOOL usedDefaultChar = FALSE;
Victor Stinner3a50e702011-10-18 21:21:00 +02007462 BOOL *pusedDefaultChar = &usedDefaultChar;
7463 int outsize;
Victor Stinner24729f32011-11-10 20:31:37 +01007464 wchar_t *p;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007465 Py_ssize_t size;
Victor Stinner3a50e702011-10-18 21:21:00 +02007466 const DWORD flags = encode_code_page_flags(code_page, NULL);
7467 char *out;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007468 /* Create a substring so that we can get the UTF-16 representation
7469 of just the slice under consideration. */
7470 PyObject *substring;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007471
Martin v. Löwis3d325192011-11-04 18:23:06 +01007472 assert(len > 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007473
Victor Stinner3a50e702011-10-18 21:21:00 +02007474 if (code_page != CP_UTF8 && code_page != CP_UTF7)
Victor Stinner554f3f02010-06-16 23:33:54 +00007475 pusedDefaultChar = &usedDefaultChar;
Victor Stinner3a50e702011-10-18 21:21:00 +02007476 else
Victor Stinner554f3f02010-06-16 23:33:54 +00007477 pusedDefaultChar = NULL;
Victor Stinner554f3f02010-06-16 23:33:54 +00007478
Victor Stinner2fc507f2011-11-04 20:06:39 +01007479 substring = PyUnicode_Substring(unicode, offset, offset+len);
7480 if (substring == NULL)
7481 return -1;
7482 p = PyUnicode_AsUnicodeAndSize(substring, &size);
7483 if (p == NULL) {
7484 Py_DECREF(substring);
7485 return -1;
7486 }
Victor Stinner9f067f42013-06-05 00:21:31 +02007487 assert(size <= INT_MAX);
Martin v. Löwis3d325192011-11-04 18:23:06 +01007488
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007489 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007490 outsize = WideCharToMultiByte(code_page, flags,
Victor Stinner9f067f42013-06-05 00:21:31 +02007491 p, (int)size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007492 NULL, 0,
7493 NULL, pusedDefaultChar);
7494 if (outsize <= 0)
7495 goto error;
7496 /* If we used a default char, then we failed! */
Victor Stinner2fc507f2011-11-04 20:06:39 +01007497 if (pusedDefaultChar && *pusedDefaultChar) {
7498 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007499 return -2;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007500 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007501
Victor Stinner3a50e702011-10-18 21:21:00 +02007502 if (*outbytes == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007503 /* Create string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007504 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007505 if (*outbytes == NULL) {
7506 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007507 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007508 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007509 out = PyBytes_AS_STRING(*outbytes);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007510 }
7511 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007512 /* Extend string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007513 const Py_ssize_t n = PyBytes_Size(*outbytes);
7514 if (outsize > PY_SSIZE_T_MAX - n) {
7515 PyErr_NoMemory();
Victor Stinner2fc507f2011-11-04 20:06:39 +01007516 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007517 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007518 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007519 if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
7520 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007521 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007522 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007523 out = PyBytes_AS_STRING(*outbytes) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007524 }
7525
7526 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007527 outsize = WideCharToMultiByte(code_page, flags,
Victor Stinner9f067f42013-06-05 00:21:31 +02007528 p, (int)size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007529 out, outsize,
7530 NULL, pusedDefaultChar);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007531 Py_CLEAR(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007532 if (outsize <= 0)
7533 goto error;
7534 if (pusedDefaultChar && *pusedDefaultChar)
7535 return -2;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007536 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00007537
Victor Stinner3a50e702011-10-18 21:21:00 +02007538error:
Victor Stinner2fc507f2011-11-04 20:06:39 +01007539 Py_XDECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007540 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7541 return -2;
7542 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007543 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007544}
7545
Victor Stinner3a50e702011-10-18 21:21:00 +02007546/*
Serhiy Storchakad65c9492015-11-02 14:10:23 +02007547 * Encode a Unicode string to a Windows code page into a byte string using an
Victor Stinner3a50e702011-10-18 21:21:00 +02007548 * error handler.
7549 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007550 * Returns consumed characters if succeed, or raise an OSError and returns
Victor Stinner3a50e702011-10-18 21:21:00 +02007551 * -1 on other error.
7552 */
7553static int
7554encode_code_page_errors(UINT code_page, PyObject **outbytes,
Victor Stinner7581cef2011-11-03 22:32:33 +01007555 PyObject *unicode, Py_ssize_t unicode_offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007556 Py_ssize_t insize, const char* errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007557{
Victor Stinner3a50e702011-10-18 21:21:00 +02007558 const DWORD flags = encode_code_page_flags(code_page, errors);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007559 Py_ssize_t pos = unicode_offset;
7560 Py_ssize_t endin = unicode_offset + insize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007561 /* Ideally, we should get reason from FormatMessage. This is the Windows
7562 2000 English version of the message. */
7563 const char *reason = "invalid character";
7564 /* 4=maximum length of a UTF-8 sequence */
7565 char buffer[4];
7566 BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
7567 Py_ssize_t outsize;
7568 char *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007569 PyObject *errorHandler = NULL;
7570 PyObject *exc = NULL;
7571 PyObject *encoding_obj = NULL;
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007572 const char *encoding;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007573 Py_ssize_t newpos, newoutsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007574 PyObject *rep;
7575 int ret = -1;
7576
7577 assert(insize > 0);
7578
7579 encoding = code_page_name(code_page, &encoding_obj);
7580 if (encoding == NULL)
7581 return -1;
7582
7583 if (errors == NULL || strcmp(errors, "strict") == 0) {
7584 /* The last error was ERROR_NO_UNICODE_TRANSLATION,
7585 then we raise a UnicodeEncodeError. */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007586 make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
Victor Stinner3a50e702011-10-18 21:21:00 +02007587 if (exc != NULL) {
7588 PyCodec_StrictErrors(exc);
7589 Py_DECREF(exc);
7590 }
7591 Py_XDECREF(encoding_obj);
7592 return -1;
7593 }
7594
7595 if (code_page != CP_UTF8 && code_page != CP_UTF7)
7596 pusedDefaultChar = &usedDefaultChar;
7597 else
7598 pusedDefaultChar = NULL;
7599
7600 if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
7601 PyErr_NoMemory();
7602 goto error;
7603 }
7604 outsize = insize * Py_ARRAY_LENGTH(buffer);
7605
7606 if (*outbytes == NULL) {
7607 /* Create string object */
7608 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7609 if (*outbytes == NULL)
7610 goto error;
7611 out = PyBytes_AS_STRING(*outbytes);
7612 }
7613 else {
7614 /* Extend string object */
7615 Py_ssize_t n = PyBytes_Size(*outbytes);
7616 if (n > PY_SSIZE_T_MAX - outsize) {
7617 PyErr_NoMemory();
7618 goto error;
7619 }
7620 if (_PyBytes_Resize(outbytes, n + outsize) < 0)
7621 goto error;
7622 out = PyBytes_AS_STRING(*outbytes) + n;
7623 }
7624
7625 /* Encode the string character per character */
Martin v. Löwis3d325192011-11-04 18:23:06 +01007626 while (pos < endin)
Victor Stinner3a50e702011-10-18 21:21:00 +02007627 {
Victor Stinner2fc507f2011-11-04 20:06:39 +01007628 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
7629 wchar_t chars[2];
7630 int charsize;
7631 if (ch < 0x10000) {
7632 chars[0] = (wchar_t)ch;
7633 charsize = 1;
7634 }
7635 else {
Victor Stinner76df43d2012-10-30 01:42:39 +01007636 chars[0] = Py_UNICODE_HIGH_SURROGATE(ch);
7637 chars[1] = Py_UNICODE_LOW_SURROGATE(ch);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007638 charsize = 2;
7639 }
7640
Victor Stinner3a50e702011-10-18 21:21:00 +02007641 outsize = WideCharToMultiByte(code_page, flags,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007642 chars, charsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02007643 buffer, Py_ARRAY_LENGTH(buffer),
7644 NULL, pusedDefaultChar);
7645 if (outsize > 0) {
7646 if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
7647 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007648 pos++;
Victor Stinner3a50e702011-10-18 21:21:00 +02007649 memcpy(out, buffer, outsize);
7650 out += outsize;
7651 continue;
7652 }
7653 }
7654 else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
7655 PyErr_SetFromWindowsErr(0);
7656 goto error;
7657 }
7658
Victor Stinner3a50e702011-10-18 21:21:00 +02007659 rep = unicode_encode_call_errorhandler(
7660 errors, &errorHandler, encoding, reason,
Victor Stinner7581cef2011-11-03 22:32:33 +01007661 unicode, &exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007662 pos, pos + 1, &newpos);
Victor Stinner3a50e702011-10-18 21:21:00 +02007663 if (rep == NULL)
7664 goto error;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007665 pos = newpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007666
7667 if (PyBytes_Check(rep)) {
7668 outsize = PyBytes_GET_SIZE(rep);
7669 if (outsize != 1) {
7670 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7671 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7672 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7673 Py_DECREF(rep);
7674 goto error;
7675 }
7676 out = PyBytes_AS_STRING(*outbytes) + offset;
7677 }
7678 memcpy(out, PyBytes_AS_STRING(rep), outsize);
7679 out += outsize;
7680 }
7681 else {
7682 Py_ssize_t i;
7683 enum PyUnicode_Kind kind;
7684 void *data;
7685
Benjamin Petersonbac79492012-01-14 13:34:47 -05007686 if (PyUnicode_READY(rep) == -1) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007687 Py_DECREF(rep);
7688 goto error;
7689 }
7690
7691 outsize = PyUnicode_GET_LENGTH(rep);
7692 if (outsize != 1) {
7693 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7694 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7695 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7696 Py_DECREF(rep);
7697 goto error;
7698 }
7699 out = PyBytes_AS_STRING(*outbytes) + offset;
7700 }
7701 kind = PyUnicode_KIND(rep);
7702 data = PyUnicode_DATA(rep);
7703 for (i=0; i < outsize; i++) {
7704 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7705 if (ch > 127) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007706 raise_encode_exception(&exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007707 encoding, unicode,
7708 pos, pos + 1,
Victor Stinner3a50e702011-10-18 21:21:00 +02007709 "unable to encode error handler result to ASCII");
7710 Py_DECREF(rep);
7711 goto error;
7712 }
7713 *out = (unsigned char)ch;
7714 out++;
7715 }
7716 }
7717 Py_DECREF(rep);
7718 }
7719 /* write a NUL byte */
7720 *out = 0;
7721 outsize = out - PyBytes_AS_STRING(*outbytes);
7722 assert(outsize <= PyBytes_GET_SIZE(*outbytes));
7723 if (_PyBytes_Resize(outbytes, outsize) < 0)
7724 goto error;
7725 ret = 0;
7726
7727error:
7728 Py_XDECREF(encoding_obj);
7729 Py_XDECREF(errorHandler);
7730 Py_XDECREF(exc);
7731 return ret;
7732}
7733
Victor Stinner3a50e702011-10-18 21:21:00 +02007734static PyObject *
7735encode_code_page(int code_page,
Victor Stinner7581cef2011-11-03 22:32:33 +01007736 PyObject *unicode,
Victor Stinner3a50e702011-10-18 21:21:00 +02007737 const char *errors)
7738{
Martin v. Löwis3d325192011-11-04 18:23:06 +01007739 Py_ssize_t len;
Victor Stinner3a50e702011-10-18 21:21:00 +02007740 PyObject *outbytes = NULL;
Victor Stinner7581cef2011-11-03 22:32:33 +01007741 Py_ssize_t offset;
Victor Stinner76a31a62011-11-04 00:05:13 +01007742 int chunk_len, ret, done;
Victor Stinner7581cef2011-11-03 22:32:33 +01007743
Victor Stinner29dacf22015-01-26 16:41:32 +01007744 if (!PyUnicode_Check(unicode)) {
7745 PyErr_BadArgument();
7746 return NULL;
7747 }
7748
Benjamin Petersonbac79492012-01-14 13:34:47 -05007749 if (PyUnicode_READY(unicode) == -1)
Victor Stinner2fc507f2011-11-04 20:06:39 +01007750 return NULL;
7751 len = PyUnicode_GET_LENGTH(unicode);
Guido van Rossum03e29f12000-05-04 15:52:20 +00007752
Victor Stinner3a50e702011-10-18 21:21:00 +02007753 if (code_page < 0) {
7754 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7755 return NULL;
7756 }
7757
Martin v. Löwis3d325192011-11-04 18:23:06 +01007758 if (len == 0)
Victor Stinner76a31a62011-11-04 00:05:13 +01007759 return PyBytes_FromStringAndSize(NULL, 0);
7760
Victor Stinner7581cef2011-11-03 22:32:33 +01007761 offset = 0;
7762 do
7763 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007764#ifdef NEED_RETRY
Victor Stinner2fc507f2011-11-04 20:06:39 +01007765 /* UTF-16 encoding may double the size, so use only INT_MAX/2
Martin v. Löwis3d325192011-11-04 18:23:06 +01007766 chunks. */
7767 if (len > INT_MAX/2) {
7768 chunk_len = INT_MAX/2;
Victor Stinner76a31a62011-11-04 00:05:13 +01007769 done = 0;
7770 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007771 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007772#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007773 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007774 chunk_len = (int)len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007775 done = 1;
7776 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007777
Victor Stinner76a31a62011-11-04 00:05:13 +01007778 ret = encode_code_page_strict(code_page, &outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007779 unicode, offset, chunk_len,
Victor Stinner76a31a62011-11-04 00:05:13 +01007780 errors);
7781 if (ret == -2)
7782 ret = encode_code_page_errors(code_page, &outbytes,
7783 unicode, offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007784 chunk_len, errors);
Victor Stinner7581cef2011-11-03 22:32:33 +01007785 if (ret < 0) {
7786 Py_XDECREF(outbytes);
7787 return NULL;
7788 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007789
Victor Stinner7581cef2011-11-03 22:32:33 +01007790 offset += chunk_len;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007791 len -= chunk_len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007792 } while (!done);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007793
Victor Stinner3a50e702011-10-18 21:21:00 +02007794 return outbytes;
7795}
7796
7797PyObject *
7798PyUnicode_EncodeMBCS(const Py_UNICODE *p,
7799 Py_ssize_t size,
7800 const char *errors)
7801{
Victor Stinner7581cef2011-11-03 22:32:33 +01007802 PyObject *unicode, *res;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02007803 unicode = PyUnicode_FromWideChar(p, size);
Victor Stinner7581cef2011-11-03 22:32:33 +01007804 if (unicode == NULL)
7805 return NULL;
7806 res = encode_code_page(CP_ACP, unicode, errors);
7807 Py_DECREF(unicode);
7808 return res;
Victor Stinner3a50e702011-10-18 21:21:00 +02007809}
7810
7811PyObject *
7812PyUnicode_EncodeCodePage(int code_page,
7813 PyObject *unicode,
7814 const char *errors)
7815{
Victor Stinner7581cef2011-11-03 22:32:33 +01007816 return encode_code_page(code_page, unicode, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007817}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007818
Alexander Belopolsky40018472011-02-26 01:02:56 +00007819PyObject *
7820PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007821{
Victor Stinner7581cef2011-11-03 22:32:33 +01007822 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007823}
7824
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007825#undef NEED_RETRY
7826
Steve Dowercc16be82016-09-08 10:35:16 -07007827#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007828
Guido van Rossumd57fd912000-03-10 22:53:23 +00007829/* --- Character Mapping Codec -------------------------------------------- */
7830
Victor Stinnerfb161b12013-04-18 01:44:27 +02007831static int
7832charmap_decode_string(const char *s,
7833 Py_ssize_t size,
7834 PyObject *mapping,
7835 const char *errors,
7836 _PyUnicodeWriter *writer)
7837{
7838 const char *starts = s;
7839 const char *e;
7840 Py_ssize_t startinpos, endinpos;
7841 PyObject *errorHandler = NULL, *exc = NULL;
7842 Py_ssize_t maplen;
7843 enum PyUnicode_Kind mapkind;
7844 void *mapdata;
7845 Py_UCS4 x;
7846 unsigned char ch;
7847
7848 if (PyUnicode_READY(mapping) == -1)
7849 return -1;
7850
7851 maplen = PyUnicode_GET_LENGTH(mapping);
7852 mapdata = PyUnicode_DATA(mapping);
7853 mapkind = PyUnicode_KIND(mapping);
7854
7855 e = s + size;
7856
7857 if (mapkind == PyUnicode_1BYTE_KIND && maplen >= 256) {
7858 /* fast-path for cp037, cp500 and iso8859_1 encodings. iso8859_1
7859 * is disabled in encoding aliases, latin1 is preferred because
7860 * its implementation is faster. */
7861 Py_UCS1 *mapdata_ucs1 = (Py_UCS1 *)mapdata;
7862 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
7863 Py_UCS4 maxchar = writer->maxchar;
7864
7865 assert (writer->kind == PyUnicode_1BYTE_KIND);
7866 while (s < e) {
7867 ch = *s;
7868 x = mapdata_ucs1[ch];
7869 if (x > maxchar) {
7870 if (_PyUnicodeWriter_Prepare(writer, 1, 0xff) == -1)
7871 goto onError;
7872 maxchar = writer->maxchar;
7873 outdata = (Py_UCS1 *)writer->data;
7874 }
7875 outdata[writer->pos] = x;
7876 writer->pos++;
7877 ++s;
7878 }
7879 return 0;
7880 }
7881
7882 while (s < e) {
7883 if (mapkind == PyUnicode_2BYTE_KIND && maplen >= 256) {
7884 enum PyUnicode_Kind outkind = writer->kind;
7885 Py_UCS2 *mapdata_ucs2 = (Py_UCS2 *)mapdata;
7886 if (outkind == PyUnicode_1BYTE_KIND) {
7887 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
7888 Py_UCS4 maxchar = writer->maxchar;
7889 while (s < e) {
7890 ch = *s;
7891 x = mapdata_ucs2[ch];
7892 if (x > maxchar)
7893 goto Error;
7894 outdata[writer->pos] = x;
7895 writer->pos++;
7896 ++s;
7897 }
7898 break;
7899 }
7900 else if (outkind == PyUnicode_2BYTE_KIND) {
7901 Py_UCS2 *outdata = (Py_UCS2 *)writer->data;
7902 while (s < e) {
7903 ch = *s;
7904 x = mapdata_ucs2[ch];
7905 if (x == 0xFFFE)
7906 goto Error;
7907 outdata[writer->pos] = x;
7908 writer->pos++;
7909 ++s;
7910 }
7911 break;
7912 }
7913 }
7914 ch = *s;
7915
7916 if (ch < maplen)
7917 x = PyUnicode_READ(mapkind, mapdata, ch);
7918 else
7919 x = 0xfffe; /* invalid value */
7920Error:
7921 if (x == 0xfffe)
7922 {
7923 /* undefined mapping */
7924 startinpos = s-starts;
7925 endinpos = startinpos+1;
7926 if (unicode_decode_call_errorhandler_writer(
7927 errors, &errorHandler,
7928 "charmap", "character maps to <undefined>",
7929 &starts, &e, &startinpos, &endinpos, &exc, &s,
7930 writer)) {
7931 goto onError;
7932 }
7933 continue;
7934 }
7935
7936 if (_PyUnicodeWriter_WriteCharInline(writer, x) < 0)
7937 goto onError;
7938 ++s;
7939 }
7940 Py_XDECREF(errorHandler);
7941 Py_XDECREF(exc);
7942 return 0;
7943
7944onError:
7945 Py_XDECREF(errorHandler);
7946 Py_XDECREF(exc);
7947 return -1;
7948}
7949
7950static int
7951charmap_decode_mapping(const char *s,
7952 Py_ssize_t size,
7953 PyObject *mapping,
7954 const char *errors,
7955 _PyUnicodeWriter *writer)
7956{
7957 const char *starts = s;
7958 const char *e;
7959 Py_ssize_t startinpos, endinpos;
7960 PyObject *errorHandler = NULL, *exc = NULL;
7961 unsigned char ch;
Victor Stinnerf4f24242013-05-07 01:01:31 +02007962 PyObject *key, *item = NULL;
Victor Stinnerfb161b12013-04-18 01:44:27 +02007963
7964 e = s + size;
7965
7966 while (s < e) {
7967 ch = *s;
7968
7969 /* Get mapping (char ordinal -> integer, Unicode char or None) */
7970 key = PyLong_FromLong((long)ch);
7971 if (key == NULL)
7972 goto onError;
7973
7974 item = PyObject_GetItem(mapping, key);
7975 Py_DECREF(key);
7976 if (item == NULL) {
7977 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7978 /* No mapping found means: mapping is undefined. */
7979 PyErr_Clear();
7980 goto Undefined;
7981 } else
7982 goto onError;
7983 }
7984
7985 /* Apply mapping */
7986 if (item == Py_None)
7987 goto Undefined;
7988 if (PyLong_Check(item)) {
7989 long value = PyLong_AS_LONG(item);
7990 if (value == 0xFFFE)
7991 goto Undefined;
7992 if (value < 0 || value > MAX_UNICODE) {
7993 PyErr_Format(PyExc_TypeError,
7994 "character mapping must be in range(0x%lx)",
7995 (unsigned long)MAX_UNICODE + 1);
7996 goto onError;
7997 }
7998
7999 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
8000 goto onError;
8001 }
8002 else if (PyUnicode_Check(item)) {
8003 if (PyUnicode_READY(item) == -1)
8004 goto onError;
8005 if (PyUnicode_GET_LENGTH(item) == 1) {
8006 Py_UCS4 value = PyUnicode_READ_CHAR(item, 0);
8007 if (value == 0xFFFE)
8008 goto Undefined;
8009 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
8010 goto onError;
8011 }
8012 else {
8013 writer->overallocate = 1;
8014 if (_PyUnicodeWriter_WriteStr(writer, item) == -1)
8015 goto onError;
8016 }
8017 }
8018 else {
8019 /* wrong return value */
8020 PyErr_SetString(PyExc_TypeError,
8021 "character mapping must return integer, None or str");
8022 goto onError;
8023 }
8024 Py_CLEAR(item);
8025 ++s;
8026 continue;
8027
8028Undefined:
8029 /* undefined mapping */
8030 Py_CLEAR(item);
8031 startinpos = s-starts;
8032 endinpos = startinpos+1;
8033 if (unicode_decode_call_errorhandler_writer(
8034 errors, &errorHandler,
8035 "charmap", "character maps to <undefined>",
8036 &starts, &e, &startinpos, &endinpos, &exc, &s,
8037 writer)) {
8038 goto onError;
8039 }
8040 }
8041 Py_XDECREF(errorHandler);
8042 Py_XDECREF(exc);
8043 return 0;
8044
8045onError:
8046 Py_XDECREF(item);
8047 Py_XDECREF(errorHandler);
8048 Py_XDECREF(exc);
8049 return -1;
8050}
8051
Alexander Belopolsky40018472011-02-26 01:02:56 +00008052PyObject *
8053PyUnicode_DecodeCharmap(const char *s,
8054 Py_ssize_t size,
8055 PyObject *mapping,
8056 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008057{
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008058 _PyUnicodeWriter writer;
Tim Petersced69f82003-09-16 20:30:58 +00008059
Guido van Rossumd57fd912000-03-10 22:53:23 +00008060 /* Default to Latin-1 */
8061 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008062 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008063
Guido van Rossumd57fd912000-03-10 22:53:23 +00008064 if (size == 0)
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02008065 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner8f674cc2013-04-17 23:02:17 +02008066 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02008067 writer.min_length = size;
8068 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008069 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008070
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00008071 if (PyUnicode_CheckExact(mapping)) {
Victor Stinnerfb161b12013-04-18 01:44:27 +02008072 if (charmap_decode_string(s, size, mapping, errors, &writer) < 0)
8073 goto onError;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00008074 }
8075 else {
Victor Stinnerfb161b12013-04-18 01:44:27 +02008076 if (charmap_decode_mapping(s, size, mapping, errors, &writer) < 0)
8077 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008078 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008079 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00008080
Benjamin Peterson29060642009-01-31 22:14:21 +00008081 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008082 _PyUnicodeWriter_Dealloc(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008083 return NULL;
8084}
8085
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008086/* Charmap encoding: the lookup table */
8087
Alexander Belopolsky40018472011-02-26 01:02:56 +00008088struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00008089 PyObject_HEAD
8090 unsigned char level1[32];
8091 int count2, count3;
8092 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008093};
8094
8095static PyObject*
8096encoding_map_size(PyObject *obj, PyObject* args)
8097{
8098 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008099 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00008100 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008101}
8102
8103static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008104 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00008105 PyDoc_STR("Return the size (in bytes) of this object") },
8106 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008107};
8108
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008109static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008110 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008111 "EncodingMap", /*tp_name*/
8112 sizeof(struct encoding_map), /*tp_basicsize*/
8113 0, /*tp_itemsize*/
8114 /* methods */
Inada Naoki7d408692019-05-29 17:23:27 +09008115 0, /*tp_dealloc*/
Jeroen Demeyer530f5062019-05-31 04:13:39 +02008116 0, /*tp_vectorcall_offset*/
Benjamin Peterson29060642009-01-31 22:14:21 +00008117 0, /*tp_getattr*/
8118 0, /*tp_setattr*/
Jeroen Demeyer530f5062019-05-31 04:13:39 +02008119 0, /*tp_as_async*/
Benjamin Peterson29060642009-01-31 22:14:21 +00008120 0, /*tp_repr*/
8121 0, /*tp_as_number*/
8122 0, /*tp_as_sequence*/
8123 0, /*tp_as_mapping*/
8124 0, /*tp_hash*/
8125 0, /*tp_call*/
8126 0, /*tp_str*/
8127 0, /*tp_getattro*/
8128 0, /*tp_setattro*/
8129 0, /*tp_as_buffer*/
8130 Py_TPFLAGS_DEFAULT, /*tp_flags*/
8131 0, /*tp_doc*/
8132 0, /*tp_traverse*/
8133 0, /*tp_clear*/
8134 0, /*tp_richcompare*/
8135 0, /*tp_weaklistoffset*/
8136 0, /*tp_iter*/
8137 0, /*tp_iternext*/
8138 encoding_map_methods, /*tp_methods*/
8139 0, /*tp_members*/
8140 0, /*tp_getset*/
8141 0, /*tp_base*/
8142 0, /*tp_dict*/
8143 0, /*tp_descr_get*/
8144 0, /*tp_descr_set*/
8145 0, /*tp_dictoffset*/
8146 0, /*tp_init*/
8147 0, /*tp_alloc*/
8148 0, /*tp_new*/
8149 0, /*tp_free*/
8150 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008151};
8152
8153PyObject*
8154PyUnicode_BuildEncodingMap(PyObject* string)
8155{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008156 PyObject *result;
8157 struct encoding_map *mresult;
8158 int i;
8159 int need_dict = 0;
8160 unsigned char level1[32];
8161 unsigned char level2[512];
8162 unsigned char *mlevel1, *mlevel2, *mlevel3;
8163 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008164 int kind;
8165 void *data;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008166 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008167 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008168
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008169 if (!PyUnicode_Check(string) || !PyUnicode_GET_LENGTH(string)) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008170 PyErr_BadArgument();
8171 return NULL;
8172 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008173 kind = PyUnicode_KIND(string);
8174 data = PyUnicode_DATA(string);
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008175 length = PyUnicode_GET_LENGTH(string);
8176 length = Py_MIN(length, 256);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008177 memset(level1, 0xFF, sizeof level1);
8178 memset(level2, 0xFF, sizeof level2);
8179
8180 /* If there isn't a one-to-one mapping of NULL to \0,
8181 or if there are non-BMP characters, we need to use
8182 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008183 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008184 need_dict = 1;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008185 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008186 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008187 ch = PyUnicode_READ(kind, data, i);
8188 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008189 need_dict = 1;
8190 break;
8191 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008192 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008193 /* unmapped character */
8194 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008195 l1 = ch >> 11;
8196 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008197 if (level1[l1] == 0xFF)
8198 level1[l1] = count2++;
8199 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00008200 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008201 }
8202
8203 if (count2 >= 0xFF || count3 >= 0xFF)
8204 need_dict = 1;
8205
8206 if (need_dict) {
8207 PyObject *result = PyDict_New();
8208 PyObject *key, *value;
8209 if (!result)
8210 return NULL;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008211 for (i = 0; i < length; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008212 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00008213 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008214 if (!key || !value)
8215 goto failed1;
8216 if (PyDict_SetItem(result, key, value) == -1)
8217 goto failed1;
8218 Py_DECREF(key);
8219 Py_DECREF(value);
8220 }
8221 return result;
8222 failed1:
8223 Py_XDECREF(key);
8224 Py_XDECREF(value);
8225 Py_DECREF(result);
8226 return NULL;
8227 }
8228
8229 /* Create a three-level trie */
8230 result = PyObject_MALLOC(sizeof(struct encoding_map) +
8231 16*count2 + 128*count3 - 1);
8232 if (!result)
8233 return PyErr_NoMemory();
8234 PyObject_Init(result, &EncodingMapType);
8235 mresult = (struct encoding_map*)result;
8236 mresult->count2 = count2;
8237 mresult->count3 = count3;
8238 mlevel1 = mresult->level1;
8239 mlevel2 = mresult->level23;
8240 mlevel3 = mresult->level23 + 16*count2;
8241 memcpy(mlevel1, level1, 32);
8242 memset(mlevel2, 0xFF, 16*count2);
8243 memset(mlevel3, 0, 128*count3);
8244 count3 = 0;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008245 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008246 int o1, o2, o3, i2, i3;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008247 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8248 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008249 /* unmapped character */
8250 continue;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008251 o1 = ch>>11;
8252 o2 = (ch>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008253 i2 = 16*mlevel1[o1] + o2;
8254 if (mlevel2[i2] == 0xFF)
8255 mlevel2[i2] = count3++;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008256 o3 = ch & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008257 i3 = 128*mlevel2[i2] + o3;
8258 mlevel3[i3] = i;
8259 }
8260 return result;
8261}
8262
8263static int
Victor Stinner22168992011-11-20 17:09:18 +01008264encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008265{
8266 struct encoding_map *map = (struct encoding_map*)mapping;
8267 int l1 = c>>11;
8268 int l2 = (c>>7) & 0xF;
8269 int l3 = c & 0x7F;
8270 int i;
8271
Victor Stinner22168992011-11-20 17:09:18 +01008272 if (c > 0xFFFF)
Benjamin Peterson29060642009-01-31 22:14:21 +00008273 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008274 if (c == 0)
8275 return 0;
8276 /* level 1*/
8277 i = map->level1[l1];
8278 if (i == 0xFF) {
8279 return -1;
8280 }
8281 /* level 2*/
8282 i = map->level23[16*i+l2];
8283 if (i == 0xFF) {
8284 return -1;
8285 }
8286 /* level 3 */
8287 i = map->level23[16*map->count2 + 128*i + l3];
8288 if (i == 0) {
8289 return -1;
8290 }
8291 return i;
8292}
8293
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008294/* Lookup the character ch in the mapping. If the character
8295 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00008296 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008297static PyObject *
Victor Stinner22168992011-11-20 17:09:18 +01008298charmapencode_lookup(Py_UCS4 c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008299{
Christian Heimes217cfd12007-12-02 14:31:20 +00008300 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008301 PyObject *x;
8302
8303 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008304 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008305 x = PyObject_GetItem(mapping, w);
8306 Py_DECREF(w);
8307 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008308 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8309 /* No mapping found means: mapping is undefined. */
8310 PyErr_Clear();
Serhiy Storchaka228b12e2017-01-23 09:47:21 +02008311 Py_RETURN_NONE;
Benjamin Peterson29060642009-01-31 22:14:21 +00008312 } else
8313 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008314 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00008315 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008316 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00008317 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008318 long value = PyLong_AS_LONG(x);
8319 if (value < 0 || value > 255) {
8320 PyErr_SetString(PyExc_TypeError,
8321 "character mapping must be in range(256)");
8322 Py_DECREF(x);
8323 return NULL;
8324 }
8325 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008326 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008327 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008328 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008329 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008330 /* wrong return value */
8331 PyErr_Format(PyExc_TypeError,
8332 "character mapping must return integer, bytes or None, not %.400s",
8333 x->ob_type->tp_name);
8334 Py_DECREF(x);
8335 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008336 }
8337}
8338
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008339static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00008340charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008341{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008342 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
8343 /* exponentially overallocate to minimize reallocations */
8344 if (requiredsize < 2*outsize)
8345 requiredsize = 2*outsize;
8346 if (_PyBytes_Resize(outobj, requiredsize))
8347 return -1;
8348 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008349}
8350
/* Outcome of trying to encode one character through a charmap. */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was written to the output */
    enc_FAILED,     /* the mapping is undefined for the character */
    enc_EXCEPTION   /* an error occurred */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008354/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00008355 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008356 space is available. Return a new reference to the object that
8357 was put in the output buffer, or Py_None, if the mapping was undefined
8358 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00008359 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008360static charmapencode_result
Victor Stinner22168992011-11-20 17:09:18 +01008361charmapencode_output(Py_UCS4 c, PyObject *mapping,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008362 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008363{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008364 PyObject *rep;
8365 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00008366 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008367
Christian Heimes90aa7642007-12-19 02:45:37 +00008368 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008369 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008370 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008371 if (res == -1)
8372 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00008373 if (outsize<requiredsize)
8374 if (charmapencode_resize(outobj, outpos, requiredsize))
8375 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00008376 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008377 outstart[(*outpos)++] = (char)res;
8378 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008379 }
8380
8381 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008382 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008383 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008384 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008385 Py_DECREF(rep);
8386 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008387 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008388 if (PyLong_Check(rep)) {
8389 Py_ssize_t requiredsize = *outpos+1;
8390 if (outsize<requiredsize)
8391 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8392 Py_DECREF(rep);
8393 return enc_EXCEPTION;
8394 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008395 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008396 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008397 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008398 else {
8399 const char *repchars = PyBytes_AS_STRING(rep);
8400 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
8401 Py_ssize_t requiredsize = *outpos+repsize;
8402 if (outsize<requiredsize)
8403 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8404 Py_DECREF(rep);
8405 return enc_EXCEPTION;
8406 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008407 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008408 memcpy(outstart + *outpos, repchars, repsize);
8409 *outpos += repsize;
8410 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008411 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008412 Py_DECREF(rep);
8413 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008414}
8415
8416/* handle an error in PyUnicode_EncodeCharmap
8417 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008418static int
8419charmap_encoding_error(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008420 PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008421 PyObject **exceptionObject,
Victor Stinner50149202015-09-22 00:26:54 +02008422 _Py_error_handler *error_handler, PyObject **error_handler_obj, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00008423 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008424{
8425 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008426 Py_ssize_t size, repsize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008427 Py_ssize_t newpos;
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008428 enum PyUnicode_Kind kind;
8429 void *data;
8430 Py_ssize_t index;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008431 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008432 Py_ssize_t collstartpos = *inpos;
8433 Py_ssize_t collendpos = *inpos+1;
8434 Py_ssize_t collpos;
Serhiy Storchakae2f92de2017-11-11 13:06:26 +02008435 const char *encoding = "charmap";
8436 const char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008437 charmapencode_result x;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008438 Py_UCS4 ch;
Brian Curtin2787ea42011-11-02 15:09:37 -05008439 int val;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008440
Benjamin Petersonbac79492012-01-14 13:34:47 -05008441 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008442 return -1;
8443 size = PyUnicode_GET_LENGTH(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008444 /* find all unencodable characters */
8445 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008446 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00008447 if (Py_TYPE(mapping) == &EncodingMapType) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008448 ch = PyUnicode_READ_CHAR(unicode, collendpos);
Brian Curtin2787ea42011-11-02 15:09:37 -05008449 val = encoding_map_lookup(ch, mapping);
8450 if (val != -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008451 break;
8452 ++collendpos;
8453 continue;
8454 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008455
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008456 ch = PyUnicode_READ_CHAR(unicode, collendpos);
8457 rep = charmapencode_lookup(ch, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008458 if (rep==NULL)
8459 return -1;
8460 else if (rep!=Py_None) {
8461 Py_DECREF(rep);
8462 break;
8463 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008464 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00008465 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008466 }
8467 /* cache callback name lookup
8468 * (if not done yet, i.e. it's the first error) */
Victor Stinner50149202015-09-22 00:26:54 +02008469 if (*error_handler == _Py_ERROR_UNKNOWN)
Victor Stinner3d4226a2018-08-29 22:21:32 +02008470 *error_handler = _Py_GetErrorHandler(errors);
Victor Stinner50149202015-09-22 00:26:54 +02008471
8472 switch (*error_handler) {
8473 case _Py_ERROR_STRICT:
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008474 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008475 return -1;
Victor Stinner50149202015-09-22 00:26:54 +02008476
8477 case _Py_ERROR_REPLACE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008478 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008479 x = charmapencode_output('?', mapping, res, respos);
8480 if (x==enc_EXCEPTION) {
8481 return -1;
8482 }
8483 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008484 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008485 return -1;
8486 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008487 }
8488 /* fall through */
Victor Stinner50149202015-09-22 00:26:54 +02008489 case _Py_ERROR_IGNORE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008490 *inpos = collendpos;
8491 break;
Victor Stinner50149202015-09-22 00:26:54 +02008492
8493 case _Py_ERROR_XMLCHARREFREPLACE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008494 /* generate replacement (temporarily (mis)uses p) */
8495 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008496 char buffer[2+29+1+1];
8497 char *cp;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008498 sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
Benjamin Peterson29060642009-01-31 22:14:21 +00008499 for (cp = buffer; *cp; ++cp) {
8500 x = charmapencode_output(*cp, mapping, res, respos);
8501 if (x==enc_EXCEPTION)
8502 return -1;
8503 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008504 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008505 return -1;
8506 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008507 }
8508 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008509 *inpos = collendpos;
8510 break;
Victor Stinner50149202015-09-22 00:26:54 +02008511
Benjamin Peterson14339b62009-01-31 16:36:08 +00008512 default:
Victor Stinner50149202015-09-22 00:26:54 +02008513 repunicode = unicode_encode_call_errorhandler(errors, error_handler_obj,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008514 encoding, reason, unicode, exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00008515 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008516 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008517 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00008518 if (PyBytes_Check(repunicode)) {
8519 /* Directly copy bytes result to output. */
8520 Py_ssize_t outsize = PyBytes_Size(*res);
8521 Py_ssize_t requiredsize;
8522 repsize = PyBytes_Size(repunicode);
8523 requiredsize = *respos + repsize;
8524 if (requiredsize > outsize)
8525 /* Make room for all additional bytes. */
8526 if (charmapencode_resize(res, respos, requiredsize)) {
8527 Py_DECREF(repunicode);
8528 return -1;
8529 }
8530 memcpy(PyBytes_AsString(*res) + *respos,
8531 PyBytes_AsString(repunicode), repsize);
8532 *respos += repsize;
8533 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008534 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00008535 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008536 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008537 /* generate replacement */
Benjamin Petersonbac79492012-01-14 13:34:47 -05008538 if (PyUnicode_READY(repunicode) == -1) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008539 Py_DECREF(repunicode);
8540 return -1;
8541 }
Victor Stinner9e30aa52011-11-21 02:49:52 +01008542 repsize = PyUnicode_GET_LENGTH(repunicode);
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008543 data = PyUnicode_DATA(repunicode);
8544 kind = PyUnicode_KIND(repunicode);
8545 for (index = 0; index < repsize; index++) {
8546 Py_UCS4 repch = PyUnicode_READ(kind, data, index);
8547 x = charmapencode_output(repch, mapping, res, respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008548 if (x==enc_EXCEPTION) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008549 Py_DECREF(repunicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00008550 return -1;
8551 }
8552 else if (x==enc_FAILED) {
8553 Py_DECREF(repunicode);
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008554 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008555 return -1;
8556 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008557 }
8558 *inpos = newpos;
8559 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008560 }
8561 return 0;
8562}
8563
Alexander Belopolsky40018472011-02-26 01:02:56 +00008564PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008565_PyUnicode_EncodeCharmap(PyObject *unicode,
8566 PyObject *mapping,
8567 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008568{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008569 /* output object */
8570 PyObject *res = NULL;
8571 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008572 Py_ssize_t inpos = 0;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008573 Py_ssize_t size;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008574 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008575 Py_ssize_t respos = 0;
Victor Stinner50149202015-09-22 00:26:54 +02008576 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008577 PyObject *exc = NULL;
Victor Stinner50149202015-09-22 00:26:54 +02008578 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Victor Stinner69ed0f42013-04-09 21:48:24 +02008579 void *data;
8580 int kind;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008581
Benjamin Petersonbac79492012-01-14 13:34:47 -05008582 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008583 return NULL;
8584 size = PyUnicode_GET_LENGTH(unicode);
Victor Stinner69ed0f42013-04-09 21:48:24 +02008585 data = PyUnicode_DATA(unicode);
8586 kind = PyUnicode_KIND(unicode);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008587
Guido van Rossumd57fd912000-03-10 22:53:23 +00008588 /* Default to Latin-1 */
8589 if (mapping == NULL)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008590 return unicode_encode_ucs1(unicode, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008591
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008592 /* allocate enough for a simple encoding without
8593 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00008594 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008595 if (res == NULL)
8596 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00008597 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008598 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008599
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008600 while (inpos<size) {
Victor Stinner69ed0f42013-04-09 21:48:24 +02008601 Py_UCS4 ch = PyUnicode_READ(kind, data, inpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008602 /* try to encode it */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008603 charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008604 if (x==enc_EXCEPTION) /* error */
8605 goto onError;
8606 if (x==enc_FAILED) { /* unencodable character */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008607 if (charmap_encoding_error(unicode, &inpos, mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00008608 &exc,
Victor Stinner50149202015-09-22 00:26:54 +02008609 &error_handler, &error_handler_obj, errors,
Benjamin Peterson29060642009-01-31 22:14:21 +00008610 &res, &respos)) {
8611 goto onError;
8612 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008613 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008614 else
8615 /* done with this character => adjust input position */
8616 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008617 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008618
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008619 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00008620 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00008621 if (_PyBytes_Resize(&res, respos) < 0)
8622 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00008623
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008624 Py_XDECREF(exc);
Victor Stinner50149202015-09-22 00:26:54 +02008625 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008626 return res;
8627
Benjamin Peterson29060642009-01-31 22:14:21 +00008628 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008629 Py_XDECREF(res);
8630 Py_XDECREF(exc);
Victor Stinner50149202015-09-22 00:26:54 +02008631 Py_XDECREF(error_handler_obj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008632 return NULL;
8633}
8634
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008635/* Deprecated */
8636PyObject *
8637PyUnicode_EncodeCharmap(const Py_UNICODE *p,
8638 Py_ssize_t size,
8639 PyObject *mapping,
8640 const char *errors)
8641{
8642 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02008643 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008644 if (unicode == NULL)
8645 return NULL;
8646 result = _PyUnicode_EncodeCharmap(unicode, mapping, errors);
8647 Py_DECREF(unicode);
Victor Stinnerfc026c92011-11-04 00:24:51 +01008648 return result;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008649}
8650
Alexander Belopolsky40018472011-02-26 01:02:56 +00008651PyObject *
8652PyUnicode_AsCharmapString(PyObject *unicode,
8653 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008654{
8655 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008656 PyErr_BadArgument();
8657 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008658 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008659 return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008660}
8661
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008662/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008663static void
8664make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008665 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008666 Py_ssize_t startpos, Py_ssize_t endpos,
8667 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008668{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008669 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008670 *exceptionObject = _PyUnicodeTranslateError_Create(
8671 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008672 }
8673 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008674 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
8675 goto onError;
8676 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
8677 goto onError;
8678 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
8679 goto onError;
8680 return;
8681 onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02008682 Py_CLEAR(*exceptionObject);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008683 }
8684}
8685
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008686/* error handling callback helper:
8687 build arguments, call the callback and check the arguments,
8688 put the result into newpos and return the replacement string, which
8689 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008690static PyObject *
8691unicode_translate_call_errorhandler(const char *errors,
8692 PyObject **errorHandler,
8693 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008694 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008695 Py_ssize_t startpos, Py_ssize_t endpos,
8696 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008697{
Serhiy Storchakaf8d7d412016-10-23 15:12:25 +03008698 static const char *argparse = "Un;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008699
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008700 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008701 PyObject *restuple;
8702 PyObject *resunicode;
8703
8704 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008705 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008706 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008707 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008708 }
8709
8710 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008711 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008712 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008713 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008714
Victor Stinnerde4ae3d2016-12-04 22:59:09 +01008715 restuple = PyObject_CallFunctionObjArgs(
8716 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008717 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008718 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008719 if (!PyTuple_Check(restuple)) {
Serhiy Storchakaf8d7d412016-10-23 15:12:25 +03008720 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00008721 Py_DECREF(restuple);
8722 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008723 }
Serhiy Storchakaf8d7d412016-10-23 15:12:25 +03008724 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00008725 &resunicode, &i_newpos)) {
8726 Py_DECREF(restuple);
8727 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008728 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00008729 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008730 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008731 else
8732 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008733 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnera33bce02014-07-04 22:47:46 +02008734 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008735 Py_DECREF(restuple);
8736 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00008737 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008738 Py_INCREF(resunicode);
8739 Py_DECREF(restuple);
8740 return resunicode;
8741}
8742
8743/* Lookup the character ch in the mapping and put the result in result,
8744 which must be decrefed by the caller.
8745 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008746static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008747charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008748{
Christian Heimes217cfd12007-12-02 14:31:20 +00008749 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008750 PyObject *x;
8751
8752 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008753 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008754 x = PyObject_GetItem(mapping, w);
8755 Py_DECREF(w);
8756 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008757 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8758 /* No mapping found means: use 1:1 mapping. */
8759 PyErr_Clear();
8760 *result = NULL;
8761 return 0;
8762 } else
8763 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008764 }
8765 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008766 *result = x;
8767 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008768 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008769 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008770 long value = PyLong_AS_LONG(x);
Victor Stinner4ff33af2014-04-05 11:56:37 +02008771 if (value < 0 || value > MAX_UNICODE) {
8772 PyErr_Format(PyExc_ValueError,
8773 "character mapping must be in range(0x%x)",
8774 MAX_UNICODE+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00008775 Py_DECREF(x);
8776 return -1;
8777 }
8778 *result = x;
8779 return 0;
8780 }
8781 else if (PyUnicode_Check(x)) {
8782 *result = x;
8783 return 0;
8784 }
8785 else {
8786 /* wrong return value */
8787 PyErr_SetString(PyExc_TypeError,
8788 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00008789 Py_DECREF(x);
8790 return -1;
8791 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008792}
Victor Stinner1194ea02014-04-04 19:37:40 +02008793
8794/* lookup the character, write the result into the writer.
8795 Return 1 if the result was written into the writer, return 0 if the mapping
8796 was undefined, raise an exception return -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008797static int
Victor Stinner1194ea02014-04-04 19:37:40 +02008798charmaptranslate_output(Py_UCS4 ch, PyObject *mapping,
8799 _PyUnicodeWriter *writer)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008800{
Victor Stinner1194ea02014-04-04 19:37:40 +02008801 PyObject *item;
8802
8803 if (charmaptranslate_lookup(ch, mapping, &item))
Benjamin Peterson29060642009-01-31 22:14:21 +00008804 return -1;
Victor Stinner1194ea02014-04-04 19:37:40 +02008805
8806 if (item == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008807 /* not found => default to 1:1 mapping */
Victor Stinner1194ea02014-04-04 19:37:40 +02008808 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008809 return -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00008810 }
Victor Stinner1194ea02014-04-04 19:37:40 +02008811 return 1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008812 }
Victor Stinner1194ea02014-04-04 19:37:40 +02008813
8814 if (item == Py_None) {
8815 Py_DECREF(item);
8816 return 0;
8817 }
8818
8819 if (PyLong_Check(item)) {
Victor Stinner4ff33af2014-04-05 11:56:37 +02008820 long ch = (Py_UCS4)PyLong_AS_LONG(item);
8821 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
8822 used it */
Victor Stinner1194ea02014-04-04 19:37:40 +02008823 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
8824 Py_DECREF(item);
8825 return -1;
8826 }
8827 Py_DECREF(item);
8828 return 1;
8829 }
8830
8831 if (!PyUnicode_Check(item)) {
8832 Py_DECREF(item);
Benjamin Peterson29060642009-01-31 22:14:21 +00008833 return -1;
Victor Stinner1194ea02014-04-04 19:37:40 +02008834 }
8835
8836 if (_PyUnicodeWriter_WriteStr(writer, item) < 0) {
8837 Py_DECREF(item);
8838 return -1;
8839 }
8840
8841 Py_DECREF(item);
8842 return 1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008843}
8844
Victor Stinner89a76ab2014-04-05 11:44:04 +02008845static int
8846unicode_fast_translate_lookup(PyObject *mapping, Py_UCS1 ch,
8847 Py_UCS1 *translate)
8848{
Benjamin Peterson1365de72014-04-07 20:15:41 -04008849 PyObject *item = NULL;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008850 int ret = 0;
8851
Victor Stinner89a76ab2014-04-05 11:44:04 +02008852 if (charmaptranslate_lookup(ch, mapping, &item)) {
8853 return -1;
8854 }
8855
8856 if (item == Py_None) {
Benjamin Peterson1365de72014-04-07 20:15:41 -04008857 /* deletion */
Victor Stinner872b2912014-04-05 14:27:07 +02008858 translate[ch] = 0xfe;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008859 }
Benjamin Peterson1365de72014-04-07 20:15:41 -04008860 else if (item == NULL) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02008861 /* not found => default to 1:1 mapping */
8862 translate[ch] = ch;
8863 return 1;
8864 }
Benjamin Peterson1365de72014-04-07 20:15:41 -04008865 else if (PyLong_Check(item)) {
Victor Stinner4dd25252014-04-08 09:14:21 +02008866 long replace = PyLong_AS_LONG(item);
Victor Stinner4ff33af2014-04-05 11:56:37 +02008867 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
8868 used it */
8869 if (127 < replace) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02008870 /* invalid character or character outside ASCII:
8871 skip the fast translate */
8872 goto exit;
8873 }
8874 translate[ch] = (Py_UCS1)replace;
8875 }
8876 else if (PyUnicode_Check(item)) {
8877 Py_UCS4 replace;
8878
8879 if (PyUnicode_READY(item) == -1) {
8880 Py_DECREF(item);
8881 return -1;
8882 }
8883 if (PyUnicode_GET_LENGTH(item) != 1)
8884 goto exit;
8885
8886 replace = PyUnicode_READ_CHAR(item, 0);
8887 if (replace > 127)
8888 goto exit;
8889 translate[ch] = (Py_UCS1)replace;
8890 }
8891 else {
Benjamin Peterson1365de72014-04-07 20:15:41 -04008892 /* not None, NULL, long or unicode */
Victor Stinner89a76ab2014-04-05 11:44:04 +02008893 goto exit;
8894 }
Victor Stinner89a76ab2014-04-05 11:44:04 +02008895 ret = 1;
8896
Benjamin Peterson1365de72014-04-07 20:15:41 -04008897 exit:
8898 Py_DECREF(item);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008899 return ret;
8900}
8901
8902/* Fast path for ascii => ascii translation. Return 1 if the whole string
8903 was translated into writer, return 0 if the input string was partially
8904 translated into writer, raise an exception and return -1 on error. */
8905static int
8906unicode_fast_translate(PyObject *input, PyObject *mapping,
Victor Stinner6c9aa8f2016-03-01 21:30:30 +01008907 _PyUnicodeWriter *writer, int ignore,
8908 Py_ssize_t *input_pos)
Victor Stinner89a76ab2014-04-05 11:44:04 +02008909{
Victor Stinner872b2912014-04-05 14:27:07 +02008910 Py_UCS1 ascii_table[128], ch, ch2;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008911 Py_ssize_t len;
8912 Py_UCS1 *in, *end, *out;
Victor Stinner872b2912014-04-05 14:27:07 +02008913 int res = 0;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008914
Victor Stinner89a76ab2014-04-05 11:44:04 +02008915 len = PyUnicode_GET_LENGTH(input);
8916
Victor Stinner872b2912014-04-05 14:27:07 +02008917 memset(ascii_table, 0xff, 128);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008918
8919 in = PyUnicode_1BYTE_DATA(input);
8920 end = in + len;
8921
8922 assert(PyUnicode_IS_ASCII(writer->buffer));
8923 assert(PyUnicode_GET_LENGTH(writer->buffer) == len);
8924 out = PyUnicode_1BYTE_DATA(writer->buffer);
8925
Victor Stinner872b2912014-04-05 14:27:07 +02008926 for (; in < end; in++) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02008927 ch = *in;
Victor Stinner872b2912014-04-05 14:27:07 +02008928 ch2 = ascii_table[ch];
Victor Stinner89a76ab2014-04-05 11:44:04 +02008929 if (ch2 == 0xff) {
Victor Stinner872b2912014-04-05 14:27:07 +02008930 int translate = unicode_fast_translate_lookup(mapping, ch,
8931 ascii_table);
8932 if (translate < 0)
Victor Stinner89a76ab2014-04-05 11:44:04 +02008933 return -1;
Victor Stinner872b2912014-04-05 14:27:07 +02008934 if (translate == 0)
8935 goto exit;
8936 ch2 = ascii_table[ch];
Victor Stinner89a76ab2014-04-05 11:44:04 +02008937 }
Victor Stinner872b2912014-04-05 14:27:07 +02008938 if (ch2 == 0xfe) {
8939 if (ignore)
8940 continue;
8941 goto exit;
8942 }
8943 assert(ch2 < 128);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008944 *out = ch2;
Victor Stinner872b2912014-04-05 14:27:07 +02008945 out++;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008946 }
Victor Stinner872b2912014-04-05 14:27:07 +02008947 res = 1;
8948
8949exit:
8950 writer->pos = out - PyUnicode_1BYTE_DATA(writer->buffer);
Victor Stinner6c9aa8f2016-03-01 21:30:30 +01008951 *input_pos = in - PyUnicode_1BYTE_DATA(input);
Victor Stinner872b2912014-04-05 14:27:07 +02008952 return res;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008953}
8954
Victor Stinner3222da22015-10-01 22:07:32 +02008955static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008956_PyUnicode_TranslateCharmap(PyObject *input,
8957 PyObject *mapping,
8958 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008959{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008960 /* input object */
Victor Stinner1194ea02014-04-04 19:37:40 +02008961 char *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008962 Py_ssize_t size, i;
8963 int kind;
8964 /* output buffer */
Victor Stinner1194ea02014-04-04 19:37:40 +02008965 _PyUnicodeWriter writer;
8966 /* error handler */
Serhiy Storchakae2f92de2017-11-11 13:06:26 +02008967 const char *reason = "character maps to <undefined>";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008968 PyObject *errorHandler = NULL;
8969 PyObject *exc = NULL;
Victor Stinner1194ea02014-04-04 19:37:40 +02008970 int ignore;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008971 int res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008972
Guido van Rossumd57fd912000-03-10 22:53:23 +00008973 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008974 PyErr_BadArgument();
8975 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008976 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008977
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008978 if (PyUnicode_READY(input) == -1)
8979 return NULL;
Victor Stinner1194ea02014-04-04 19:37:40 +02008980 data = (char*)PyUnicode_DATA(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008981 kind = PyUnicode_KIND(input);
8982 size = PyUnicode_GET_LENGTH(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008983
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03008984 if (size == 0)
8985 return PyUnicode_FromObject(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008986
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008987 /* allocate enough for a simple 1:1 translation without
8988 replacements, if we need more, we'll resize */
Victor Stinner1194ea02014-04-04 19:37:40 +02008989 _PyUnicodeWriter_Init(&writer);
8990 if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008991 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008992
Victor Stinner872b2912014-04-05 14:27:07 +02008993 ignore = (errors != NULL && strcmp(errors, "ignore") == 0);
8994
Victor Stinner33798672016-03-01 21:59:58 +01008995 if (PyUnicode_READY(input) == -1)
Victor Stinner89a76ab2014-04-05 11:44:04 +02008996 return NULL;
Victor Stinner33798672016-03-01 21:59:58 +01008997 if (PyUnicode_IS_ASCII(input)) {
8998 res = unicode_fast_translate(input, mapping, &writer, ignore, &i);
8999 if (res < 0) {
9000 _PyUnicodeWriter_Dealloc(&writer);
9001 return NULL;
9002 }
9003 if (res == 1)
9004 return _PyUnicodeWriter_Finish(&writer);
Victor Stinner89a76ab2014-04-05 11:44:04 +02009005 }
Victor Stinner33798672016-03-01 21:59:58 +01009006 else {
9007 i = 0;
9008 }
Victor Stinner89a76ab2014-04-05 11:44:04 +02009009
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009010 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009011 /* try to encode it */
Victor Stinner1194ea02014-04-04 19:37:40 +02009012 int translate;
9013 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
9014 Py_ssize_t newpos;
9015 /* startpos for collecting untranslatable chars */
9016 Py_ssize_t collstart;
9017 Py_ssize_t collend;
Victor Stinner1194ea02014-04-04 19:37:40 +02009018 Py_UCS4 ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009019
Victor Stinner1194ea02014-04-04 19:37:40 +02009020 ch = PyUnicode_READ(kind, data, i);
9021 translate = charmaptranslate_output(ch, mapping, &writer);
9022 if (translate < 0)
9023 goto onError;
9024
9025 if (translate != 0) {
9026 /* it worked => adjust input pointer */
9027 ++i;
9028 continue;
9029 }
9030
9031 /* untranslatable character */
9032 collstart = i;
9033 collend = i+1;
9034
9035 /* find all untranslatable characters */
9036 while (collend < size) {
9037 PyObject *x;
9038 ch = PyUnicode_READ(kind, data, collend);
9039 if (charmaptranslate_lookup(ch, mapping, &x))
Benjamin Peterson14339b62009-01-31 16:36:08 +00009040 goto onError;
Victor Stinner1194ea02014-04-04 19:37:40 +02009041 Py_XDECREF(x);
9042 if (x != Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00009043 break;
Victor Stinner1194ea02014-04-04 19:37:40 +02009044 ++collend;
9045 }
9046
9047 if (ignore) {
9048 i = collend;
9049 }
9050 else {
9051 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
9052 reason, input, &exc,
9053 collstart, collend, &newpos);
9054 if (repunicode == NULL)
9055 goto onError;
9056 if (_PyUnicodeWriter_WriteStr(&writer, repunicode) < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009057 Py_DECREF(repunicode);
Victor Stinner1194ea02014-04-04 19:37:40 +02009058 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009059 }
Victor Stinner1194ea02014-04-04 19:37:40 +02009060 Py_DECREF(repunicode);
9061 i = newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009062 }
9063 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009064 Py_XDECREF(exc);
9065 Py_XDECREF(errorHandler);
Victor Stinner1194ea02014-04-04 19:37:40 +02009066 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009067
Benjamin Peterson29060642009-01-31 22:14:21 +00009068 onError:
Victor Stinner1194ea02014-04-04 19:37:40 +02009069 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009070 Py_XDECREF(exc);
9071 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009072 return NULL;
9073}
9074
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009075/* Deprecated. Use PyUnicode_Translate instead. */
9076PyObject *
9077PyUnicode_TranslateCharmap(const Py_UNICODE *p,
9078 Py_ssize_t size,
9079 PyObject *mapping,
9080 const char *errors)
9081{
Christian Heimes5f520f42012-09-11 14:03:25 +02009082 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02009083 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009084 if (!unicode)
9085 return NULL;
Christian Heimes5f520f42012-09-11 14:03:25 +02009086 result = _PyUnicode_TranslateCharmap(unicode, mapping, errors);
9087 Py_DECREF(unicode);
9088 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009089}
9090
Alexander Belopolsky40018472011-02-26 01:02:56 +00009091PyObject *
9092PyUnicode_Translate(PyObject *str,
9093 PyObject *mapping,
9094 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009095{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009096 if (ensure_unicode(str) < 0)
Christian Heimes5f520f42012-09-11 14:03:25 +02009097 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009098 return _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009099}
Tim Petersced69f82003-09-16 20:30:58 +00009100
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009101PyObject *
9102_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
9103{
9104 if (!PyUnicode_Check(unicode)) {
9105 PyErr_BadInternalCall();
9106 return NULL;
9107 }
9108 if (PyUnicode_READY(unicode) == -1)
9109 return NULL;
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009110 if (PyUnicode_IS_ASCII(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009111 /* If the string is already ASCII, just return the same string */
9112 Py_INCREF(unicode);
9113 return unicode;
9114 }
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009115
9116 Py_ssize_t len = PyUnicode_GET_LENGTH(unicode);
9117 PyObject *result = PyUnicode_New(len, 127);
9118 if (result == NULL) {
9119 return NULL;
9120 }
9121
9122 Py_UCS1 *out = PyUnicode_1BYTE_DATA(result);
9123 int kind = PyUnicode_KIND(unicode);
9124 const void *data = PyUnicode_DATA(unicode);
9125 Py_ssize_t i;
9126 for (i = 0; i < len; ++i) {
9127 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9128 if (ch < 127) {
9129 out[i] = ch;
9130 }
9131 else if (Py_UNICODE_ISSPACE(ch)) {
9132 out[i] = ' ';
9133 }
9134 else {
9135 int decimal = Py_UNICODE_TODECIMAL(ch);
9136 if (decimal < 0) {
9137 out[i] = '?';
INADA Naoki16dfca42018-07-14 12:06:43 +09009138 out[i+1] = '\0';
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009139 _PyUnicode_LENGTH(result) = i + 1;
9140 break;
9141 }
9142 out[i] = '0' + decimal;
9143 }
9144 }
9145
INADA Naoki16dfca42018-07-14 12:06:43 +09009146 assert(_PyUnicode_CheckConsistency(result, 1));
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009147 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009148}
9149
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009150PyObject *
9151PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
9152 Py_ssize_t length)
9153{
Victor Stinnerf0124502011-11-21 23:12:56 +01009154 PyObject *decimal;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009155 Py_ssize_t i;
Victor Stinnerf0124502011-11-21 23:12:56 +01009156 Py_UCS4 maxchar;
9157 enum PyUnicode_Kind kind;
9158 void *data;
9159
Victor Stinner99d7ad02012-02-22 13:37:39 +01009160 maxchar = 127;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009161 for (i = 0; i < length; i++) {
Victor Stinner12174a52014-08-15 23:17:38 +02009162 Py_UCS4 ch = s[i];
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009163 if (ch > 127) {
9164 int decimal = Py_UNICODE_TODECIMAL(ch);
9165 if (decimal >= 0)
Victor Stinnerf0124502011-11-21 23:12:56 +01009166 ch = '0' + decimal;
Benjamin Peterson7e303732013-06-10 09:19:46 -07009167 maxchar = Py_MAX(maxchar, ch);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009168 }
9169 }
Victor Stinnerf0124502011-11-21 23:12:56 +01009170
9171 /* Copy to a new string */
9172 decimal = PyUnicode_New(length, maxchar);
9173 if (decimal == NULL)
9174 return decimal;
9175 kind = PyUnicode_KIND(decimal);
9176 data = PyUnicode_DATA(decimal);
9177 /* Iterate over code points */
9178 for (i = 0; i < length; i++) {
Victor Stinner12174a52014-08-15 23:17:38 +02009179 Py_UCS4 ch = s[i];
Victor Stinnerf0124502011-11-21 23:12:56 +01009180 if (ch > 127) {
9181 int decimal = Py_UNICODE_TODECIMAL(ch);
9182 if (decimal >= 0)
9183 ch = '0' + decimal;
9184 }
9185 PyUnicode_WRITE(kind, data, i, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009186 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01009187 return unicode_result(decimal);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009188}
Guido van Rossum9e896b32000-04-05 20:11:21 +00009189/* --- Decimal Encoder ---------------------------------------------------- */
9190
Alexander Belopolsky40018472011-02-26 01:02:56 +00009191int
9192PyUnicode_EncodeDecimal(Py_UNICODE *s,
9193 Py_ssize_t length,
9194 char *output,
9195 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00009196{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01009197 PyObject *unicode;
Victor Stinner6345be92011-11-25 20:09:01 +01009198 Py_ssize_t i;
Victor Stinner42bf7752011-11-21 22:52:58 +01009199 enum PyUnicode_Kind kind;
9200 void *data;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009201
9202 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009203 PyErr_BadArgument();
9204 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009205 }
9206
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02009207 unicode = PyUnicode_FromWideChar(s, length);
Victor Stinner42bf7752011-11-21 22:52:58 +01009208 if (unicode == NULL)
9209 return -1;
9210
Victor Stinner42bf7752011-11-21 22:52:58 +01009211 kind = PyUnicode_KIND(unicode);
9212 data = PyUnicode_DATA(unicode);
9213
Victor Stinnerb84d7232011-11-22 01:50:07 +01009214 for (i=0; i < length; ) {
Victor Stinner6345be92011-11-25 20:09:01 +01009215 PyObject *exc;
9216 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00009217 int decimal;
Victor Stinner6345be92011-11-25 20:09:01 +01009218 Py_ssize_t startpos;
9219
9220 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +00009221
Benjamin Peterson29060642009-01-31 22:14:21 +00009222 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009223 *output++ = ' ';
Victor Stinnerb84d7232011-11-22 01:50:07 +01009224 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009225 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009226 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009227 decimal = Py_UNICODE_TODECIMAL(ch);
9228 if (decimal >= 0) {
9229 *output++ = '0' + decimal;
Victor Stinnerb84d7232011-11-22 01:50:07 +01009230 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009231 continue;
9232 }
9233 if (0 < ch && ch < 256) {
9234 *output++ = (char)ch;
Victor Stinnerb84d7232011-11-22 01:50:07 +01009235 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009236 continue;
9237 }
Victor Stinner6345be92011-11-25 20:09:01 +01009238
Victor Stinner42bf7752011-11-21 22:52:58 +01009239 startpos = i;
Victor Stinner6345be92011-11-25 20:09:01 +01009240 exc = NULL;
9241 raise_encode_exception(&exc, "decimal", unicode,
9242 startpos, startpos+1,
9243 "invalid decimal Unicode string");
9244 Py_XDECREF(exc);
9245 Py_DECREF(unicode);
9246 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009247 }
9248 /* 0-terminate the output string */
9249 *output++ = '\0';
Victor Stinner42bf7752011-11-21 22:52:58 +01009250 Py_DECREF(unicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00009251 return 0;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009252}
9253
Guido van Rossumd57fd912000-03-10 22:53:23 +00009254/* --- Helpers ------------------------------------------------------------ */
9255
/* Helper macro to fix up start/end slice values in place.

   `end` is clamped to `len`; a negative `end` or `start` is taken relative
   to `len` and clamped to 0 if it is still negative afterwards.

   `start` and `end` must be modifiable lvalues. All three arguments may be
   evaluated more than once, so they must not have side effects. The body
   is wrapped in do { } while (0) so that the macro behaves as a single
   statement (safe in an unbraced if/else); invoke it with a trailing
   semicolon. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len)) {                    \
            (end) = (len);                      \
        }                                       \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0) {                    \
                (end) = 0;                      \
            }                                   \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0) {                  \
                (start) = 0;                    \
            }                                   \
        }                                       \
    } while (0)
9270
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009271static Py_ssize_t
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009272any_find_slice(PyObject* s1, PyObject* s2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009273 Py_ssize_t start,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009274 Py_ssize_t end,
9275 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009276{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009277 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009278 void *buf1, *buf2;
9279 Py_ssize_t len1, len2, result;
9280
9281 kind1 = PyUnicode_KIND(s1);
9282 kind2 = PyUnicode_KIND(s2);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009283 if (kind1 < kind2)
9284 return -1;
9285
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009286 len1 = PyUnicode_GET_LENGTH(s1);
9287 len2 = PyUnicode_GET_LENGTH(s2);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009288 ADJUST_INDICES(start, end, len1);
9289 if (end - start < len2)
9290 return -1;
9291
9292 buf1 = PyUnicode_DATA(s1);
9293 buf2 = PyUnicode_DATA(s2);
9294 if (len2 == 1) {
9295 Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
9296 result = findchar((const char *)buf1 + kind1*start,
9297 kind1, end - start, ch, direction);
9298 if (result == -1)
9299 return -1;
9300 else
9301 return start + result;
9302 }
9303
9304 if (kind2 != kind1) {
9305 buf2 = _PyUnicode_AsKind(s2, kind1);
9306 if (!buf2)
9307 return -2;
9308 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009309
Victor Stinner794d5672011-10-10 03:21:36 +02009310 if (direction > 0) {
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009311 switch (kind1) {
Victor Stinner794d5672011-10-10 03:21:36 +02009312 case PyUnicode_1BYTE_KIND:
9313 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9314 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
9315 else
9316 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
9317 break;
9318 case PyUnicode_2BYTE_KIND:
9319 result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
9320 break;
9321 case PyUnicode_4BYTE_KIND:
9322 result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
9323 break;
9324 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07009325 Py_UNREACHABLE();
Victor Stinner794d5672011-10-10 03:21:36 +02009326 }
9327 }
9328 else {
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009329 switch (kind1) {
Victor Stinner794d5672011-10-10 03:21:36 +02009330 case PyUnicode_1BYTE_KIND:
9331 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9332 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
9333 else
9334 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9335 break;
9336 case PyUnicode_2BYTE_KIND:
9337 result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9338 break;
9339 case PyUnicode_4BYTE_KIND:
9340 result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9341 break;
9342 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07009343 Py_UNREACHABLE();
Victor Stinner794d5672011-10-10 03:21:36 +02009344 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009345 }
9346
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009347 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009348 PyMem_Free(buf2);
9349
9350 return result;
9351}
9352
Victor Stinner59423e32018-11-26 13:40:01 +01009353/* _PyUnicode_InsertThousandsGrouping() helper functions */
9354#include "stringlib/localeutil.h"
9355
9356/**
9357 * InsertThousandsGrouping:
9358 * @writer: Unicode writer.
9359 * @n_buffer: Number of characters in @buffer.
9360 * @digits: Digits we're reading from. If count is non-NULL, this is unused.
9361 * @d_pos: Start of digits string.
9362 * @n_digits: The number of digits in the string, in which we want
9363 * to put the grouping chars.
9364 * @min_width: The minimum width of the digits in the output string.
9365 * Output will be zero-padded on the left to fill.
9366 * @grouping: see definition in localeconv().
9367 * @thousands_sep: see definition in localeconv().
9368 *
9369 * There are 2 modes: counting and filling. If @writer is NULL,
9370 * we are in counting mode, else filling mode.
9371 * If counting, the required buffer size is returned.
9372 * If filling, we know the buffer will be large enough, so we don't
9373 * need to pass in the buffer size.
9374 * Inserts thousand grouping characters (as defined by grouping and
9375 * thousands_sep) into @writer.
9376 *
9377 * Return value: -1 on error, number of characters otherwise.
9378 **/
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009379Py_ssize_t
Victor Stinner41a863c2012-02-24 00:37:51 +01009380_PyUnicode_InsertThousandsGrouping(
Victor Stinner59423e32018-11-26 13:40:01 +01009381 _PyUnicodeWriter *writer,
Victor Stinner41a863c2012-02-24 00:37:51 +01009382 Py_ssize_t n_buffer,
Victor Stinner59423e32018-11-26 13:40:01 +01009383 PyObject *digits,
9384 Py_ssize_t d_pos,
9385 Py_ssize_t n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009386 Py_ssize_t min_width,
Victor Stinner59423e32018-11-26 13:40:01 +01009387 const char *grouping,
9388 PyObject *thousands_sep,
Victor Stinner41a863c2012-02-24 00:37:51 +01009389 Py_UCS4 *maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009390{
Xtreak3f7983a2019-01-07 20:39:14 +05309391 min_width = Py_MAX(0, min_width);
Victor Stinner59423e32018-11-26 13:40:01 +01009392 if (writer) {
9393 assert(digits != NULL);
9394 assert(maxchar == NULL);
Victor Stinner41a863c2012-02-24 00:37:51 +01009395 }
9396 else {
Victor Stinner59423e32018-11-26 13:40:01 +01009397 assert(digits == NULL);
9398 assert(maxchar != NULL);
Victor Stinner41a863c2012-02-24 00:37:51 +01009399 }
Victor Stinner59423e32018-11-26 13:40:01 +01009400 assert(0 <= d_pos);
9401 assert(0 <= n_digits);
Victor Stinner59423e32018-11-26 13:40:01 +01009402 assert(grouping != NULL);
9403
9404 if (digits != NULL) {
9405 if (PyUnicode_READY(digits) == -1) {
9406 return -1;
Victor Stinner90f50d42012-02-24 01:44:47 +01009407 }
Victor Stinner59423e32018-11-26 13:40:01 +01009408 }
9409 if (PyUnicode_READY(thousands_sep) == -1) {
9410 return -1;
Victor Stinner41a863c2012-02-24 00:37:51 +01009411 }
9412
Victor Stinner59423e32018-11-26 13:40:01 +01009413 Py_ssize_t count = 0;
9414 Py_ssize_t n_zeros;
9415 int loop_broken = 0;
9416 int use_separator = 0; /* First time through, don't append the
9417 separator. They only go between
9418 groups. */
9419 Py_ssize_t buffer_pos;
9420 Py_ssize_t digits_pos;
9421 Py_ssize_t len;
9422 Py_ssize_t n_chars;
9423 Py_ssize_t remaining = n_digits; /* Number of chars remaining to
9424 be looked at */
9425 /* A generator that returns all of the grouping widths, until it
9426 returns 0. */
9427 GroupGenerator groupgen;
9428 GroupGenerator_init(&groupgen, grouping);
9429 const Py_ssize_t thousands_sep_len = PyUnicode_GET_LENGTH(thousands_sep);
9430
9431 /* if digits are not grouped, thousands separator
9432 should be an empty string */
9433 assert(!(grouping[0] == CHAR_MAX && thousands_sep_len != 0));
9434
9435 digits_pos = d_pos + n_digits;
9436 if (writer) {
9437 buffer_pos = writer->pos + n_buffer;
9438 assert(buffer_pos <= PyUnicode_GET_LENGTH(writer->buffer));
9439 assert(digits_pos <= PyUnicode_GET_LENGTH(digits));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009440 }
Victor Stinner59423e32018-11-26 13:40:01 +01009441 else {
9442 buffer_pos = n_buffer;
Victor Stinner90f50d42012-02-24 01:44:47 +01009443 }
Victor Stinner59423e32018-11-26 13:40:01 +01009444
9445 if (!writer) {
Victor Stinner41a863c2012-02-24 00:37:51 +01009446 *maxchar = 127;
Victor Stinner41a863c2012-02-24 00:37:51 +01009447 }
Victor Stinner59423e32018-11-26 13:40:01 +01009448
9449 while ((len = GroupGenerator_next(&groupgen)) > 0) {
9450 len = Py_MIN(len, Py_MAX(Py_MAX(remaining, min_width), 1));
9451 n_zeros = Py_MAX(0, len - remaining);
9452 n_chars = Py_MAX(0, Py_MIN(remaining, len));
9453
9454 /* Use n_zero zero's and n_chars chars */
9455
9456 /* Count only, don't do anything. */
9457 count += (use_separator ? thousands_sep_len : 0) + n_zeros + n_chars;
9458
9459 /* Copy into the writer. */
9460 InsertThousandsGrouping_fill(writer, &buffer_pos,
9461 digits, &digits_pos,
9462 n_chars, n_zeros,
9463 use_separator ? thousands_sep : NULL,
9464 thousands_sep_len, maxchar);
9465
9466 /* Use a separator next time. */
9467 use_separator = 1;
9468
9469 remaining -= n_chars;
9470 min_width -= len;
9471
9472 if (remaining <= 0 && min_width <= 0) {
9473 loop_broken = 1;
9474 break;
9475 }
9476 min_width -= thousands_sep_len;
9477 }
9478 if (!loop_broken) {
9479 /* We left the loop without using a break statement. */
9480
9481 len = Py_MAX(Py_MAX(remaining, min_width), 1);
9482 n_zeros = Py_MAX(0, len - remaining);
9483 n_chars = Py_MAX(0, Py_MIN(remaining, len));
9484
9485 /* Use n_zero zero's and n_chars chars */
9486 count += (use_separator ? thousands_sep_len : 0) + n_zeros + n_chars;
9487
9488 /* Copy into the writer. */
9489 InsertThousandsGrouping_fill(writer, &buffer_pos,
9490 digits, &digits_pos,
9491 n_chars, n_zeros,
9492 use_separator ? thousands_sep : NULL,
9493 thousands_sep_len, maxchar);
9494 }
9495 return count;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009496}
9497
9498
Alexander Belopolsky40018472011-02-26 01:02:56 +00009499Py_ssize_t
9500PyUnicode_Count(PyObject *str,
9501 PyObject *substr,
9502 Py_ssize_t start,
9503 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009504{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009505 Py_ssize_t result;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009506 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009507 void *buf1 = NULL, *buf2 = NULL;
9508 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00009509
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009510 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009511 return -1;
Tim Petersced69f82003-09-16 20:30:58 +00009512
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009513 kind1 = PyUnicode_KIND(str);
9514 kind2 = PyUnicode_KIND(substr);
9515 if (kind1 < kind2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009516 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009517
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009518 len1 = PyUnicode_GET_LENGTH(str);
9519 len2 = PyUnicode_GET_LENGTH(substr);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009520 ADJUST_INDICES(start, end, len1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009521 if (end - start < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009522 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009523
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009524 buf1 = PyUnicode_DATA(str);
9525 buf2 = PyUnicode_DATA(substr);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009526 if (kind2 != kind1) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009527 buf2 = _PyUnicode_AsKind(substr, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009528 if (!buf2)
9529 goto onError;
9530 }
9531
9532 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009533 case PyUnicode_1BYTE_KIND:
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009534 if (PyUnicode_IS_ASCII(str) && PyUnicode_IS_ASCII(substr))
Victor Stinnerc3cec782011-10-05 21:24:08 +02009535 result = asciilib_count(
9536 ((Py_UCS1*)buf1) + start, end - start,
9537 buf2, len2, PY_SSIZE_T_MAX
9538 );
9539 else
9540 result = ucs1lib_count(
9541 ((Py_UCS1*)buf1) + start, end - start,
9542 buf2, len2, PY_SSIZE_T_MAX
9543 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009544 break;
9545 case PyUnicode_2BYTE_KIND:
9546 result = ucs2lib_count(
9547 ((Py_UCS2*)buf1) + start, end - start,
9548 buf2, len2, PY_SSIZE_T_MAX
9549 );
9550 break;
9551 case PyUnicode_4BYTE_KIND:
9552 result = ucs4lib_count(
9553 ((Py_UCS4*)buf1) + start, end - start,
9554 buf2, len2, PY_SSIZE_T_MAX
9555 );
9556 break;
9557 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07009558 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009559 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009560
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009561 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009562 PyMem_Free(buf2);
9563
Guido van Rossumd57fd912000-03-10 22:53:23 +00009564 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009565 onError:
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009566 if (kind2 != kind1 && buf2)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009567 PyMem_Free(buf2);
9568 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009569}
9570
Alexander Belopolsky40018472011-02-26 01:02:56 +00009571Py_ssize_t
9572PyUnicode_Find(PyObject *str,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009573 PyObject *substr,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009574 Py_ssize_t start,
9575 Py_ssize_t end,
9576 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009577{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009578 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009579 return -2;
Tim Petersced69f82003-09-16 20:30:58 +00009580
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009581 return any_find_slice(str, substr, start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009582}
9583
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009584Py_ssize_t
9585PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
9586 Py_ssize_t start, Py_ssize_t end,
9587 int direction)
9588{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009589 int kind;
Xiang Zhangb2110682016-12-20 22:52:33 +08009590 Py_ssize_t len, result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009591 if (PyUnicode_READY(str) == -1)
9592 return -2;
Xiang Zhangb2110682016-12-20 22:52:33 +08009593 len = PyUnicode_GET_LENGTH(str);
9594 ADJUST_INDICES(start, end, len);
9595 if (end - start < 1)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009596 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009597 kind = PyUnicode_KIND(str);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009598 result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
9599 kind, end-start, ch, direction);
9600 if (result == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009601 return -1;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009602 else
9603 return start + result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009604}
9605
Alexander Belopolsky40018472011-02-26 01:02:56 +00009606static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009607tailmatch(PyObject *self,
9608 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009609 Py_ssize_t start,
9610 Py_ssize_t end,
9611 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009612{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009613 int kind_self;
9614 int kind_sub;
9615 void *data_self;
9616 void *data_sub;
9617 Py_ssize_t offset;
9618 Py_ssize_t i;
9619 Py_ssize_t end_sub;
9620
9621 if (PyUnicode_READY(self) == -1 ||
9622 PyUnicode_READY(substring) == -1)
Victor Stinner18aa4472013-01-03 03:18:09 +01009623 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009624
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009625 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
9626 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009627 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00009628 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009629
Serhiy Storchakad4ea03c2015-05-31 09:15:51 +03009630 if (PyUnicode_GET_LENGTH(substring) == 0)
9631 return 1;
9632
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009633 kind_self = PyUnicode_KIND(self);
9634 data_self = PyUnicode_DATA(self);
9635 kind_sub = PyUnicode_KIND(substring);
9636 data_sub = PyUnicode_DATA(substring);
9637 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
9638
9639 if (direction > 0)
9640 offset = end;
9641 else
9642 offset = start;
9643
9644 if (PyUnicode_READ(kind_self, data_self, offset) ==
9645 PyUnicode_READ(kind_sub, data_sub, 0) &&
9646 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
9647 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
9648 /* If both are of the same kind, memcmp is sufficient */
9649 if (kind_self == kind_sub) {
9650 return ! memcmp((char *)data_self +
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009651 (offset * PyUnicode_KIND(substring)),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009652 data_sub,
9653 PyUnicode_GET_LENGTH(substring) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009654 PyUnicode_KIND(substring));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009655 }
Martin Pantere26da7c2016-06-02 10:07:09 +00009656 /* otherwise we have to compare each character by first accessing it */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009657 else {
9658 /* We do not need to compare 0 and len(substring)-1 because
9659 the if statement above ensured already that they are equal
9660 when we end up here. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009661 for (i = 1; i < end_sub; ++i) {
9662 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
9663 PyUnicode_READ(kind_sub, data_sub, i))
9664 return 0;
9665 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009666 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009667 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009668 }
9669
9670 return 0;
9671}
9672
Alexander Belopolsky40018472011-02-26 01:02:56 +00009673Py_ssize_t
9674PyUnicode_Tailmatch(PyObject *str,
9675 PyObject *substr,
9676 Py_ssize_t start,
9677 Py_ssize_t end,
9678 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009679{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009680 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009681 return -1;
Tim Petersced69f82003-09-16 20:30:58 +00009682
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009683 return tailmatch(str, substr, start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009684}
9685
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009686static PyObject *
9687ascii_upper_or_lower(PyObject *self, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009688{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009689 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9690 char *resdata, *data = PyUnicode_DATA(self);
9691 PyObject *res;
Tim Petersced69f82003-09-16 20:30:58 +00009692
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009693 res = PyUnicode_New(len, 127);
9694 if (res == NULL)
9695 return NULL;
9696 resdata = PyUnicode_DATA(res);
9697 if (lower)
9698 _Py_bytes_lower(resdata, data, len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009699 else
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009700 _Py_bytes_upper(resdata, data, len);
9701 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009702}
9703
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009704static Py_UCS4
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009705handle_capital_sigma(int kind, void *data, Py_ssize_t length, Py_ssize_t i)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009706{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009707 Py_ssize_t j;
9708 int final_sigma;
Victor Stinner0c39b1b2015-03-18 15:02:06 +01009709 Py_UCS4 c = 0; /* initialize to prevent gcc warning */
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009710 /* U+03A3 is in the Final_Sigma context when, it is found like this:
Tim Petersced69f82003-09-16 20:30:58 +00009711
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009712 \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased})
9713
9714 where ! is a negation and \p{xxx} is a character with property xxx.
9715 */
9716 for (j = i - 1; j >= 0; j--) {
9717 c = PyUnicode_READ(kind, data, j);
9718 if (!_PyUnicode_IsCaseIgnorable(c))
9719 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009720 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009721 final_sigma = j >= 0 && _PyUnicode_IsCased(c);
9722 if (final_sigma) {
9723 for (j = i + 1; j < length; j++) {
9724 c = PyUnicode_READ(kind, data, j);
9725 if (!_PyUnicode_IsCaseIgnorable(c))
9726 break;
9727 }
9728 final_sigma = j == length || !_PyUnicode_IsCased(c);
9729 }
9730 return (final_sigma) ? 0x3C2 : 0x3C3;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009731}
9732
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009733static int
9734lower_ucs4(int kind, void *data, Py_ssize_t length, Py_ssize_t i,
9735 Py_UCS4 c, Py_UCS4 *mapped)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009736{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009737 /* Obscure special case. */
9738 if (c == 0x3A3) {
9739 mapped[0] = handle_capital_sigma(kind, data, length, i);
9740 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009741 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009742 return _PyUnicode_ToLowerFull(c, mapped);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009743}
9744
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009745static Py_ssize_t
9746do_capitalize(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009747{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009748 Py_ssize_t i, k = 0;
9749 int n_res, j;
9750 Py_UCS4 c, mapped[3];
Tim Petersced69f82003-09-16 20:30:58 +00009751
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009752 c = PyUnicode_READ(kind, data, 0);
Kingsley Mb015fc82019-04-12 16:35:39 +01009753 n_res = _PyUnicode_ToTitleFull(c, mapped);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009754 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009755 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009756 res[k++] = mapped[j];
Guido van Rossumd57fd912000-03-10 22:53:23 +00009757 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009758 for (i = 1; i < length; i++) {
9759 c = PyUnicode_READ(kind, data, i);
9760 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9761 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009762 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009763 res[k++] = mapped[j];
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009764 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009765 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009766 return k;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009767}
9768
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009769static Py_ssize_t
9770do_swapcase(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) {
9771 Py_ssize_t i, k = 0;
9772
9773 for (i = 0; i < length; i++) {
9774 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9775 int n_res, j;
9776 if (Py_UNICODE_ISUPPER(c)) {
9777 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9778 }
9779 else if (Py_UNICODE_ISLOWER(c)) {
9780 n_res = _PyUnicode_ToUpperFull(c, mapped);
9781 }
9782 else {
9783 n_res = 1;
9784 mapped[0] = c;
9785 }
9786 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009787 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009788 res[k++] = mapped[j];
9789 }
9790 }
9791 return k;
9792}
9793
9794static Py_ssize_t
9795do_upper_or_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res,
9796 Py_UCS4 *maxchar, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009797{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009798 Py_ssize_t i, k = 0;
9799
9800 for (i = 0; i < length; i++) {
9801 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9802 int n_res, j;
9803 if (lower)
9804 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9805 else
9806 n_res = _PyUnicode_ToUpperFull(c, mapped);
9807 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009808 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009809 res[k++] = mapped[j];
9810 }
9811 }
9812 return k;
9813}
9814
9815static Py_ssize_t
9816do_upper(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9817{
9818 return do_upper_or_lower(kind, data, length, res, maxchar, 0);
9819}
9820
9821static Py_ssize_t
9822do_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9823{
9824 return do_upper_or_lower(kind, data, length, res, maxchar, 1);
9825}
9826
Benjamin Petersone51757f2012-01-12 21:10:29 -05009827static Py_ssize_t
Benjamin Petersond5890c82012-01-14 13:23:30 -05009828do_casefold(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9829{
9830 Py_ssize_t i, k = 0;
9831
9832 for (i = 0; i < length; i++) {
9833 Py_UCS4 c = PyUnicode_READ(kind, data, i);
9834 Py_UCS4 mapped[3];
9835 int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
9836 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009837 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersond5890c82012-01-14 13:23:30 -05009838 res[k++] = mapped[j];
9839 }
9840 }
9841 return k;
9842}
9843
9844static Py_ssize_t
Benjamin Petersone51757f2012-01-12 21:10:29 -05009845do_title(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9846{
9847 Py_ssize_t i, k = 0;
9848 int previous_is_cased;
9849
9850 previous_is_cased = 0;
9851 for (i = 0; i < length; i++) {
9852 const Py_UCS4 c = PyUnicode_READ(kind, data, i);
9853 Py_UCS4 mapped[3];
9854 int n_res, j;
9855
9856 if (previous_is_cased)
9857 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9858 else
9859 n_res = _PyUnicode_ToTitleFull(c, mapped);
9860
9861 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009862 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersone51757f2012-01-12 21:10:29 -05009863 res[k++] = mapped[j];
9864 }
9865
9866 previous_is_cased = _PyUnicode_IsCased(c);
9867 }
9868 return k;
9869}
9870
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009871static PyObject *
9872case_operation(PyObject *self,
9873 Py_ssize_t (*perform)(int, void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *))
9874{
9875 PyObject *res = NULL;
9876 Py_ssize_t length, newlength = 0;
9877 int kind, outkind;
9878 void *data, *outdata;
9879 Py_UCS4 maxchar = 0, *tmp, *tmpend;
9880
Benjamin Petersoneea48462012-01-16 14:28:50 -05009881 assert(PyUnicode_IS_READY(self));
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009882
9883 kind = PyUnicode_KIND(self);
9884 data = PyUnicode_DATA(self);
9885 length = PyUnicode_GET_LENGTH(self);
Antoine Pitrou4e334242014-10-15 23:14:53 +02009886 if ((size_t) length > PY_SSIZE_T_MAX / (3 * sizeof(Py_UCS4))) {
Benjamin Petersone1bd38c2014-10-15 11:47:36 -04009887 PyErr_SetString(PyExc_OverflowError, "string is too long");
9888 return NULL;
9889 }
Benjamin Peterson1e211ff2014-10-15 12:17:21 -04009890 tmp = PyMem_MALLOC(sizeof(Py_UCS4) * 3 * length);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009891 if (tmp == NULL)
9892 return PyErr_NoMemory();
9893 newlength = perform(kind, data, length, tmp, &maxchar);
9894 res = PyUnicode_New(newlength, maxchar);
9895 if (res == NULL)
9896 goto leave;
9897 tmpend = tmp + newlength;
9898 outdata = PyUnicode_DATA(res);
9899 outkind = PyUnicode_KIND(res);
9900 switch (outkind) {
9901 case PyUnicode_1BYTE_KIND:
9902 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata);
9903 break;
9904 case PyUnicode_2BYTE_KIND:
9905 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata);
9906 break;
9907 case PyUnicode_4BYTE_KIND:
9908 memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength);
9909 break;
9910 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07009911 Py_UNREACHABLE();
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009912 }
9913 leave:
9914 PyMem_FREE(tmp);
9915 return res;
9916}
9917
Tim Peters8ce9f162004-08-27 01:49:32 +00009918PyObject *
9919PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009920{
Serhiy Storchakaea525a22016-09-06 22:07:53 +03009921 PyObject *res;
9922 PyObject *fseq;
9923 Py_ssize_t seqlen;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009924 PyObject **items;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009925
Benjamin Peterson9743b2c2014-02-15 13:02:52 -05009926 fseq = PySequence_Fast(seq, "can only join an iterable");
Tim Peters05eba1f2004-08-27 21:32:02 +00009927 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009928 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00009929 }
9930
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009931 /* NOTE: the following code can't call back into Python code,
9932 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00009933 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009934
Serhiy Storchakaea525a22016-09-06 22:07:53 +03009935 items = PySequence_Fast_ITEMS(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00009936 seqlen = PySequence_Fast_GET_SIZE(fseq);
Serhiy Storchakaea525a22016-09-06 22:07:53 +03009937 res = _PyUnicode_JoinArray(separator, items, seqlen);
9938 Py_DECREF(fseq);
9939 return res;
9940}
9941
9942PyObject *
Serhiy Storchakaa5552f02017-12-15 13:11:11 +02009943_PyUnicode_JoinArray(PyObject *separator, PyObject *const *items, Py_ssize_t seqlen)
Serhiy Storchakaea525a22016-09-06 22:07:53 +03009944{
9945 PyObject *res = NULL; /* the result */
9946 PyObject *sep = NULL;
9947 Py_ssize_t seplen;
9948 PyObject *item;
9949 Py_ssize_t sz, i, res_offset;
9950 Py_UCS4 maxchar;
9951 Py_UCS4 item_maxchar;
9952 int use_memcpy;
9953 unsigned char *res_data = NULL, *sep_data = NULL;
9954 PyObject *last_obj;
9955 unsigned int kind = 0;
9956
Tim Peters05eba1f2004-08-27 21:32:02 +00009957 /* If empty sequence, return u"". */
9958 if (seqlen == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +02009959 _Py_RETURN_UNICODE_EMPTY();
Tim Peters05eba1f2004-08-27 21:32:02 +00009960 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009961
Tim Peters05eba1f2004-08-27 21:32:02 +00009962 /* If singleton sequence with an exact Unicode, return that. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009963 last_obj = NULL;
Victor Stinneracf47b82011-10-06 12:32:37 +02009964 if (seqlen == 1) {
9965 if (PyUnicode_CheckExact(items[0])) {
9966 res = items[0];
9967 Py_INCREF(res);
Victor Stinneracf47b82011-10-06 12:32:37 +02009968 return res;
9969 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009970 seplen = 0;
Victor Stinnerc6f0df72011-10-06 15:58:54 +02009971 maxchar = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00009972 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009973 else {
Victor Stinneracf47b82011-10-06 12:32:37 +02009974 /* Set up sep and seplen */
9975 if (separator == NULL) {
9976 /* fall back to a blank space separator */
9977 sep = PyUnicode_FromOrdinal(' ');
9978 if (!sep)
9979 goto onError;
Victor Stinnerdd077322011-10-07 17:02:31 +02009980 seplen = 1;
Victor Stinneracf47b82011-10-06 12:32:37 +02009981 maxchar = 32;
Tim Peters05eba1f2004-08-27 21:32:02 +00009982 }
Victor Stinneracf47b82011-10-06 12:32:37 +02009983 else {
9984 if (!PyUnicode_Check(separator)) {
9985 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02009986 "separator: expected str instance,"
9987 " %.80s found",
9988 Py_TYPE(separator)->tp_name);
Victor Stinneracf47b82011-10-06 12:32:37 +02009989 goto onError;
9990 }
9991 if (PyUnicode_READY(separator))
9992 goto onError;
9993 sep = separator;
9994 seplen = PyUnicode_GET_LENGTH(separator);
9995 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
9996 /* inc refcount to keep this code path symmetric with the
9997 above case of a blank separator */
9998 Py_INCREF(sep);
9999 }
Victor Stinnerdd077322011-10-07 17:02:31 +020010000 last_obj = sep;
Tim Peters05eba1f2004-08-27 21:32:02 +000010001 }
10002
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010003 /* There are at least two things to join, or else we have a subclass
10004 * of str in the sequence.
10005 * Do a pre-pass to figure out the total amount of space we'll
10006 * need (sz), and see whether all argument are strings.
10007 */
10008 sz = 0;
Victor Stinnerdd077322011-10-07 17:02:31 +020010009#ifdef Py_DEBUG
10010 use_memcpy = 0;
10011#else
10012 use_memcpy = 1;
10013#endif
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010014 for (i = 0; i < seqlen; i++) {
Xiang Zhangb0541f42017-01-10 10:52:00 +080010015 size_t add_sz;
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010016 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +000010017 if (!PyUnicode_Check(item)) {
10018 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020010019 "sequence item %zd: expected str instance,"
10020 " %.80s found",
10021 i, Py_TYPE(item)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000010022 goto onError;
10023 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010024 if (PyUnicode_READY(item) == -1)
10025 goto onError;
Xiang Zhangb0541f42017-01-10 10:52:00 +080010026 add_sz = PyUnicode_GET_LENGTH(item);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010027 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010028 maxchar = Py_MAX(maxchar, item_maxchar);
Xiang Zhangb0541f42017-01-10 10:52:00 +080010029 if (i != 0) {
10030 add_sz += seplen;
10031 }
10032 if (add_sz > (size_t)(PY_SSIZE_T_MAX - sz)) {
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010033 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010034 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010035 goto onError;
10036 }
Xiang Zhangb0541f42017-01-10 10:52:00 +080010037 sz += add_sz;
Victor Stinnerdd077322011-10-07 17:02:31 +020010038 if (use_memcpy && last_obj != NULL) {
10039 if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
10040 use_memcpy = 0;
10041 }
10042 last_obj = item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010043 }
Tim Petersced69f82003-09-16 20:30:58 +000010044
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010045 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010046 if (res == NULL)
10047 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +000010048
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010049 /* Catenate everything. */
Victor Stinnerdd077322011-10-07 17:02:31 +020010050#ifdef Py_DEBUG
10051 use_memcpy = 0;
10052#else
10053 if (use_memcpy) {
10054 res_data = PyUnicode_1BYTE_DATA(res);
10055 kind = PyUnicode_KIND(res);
10056 if (seplen != 0)
10057 sep_data = PyUnicode_1BYTE_DATA(sep);
10058 }
10059#endif
Victor Stinner4560f9c2013-04-14 18:56:46 +020010060 if (use_memcpy) {
10061 for (i = 0; i < seqlen; ++i) {
10062 Py_ssize_t itemlen;
10063 item = items[i];
10064
10065 /* Copy item, and maybe the separator. */
10066 if (i && seplen != 0) {
Christian Heimesf051e432016-09-13 20:22:02 +020010067 memcpy(res_data,
Victor Stinnerdd077322011-10-07 17:02:31 +020010068 sep_data,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010069 kind * seplen);
10070 res_data += kind * seplen;
Victor Stinnerdd077322011-10-07 17:02:31 +020010071 }
Victor Stinner4560f9c2013-04-14 18:56:46 +020010072
10073 itemlen = PyUnicode_GET_LENGTH(item);
10074 if (itemlen != 0) {
Christian Heimesf051e432016-09-13 20:22:02 +020010075 memcpy(res_data,
Victor Stinnerdd077322011-10-07 17:02:31 +020010076 PyUnicode_DATA(item),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010077 kind * itemlen);
10078 res_data += kind * itemlen;
Victor Stinnerdd077322011-10-07 17:02:31 +020010079 }
Victor Stinner4560f9c2013-04-14 18:56:46 +020010080 }
10081 assert(res_data == PyUnicode_1BYTE_DATA(res)
10082 + kind * PyUnicode_GET_LENGTH(res));
10083 }
10084 else {
10085 for (i = 0, res_offset = 0; i < seqlen; ++i) {
10086 Py_ssize_t itemlen;
10087 item = items[i];
10088
10089 /* Copy item, and maybe the separator. */
10090 if (i && seplen != 0) {
10091 _PyUnicode_FastCopyCharacters(res, res_offset, sep, 0, seplen);
10092 res_offset += seplen;
10093 }
10094
10095 itemlen = PyUnicode_GET_LENGTH(item);
10096 if (itemlen != 0) {
Victor Stinnerd3f08822012-05-29 12:57:52 +020010097 _PyUnicode_FastCopyCharacters(res, res_offset, item, 0, itemlen);
Victor Stinnerdd077322011-10-07 17:02:31 +020010098 res_offset += itemlen;
10099 }
Victor Stinner9ce5a832011-10-03 23:36:02 +020010100 }
Victor Stinnerdd077322011-10-07 17:02:31 +020010101 assert(res_offset == PyUnicode_GET_LENGTH(res));
Victor Stinner4560f9c2013-04-14 18:56:46 +020010102 }
Tim Peters8ce9f162004-08-27 01:49:32 +000010103
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010104 Py_XDECREF(sep);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010105 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010106 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010107
Benjamin Peterson29060642009-01-31 22:14:21 +000010108 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010109 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +000010110 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010111 return NULL;
10112}
10113
Victor Stinnerd3f08822012-05-29 12:57:52 +020010114void
10115_PyUnicode_FastFill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10116 Py_UCS4 fill_char)
10117{
10118 const enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
Victor Stinner163403a2018-11-27 12:41:17 +010010119 void *data = PyUnicode_DATA(unicode);
Victor Stinnerd3f08822012-05-29 12:57:52 +020010120 assert(PyUnicode_IS_READY(unicode));
10121 assert(unicode_modifiable(unicode));
10122 assert(fill_char <= PyUnicode_MAX_CHAR_VALUE(unicode));
10123 assert(start >= 0);
10124 assert(start + length <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner59423e32018-11-26 13:40:01 +010010125 unicode_fill(kind, data, fill_char, start, length);
Victor Stinnerd3f08822012-05-29 12:57:52 +020010126}
10127
Victor Stinner3fe55312012-01-04 00:33:50 +010010128Py_ssize_t
10129PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10130 Py_UCS4 fill_char)
10131{
10132 Py_ssize_t maxlen;
Victor Stinner3fe55312012-01-04 00:33:50 +010010133
10134 if (!PyUnicode_Check(unicode)) {
10135 PyErr_BadInternalCall();
10136 return -1;
10137 }
10138 if (PyUnicode_READY(unicode) == -1)
10139 return -1;
10140 if (unicode_check_modifiable(unicode))
10141 return -1;
10142
Victor Stinnerd3f08822012-05-29 12:57:52 +020010143 if (start < 0) {
10144 PyErr_SetString(PyExc_IndexError, "string index out of range");
10145 return -1;
10146 }
Victor Stinner3fe55312012-01-04 00:33:50 +010010147 if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) {
10148 PyErr_SetString(PyExc_ValueError,
10149 "fill character is bigger than "
10150 "the string maximum character");
10151 return -1;
10152 }
10153
10154 maxlen = PyUnicode_GET_LENGTH(unicode) - start;
10155 length = Py_MIN(maxlen, length);
10156 if (length <= 0)
10157 return 0;
10158
Victor Stinnerd3f08822012-05-29 12:57:52 +020010159 _PyUnicode_FastFill(unicode, start, length, fill_char);
Victor Stinner3fe55312012-01-04 00:33:50 +010010160 return length;
10161}
10162
Victor Stinner9310abb2011-10-05 00:59:23 +020010163static PyObject *
10164pad(PyObject *self,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010165 Py_ssize_t left,
10166 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010167 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010168{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010169 PyObject *u;
10170 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010171 int kind;
10172 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010173
10174 if (left < 0)
10175 left = 0;
10176 if (right < 0)
10177 right = 0;
10178
Victor Stinnerc4b49542011-12-11 22:44:26 +010010179 if (left == 0 && right == 0)
10180 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010181
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010182 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
10183 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +000010184 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
10185 return NULL;
10186 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010187 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010188 maxchar = Py_MAX(maxchar, fill);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010189 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010190 if (!u)
10191 return NULL;
10192
10193 kind = PyUnicode_KIND(u);
10194 data = PyUnicode_DATA(u);
10195 if (left)
Victor Stinner59423e32018-11-26 13:40:01 +010010196 unicode_fill(kind, data, fill, 0, left);
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010197 if (right)
Victor Stinner59423e32018-11-26 13:40:01 +010010198 unicode_fill(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinnerd3f08822012-05-29 12:57:52 +020010199 _PyUnicode_FastCopyCharacters(u, left, self, 0, _PyUnicode_LENGTH(self));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010200 assert(_PyUnicode_CheckConsistency(u, 1));
10201 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010202}
10203
Alexander Belopolsky40018472011-02-26 01:02:56 +000010204PyObject *
10205PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010206{
Guido van Rossumd57fd912000-03-10 22:53:23 +000010207 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010208
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010209 if (ensure_unicode(string) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010210 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010211
Benjamin Petersonead6b532011-12-20 17:23:42 -060010212 switch (PyUnicode_KIND(string)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010213 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010214 if (PyUnicode_IS_ASCII(string))
10215 list = asciilib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010216 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010217 PyUnicode_GET_LENGTH(string), keepends);
10218 else
10219 list = ucs1lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010220 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010221 PyUnicode_GET_LENGTH(string), keepends);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010222 break;
10223 case PyUnicode_2BYTE_KIND:
10224 list = ucs2lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010225 string, PyUnicode_2BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010226 PyUnicode_GET_LENGTH(string), keepends);
10227 break;
10228 case PyUnicode_4BYTE_KIND:
10229 list = ucs4lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010230 string, PyUnicode_4BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010231 PyUnicode_GET_LENGTH(string), keepends);
10232 break;
10233 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010234 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010235 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010236 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010237}
10238
Alexander Belopolsky40018472011-02-26 01:02:56 +000010239static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010240split(PyObject *self,
10241 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010242 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010243{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010244 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010245 void *buf1, *buf2;
10246 Py_ssize_t len1, len2;
10247 PyObject* out;
10248
Guido van Rossumd57fd912000-03-10 22:53:23 +000010249 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010250 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010251
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010252 if (PyUnicode_READY(self) == -1)
10253 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010254
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010255 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010256 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010257 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010258 if (PyUnicode_IS_ASCII(self))
10259 return asciilib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010260 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010261 PyUnicode_GET_LENGTH(self), maxcount
10262 );
10263 else
10264 return ucs1lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010265 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010266 PyUnicode_GET_LENGTH(self), maxcount
10267 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010268 case PyUnicode_2BYTE_KIND:
10269 return ucs2lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010270 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010271 PyUnicode_GET_LENGTH(self), maxcount
10272 );
10273 case PyUnicode_4BYTE_KIND:
10274 return ucs4lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010275 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010276 PyUnicode_GET_LENGTH(self), maxcount
10277 );
10278 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010279 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010280 }
10281
10282 if (PyUnicode_READY(substring) == -1)
10283 return NULL;
10284
10285 kind1 = PyUnicode_KIND(self);
10286 kind2 = PyUnicode_KIND(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010287 len1 = PyUnicode_GET_LENGTH(self);
10288 len2 = PyUnicode_GET_LENGTH(substring);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010289 if (kind1 < kind2 || len1 < len2) {
10290 out = PyList_New(1);
10291 if (out == NULL)
10292 return NULL;
10293 Py_INCREF(self);
10294 PyList_SET_ITEM(out, 0, self);
10295 return out;
10296 }
10297 buf1 = PyUnicode_DATA(self);
10298 buf2 = PyUnicode_DATA(substring);
10299 if (kind2 != kind1) {
10300 buf2 = _PyUnicode_AsKind(substring, kind1);
10301 if (!buf2)
10302 return NULL;
10303 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010304
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010305 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010306 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010307 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10308 out = asciilib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010309 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010310 else
10311 out = ucs1lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010312 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010313 break;
10314 case PyUnicode_2BYTE_KIND:
10315 out = ucs2lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010316 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010317 break;
10318 case PyUnicode_4BYTE_KIND:
10319 out = ucs4lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010320 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010321 break;
10322 default:
10323 out = NULL;
10324 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010325 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010326 PyMem_Free(buf2);
10327 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010328}
10329
Alexander Belopolsky40018472011-02-26 01:02:56 +000010330static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010331rsplit(PyObject *self,
10332 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010333 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010334{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010335 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010336 void *buf1, *buf2;
10337 Py_ssize_t len1, len2;
10338 PyObject* out;
10339
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010340 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010341 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010342
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010343 if (PyUnicode_READY(self) == -1)
10344 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010345
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010346 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010347 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010348 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010349 if (PyUnicode_IS_ASCII(self))
10350 return asciilib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010351 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010352 PyUnicode_GET_LENGTH(self), maxcount
10353 );
10354 else
10355 return ucs1lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010356 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010357 PyUnicode_GET_LENGTH(self), maxcount
10358 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010359 case PyUnicode_2BYTE_KIND:
10360 return ucs2lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010361 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010362 PyUnicode_GET_LENGTH(self), maxcount
10363 );
10364 case PyUnicode_4BYTE_KIND:
10365 return ucs4lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010366 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010367 PyUnicode_GET_LENGTH(self), maxcount
10368 );
10369 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010370 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010371 }
10372
10373 if (PyUnicode_READY(substring) == -1)
10374 return NULL;
10375
10376 kind1 = PyUnicode_KIND(self);
10377 kind2 = PyUnicode_KIND(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010378 len1 = PyUnicode_GET_LENGTH(self);
10379 len2 = PyUnicode_GET_LENGTH(substring);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010380 if (kind1 < kind2 || len1 < len2) {
10381 out = PyList_New(1);
10382 if (out == NULL)
10383 return NULL;
10384 Py_INCREF(self);
10385 PyList_SET_ITEM(out, 0, self);
10386 return out;
10387 }
10388 buf1 = PyUnicode_DATA(self);
10389 buf2 = PyUnicode_DATA(substring);
10390 if (kind2 != kind1) {
10391 buf2 = _PyUnicode_AsKind(substring, kind1);
10392 if (!buf2)
10393 return NULL;
10394 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010395
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010396 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010397 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010398 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10399 out = asciilib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010400 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010401 else
10402 out = ucs1lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010403 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010404 break;
10405 case PyUnicode_2BYTE_KIND:
10406 out = ucs2lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010407 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010408 break;
10409 case PyUnicode_4BYTE_KIND:
10410 out = ucs4lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010411 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010412 break;
10413 default:
10414 out = NULL;
10415 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010416 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010417 PyMem_Free(buf2);
10418 return out;
10419}
10420
10421static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010422anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1,
10423 PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010424{
Benjamin Petersonead6b532011-12-20 17:23:42 -060010425 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010426 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010427 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
10428 return asciilib_find(buf1, len1, buf2, len2, offset);
10429 else
10430 return ucs1lib_find(buf1, len1, buf2, len2, offset);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010431 case PyUnicode_2BYTE_KIND:
10432 return ucs2lib_find(buf1, len1, buf2, len2, offset);
10433 case PyUnicode_4BYTE_KIND:
10434 return ucs4lib_find(buf1, len1, buf2, len2, offset);
10435 }
Barry Warsawb2e57942017-09-14 18:13:16 -070010436 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010437}
10438
10439static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010440anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen,
10441 PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010442{
Benjamin Petersonc0b95d12011-12-20 17:24:05 -060010443 switch (kind) {
10444 case PyUnicode_1BYTE_KIND:
10445 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
10446 return asciilib_count(sbuf, slen, buf1, len1, maxcount);
10447 else
10448 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
10449 case PyUnicode_2BYTE_KIND:
10450 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
10451 case PyUnicode_4BYTE_KIND:
10452 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
10453 }
Barry Warsawb2e57942017-09-14 18:13:16 -070010454 Py_UNREACHABLE();
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010455}
10456
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010457static void
10458replace_1char_inplace(PyObject *u, Py_ssize_t pos,
10459 Py_UCS4 u1, Py_UCS4 u2, Py_ssize_t maxcount)
10460{
10461 int kind = PyUnicode_KIND(u);
10462 void *data = PyUnicode_DATA(u);
10463 Py_ssize_t len = PyUnicode_GET_LENGTH(u);
10464 if (kind == PyUnicode_1BYTE_KIND) {
10465 ucs1lib_replace_1char_inplace((Py_UCS1 *)data + pos,
10466 (Py_UCS1 *)data + len,
10467 u1, u2, maxcount);
10468 }
10469 else if (kind == PyUnicode_2BYTE_KIND) {
10470 ucs2lib_replace_1char_inplace((Py_UCS2 *)data + pos,
10471 (Py_UCS2 *)data + len,
10472 u1, u2, maxcount);
10473 }
10474 else {
10475 assert(kind == PyUnicode_4BYTE_KIND);
10476 ucs4lib_replace_1char_inplace((Py_UCS4 *)data + pos,
10477 (Py_UCS4 *)data + len,
10478 u1, u2, maxcount);
10479 }
10480}
10481
/* Build the result of replacing occurrences of str1 in self by str2.

   maxcount limits the number of replacements; a negative value is turned
   into PY_SSIZE_T_MAX, i.e. effectively unlimited.

   Returns a new reference to the resulting string.  When there is nothing
   to replace, the result comes from unicode_result_unchanged(self).
   Returns NULL on failure (allocation failure in a helper, or
   OverflowError when the result would be too long).

   NOTE(review): the kind/data/length of all three strings are read without
   any PyUnicode_READY() call here; presumably the caller has readied
   them -- confirm.

   Temporary widened copies of the buffers are tracked by the
   srelease/release1/release2 flags and freed at every exit label. */
static PyObject *
replace(PyObject *self, PyObject *str1,
        PyObject *str2, Py_ssize_t maxcount)
{
    PyObject *u;
    char *sbuf = PyUnicode_DATA(self);
    char *buf1 = PyUnicode_DATA(str1);
    char *buf2 = PyUnicode_DATA(str2);
    /* Set to 1 once the matching buffer points to memory obtained from
       _PyUnicode_AsKind() that this function must free. */
    int srelease = 0, release1 = 0, release2 = 0;
    int skind = PyUnicode_KIND(self);
    int kind1 = PyUnicode_KIND(str1);
    int kind2 = PyUnicode_KIND(str2);
    Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
    Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
    Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
    int mayshrink;
    Py_UCS4 maxchar, maxchar_str1, maxchar_str2;

    /* The empty-self shortcut is only taken when maxcount is non-negative;
       with a negative maxcount an empty self continues below. */
    if (maxcount < 0)
        maxcount = PY_SSIZE_T_MAX;
    else if (maxcount == 0 || slen == 0)
        goto nothing;

    /* Replacing an object by itself cannot change anything. */
    if (str1 == str2)
        goto nothing;

    maxchar = PyUnicode_MAX_CHAR_VALUE(self);
    maxchar_str1 = PyUnicode_MAX_CHAR_VALUE(str1);
    if (maxchar < maxchar_str1)
        /* substring too wide to be present */
        goto nothing;
    maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
    /* Replacing str1 with str2 may cause a maxchar reduction in the
       result string. */
    mayshrink = (maxchar_str2 < maxchar_str1) && (maxchar == maxchar_str1);
    /* From here on maxchar is the value used to allocate the result. */
    maxchar = Py_MAX(maxchar, maxchar_str2);

    if (len1 == len2) {
        /* same length */
        if (len1 == 0)
            goto nothing;
        if (len1 == 1) {
            /* replace characters */
            Py_UCS4 u1, u2;
            Py_ssize_t pos;

            u1 = PyUnicode_READ(kind1, buf1, 0);
            /* NOTE(review): the final argument 1 presumably selects a
               forward search -- confirm against findchar(). */
            pos = findchar(sbuf, skind, slen, u1, 1);
            if (pos < 0)
                goto nothing;
            u2 = PyUnicode_READ(kind2, buf2, 0);
            u = PyUnicode_New(slen, maxchar);
            if (!u)
                goto error;

            /* Copy self into the result, then patch characters in place
               starting at the first match. */
            _PyUnicode_FastCopyCharacters(u, 0, self, 0, slen);
            replace_1char_inplace(u, pos, u1, u2, maxcount);
        }
        else {
            /* rkind is the character width used for all working buffers
               and for the result. */
            int rkind = skind;
            char *res;
            Py_ssize_t i;

            if (kind1 < rkind) {
                /* widen substring */
                buf1 = _PyUnicode_AsKind(str1, rkind);
                if (!buf1) goto error;
                release1 = 1;
            }
            i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
            if (i < 0)
                goto nothing;
            if (rkind > kind2) {
                /* widen replacement */
                buf2 = _PyUnicode_AsKind(str2, rkind);
                if (!buf2) goto error;
                release2 = 1;
            }
            else if (rkind < kind2) {
                /* widen self and buf1 */
                rkind = kind2;
                /* Drop any copy of str1 made at the old width before
                   making a new one at the new width. */
                if (release1) PyMem_Free(buf1);
                release1 = 0;
                sbuf = _PyUnicode_AsKind(self, rkind);
                if (!sbuf) goto error;
                srelease = 1;
                buf1 = _PyUnicode_AsKind(str1, rkind);
                if (!buf1) goto error;
                release1 = 1;
            }
            u = PyUnicode_New(slen, maxchar);
            if (!u)
                goto error;
            assert(PyUnicode_KIND(u) == rkind);
            res = PyUnicode_DATA(u);

            /* Start from a full copy of self; matches are overwritten. */
            memcpy(res, sbuf, rkind * slen);
            /* change everything in-place, starting with this one */
            memcpy(res + rkind * i,
                   buf2,
                   rkind * len2);
            i += len1;

            /* One replacement has already been made above, hence the
               pre-decrement. */
            while ( --maxcount > 0) {
                /* Search the tail starting at i; i is also passed as the
                   offset, and the result is used as an index into res. */
                i = anylib_find(rkind, self,
                                sbuf+rkind*i, slen-i,
                                str1, buf1, len1, i);
                if (i == -1)
                    break;
                memcpy(res + rkind * i,
                       buf2,
                       rkind * len2);
                i += len1;
            }
        }
    }
    else {
        /* different lengths: the result size depends on the match count */
        Py_ssize_t n, i, j, ires;
        Py_ssize_t new_size;
        int rkind = skind;
        char *res;

        if (kind1 < rkind) {
            /* widen substring */
            buf1 = _PyUnicode_AsKind(str1, rkind);
            if (!buf1) goto error;
            release1 = 1;
        }
        /* n is the number of replacements that will be performed. */
        n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
        if (n == 0)
            goto nothing;
        if (kind2 < rkind) {
            /* widen replacement */
            buf2 = _PyUnicode_AsKind(str2, rkind);
            if (!buf2) goto error;
            release2 = 1;
        }
        else if (kind2 > rkind) {
            /* widen self and buf1 */
            rkind = kind2;
            sbuf = _PyUnicode_AsKind(self, rkind);
            if (!sbuf) goto error;
            srelease = 1;
            /* Drop any copy of str1 made at the old width before making a
               new one at the new width. */
            if (release1) PyMem_Free(buf1);
            release1 = 0;
            buf1 = _PyUnicode_AsKind(str1, rkind);
            if (!buf1) goto error;
            release1 = 1;
        }
        /* new_size = slen + n * (len2 - len1); check first that growing
           the string cannot overflow Py_ssize_t. */
        if (len1 < len2 && len2 - len1 > (PY_SSIZE_T_MAX - slen) / n) {
                PyErr_SetString(PyExc_OverflowError,
                                "replace string is too long");
                goto error;
        }
        new_size = slen + n * (len2 - len1);
        if (new_size == 0) {
            /* Everything was replaced by nothing: hand out the shared
               empty string. */
            _Py_INCREF_UNICODE_EMPTY();
            if (!unicode_empty)
                goto error;
            u = unicode_empty;
            goto done;
        }
        /* Byte offsets below are computed as rkind * index, so that
           product must not overflow either. */
        if (new_size > (PY_SSIZE_T_MAX / rkind)) {
            PyErr_SetString(PyExc_OverflowError,
                            "replace string is too long");
            goto error;
        }
        u = PyUnicode_New(new_size, maxchar);
        if (!u)
            goto error;
        assert(PyUnicode_KIND(u) == rkind);
        res = PyUnicode_DATA(u);
        /* i indexes the source (sbuf), ires indexes the result (res). */
        ires = i = 0;
        if (len1 > 0) {
            while (n-- > 0) {
                /* look for next match */
                j = anylib_find(rkind, self,
                                sbuf + rkind * i, slen-i,
                                str1, buf1, len1, i);
                if (j == -1)
                    break;
                else if (j > i) {
                    /* copy unchanged part [i:j] */
                    memcpy(res + rkind * ires,
                           sbuf + rkind * i,
                           rkind * (j-i));
                    ires += j - i;
                }
                /* copy substitution string */
                if (len2 > 0) {
                    memcpy(res + rkind * ires,
                           buf2,
                           rkind * len2);
                    ires += len2;
                }
                i = j + len1;
            }
            if (i < slen)
                /* copy tail [i:] */
                memcpy(res + rkind * ires,
                       sbuf + rkind * i,
                       rkind * (slen-i));
        }
        else {
            /* interleave */
            /* str1 is empty: emit str2 n times, with one character of
               self between consecutive copies, then the rest of self. */
            while (n > 0) {
                memcpy(res + rkind * ires,
                       buf2,
                       rkind * len2);
                ires += len2;
                if (--n <= 0)
                    break;
                memcpy(res + rkind * ires,
                       sbuf + rkind * i,
                       rkind);
                ires++;
                i++;
            }
            memcpy(res + rkind * ires,
                   sbuf + rkind * i,
                   rkind * (slen-i));
        }
    }

    if (mayshrink) {
        /* The widest characters may have been replaced away; let the
           helper re-evaluate the result.  It can leave u == NULL. */
        unicode_adjust_maxchar(&u);
        if (u == NULL)
            goto error;
    }

  done:
    /* success: release temporary buffers and return the new string */
    if (srelease)
        PyMem_FREE(sbuf);
    if (release1)
        PyMem_FREE(buf1);
    if (release2)
        PyMem_FREE(buf2);
    assert(_PyUnicode_CheckConsistency(u, 1));
    return u;

  nothing:
    /* nothing to replace; return original string (when possible) */
    if (srelease)
        PyMem_FREE(sbuf);
    if (release1)
        PyMem_FREE(buf1);
    if (release2)
        PyMem_FREE(buf2);
    return unicode_result_unchanged(self);

  error:
    /* failure: release whatever temporary buffers exist */
    if (srelease && sbuf)
        PyMem_FREE(sbuf);
    if (release1 && buf1)
        PyMem_FREE(buf1);
    if (release2 && buf2)
        PyMem_FREE(buf2);
    return NULL;
}
10743
10744/* --- Unicode Object Methods --------------------------------------------- */
10745
INADA Naoki3ae20562017-01-16 20:41:20 +090010746/*[clinic input]
10747str.title as unicode_title
Guido van Rossumd57fd912000-03-10 22:53:23 +000010748
INADA Naoki3ae20562017-01-16 20:41:20 +090010749Return a version of the string where each word is titlecased.
10750
10751More specifically, words start with uppercased characters and all remaining
10752cased characters have lower case.
10753[clinic start generated code]*/
10754
10755static PyObject *
10756unicode_title_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090010757/*[clinic end generated code: output=c75ae03809574902 input=fa945d669b26e683]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000010758{
Benjamin Petersoneea48462012-01-16 14:28:50 -050010759 if (PyUnicode_READY(self) == -1)
10760 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010761 return case_operation(self, do_title);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010762}
10763
INADA Naoki3ae20562017-01-16 20:41:20 +090010764/*[clinic input]
10765str.capitalize as unicode_capitalize
Guido van Rossumd57fd912000-03-10 22:53:23 +000010766
INADA Naoki3ae20562017-01-16 20:41:20 +090010767Return a capitalized version of the string.
10768
10769More specifically, make the first character have upper case and the rest lower
10770case.
10771[clinic start generated code]*/
10772
10773static PyObject *
10774unicode_capitalize_impl(PyObject *self)
10775/*[clinic end generated code: output=e49a4c333cdb7667 input=f4cbf1016938da6d]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000010776{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010777 if (PyUnicode_READY(self) == -1)
10778 return NULL;
10779 if (PyUnicode_GET_LENGTH(self) == 0)
10780 return unicode_result_unchanged(self);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010781 return case_operation(self, do_capitalize);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010782}
10783
INADA Naoki3ae20562017-01-16 20:41:20 +090010784/*[clinic input]
10785str.casefold as unicode_casefold
10786
10787Return a version of the string suitable for caseless comparisons.
10788[clinic start generated code]*/
Benjamin Petersond5890c82012-01-14 13:23:30 -050010789
10790static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090010791unicode_casefold_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090010792/*[clinic end generated code: output=0120daf657ca40af input=384d66cc2ae30daf]*/
Benjamin Petersond5890c82012-01-14 13:23:30 -050010793{
10794 if (PyUnicode_READY(self) == -1)
10795 return NULL;
10796 if (PyUnicode_IS_ASCII(self))
10797 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010798 return case_operation(self, do_casefold);
Benjamin Petersond5890c82012-01-14 13:23:30 -050010799}
10800
10801
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010802/* Argument converter. Accepts a single Unicode character. */
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010803
10804static int
10805convert_uc(PyObject *obj, void *addr)
10806{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010807 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010808
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010809 if (!PyUnicode_Check(obj)) {
10810 PyErr_Format(PyExc_TypeError,
10811 "The fill character must be a unicode character, "
Victor Stinner998b8062018-09-12 00:23:25 +020010812 "not %.100s", Py_TYPE(obj)->tp_name);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010813 return 0;
10814 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010815 if (PyUnicode_READY(obj) < 0)
10816 return 0;
10817 if (PyUnicode_GET_LENGTH(obj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010818 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010819 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010820 return 0;
10821 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010822 *fillcharloc = PyUnicode_READ_CHAR(obj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010823 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010824}
10825
INADA Naoki3ae20562017-01-16 20:41:20 +090010826/*[clinic input]
10827str.center as unicode_center
10828
10829 width: Py_ssize_t
10830 fillchar: Py_UCS4 = ' '
10831 /
10832
10833Return a centered string of length width.
10834
10835Padding is done using the specified fill character (default is a space).
10836[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000010837
10838static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090010839unicode_center_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
10840/*[clinic end generated code: output=420c8859effc7c0c input=b42b247eb26e6519]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000010841{
Martin v. Löwis18e16552006-02-15 17:27:45 +000010842 Py_ssize_t marg, left;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010843
Benjamin Petersonbac79492012-01-14 13:34:47 -050010844 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010845 return NULL;
10846
Victor Stinnerc4b49542011-12-11 22:44:26 +010010847 if (PyUnicode_GET_LENGTH(self) >= width)
10848 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010849
Victor Stinnerc4b49542011-12-11 22:44:26 +010010850 marg = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010851 left = marg / 2 + (marg & width & 1);
10852
Victor Stinner9310abb2011-10-05 00:59:23 +020010853 return pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010854}
10855
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010856/* This function assumes that str1 and str2 are readied by the caller. */
10857
Marc-André Lemburge5034372000-08-08 08:04:29 +000010858static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010859unicode_compare(PyObject *str1, PyObject *str2)
Marc-André Lemburge5034372000-08-08 08:04:29 +000010860{
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010861#define COMPARE(TYPE1, TYPE2) \
10862 do { \
10863 TYPE1* p1 = (TYPE1 *)data1; \
10864 TYPE2* p2 = (TYPE2 *)data2; \
10865 TYPE1* end = p1 + len; \
10866 Py_UCS4 c1, c2; \
10867 for (; p1 != end; p1++, p2++) { \
10868 c1 = *p1; \
10869 c2 = *p2; \
10870 if (c1 != c2) \
10871 return (c1 < c2) ? -1 : 1; \
10872 } \
10873 } \
10874 while (0)
10875
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010876 int kind1, kind2;
10877 void *data1, *data2;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010878 Py_ssize_t len1, len2, len;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010879
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010880 kind1 = PyUnicode_KIND(str1);
10881 kind2 = PyUnicode_KIND(str2);
10882 data1 = PyUnicode_DATA(str1);
10883 data2 = PyUnicode_DATA(str2);
10884 len1 = PyUnicode_GET_LENGTH(str1);
10885 len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner770e19e2012-10-04 22:59:45 +020010886 len = Py_MIN(len1, len2);
Marc-André Lemburge5034372000-08-08 08:04:29 +000010887
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010888 switch(kind1) {
10889 case PyUnicode_1BYTE_KIND:
10890 {
10891 switch(kind2) {
10892 case PyUnicode_1BYTE_KIND:
10893 {
10894 int cmp = memcmp(data1, data2, len);
10895 /* normalize result of memcmp() into the range [-1; 1] */
10896 if (cmp < 0)
10897 return -1;
10898 if (cmp > 0)
10899 return 1;
10900 break;
Victor Stinner770e19e2012-10-04 22:59:45 +020010901 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010902 case PyUnicode_2BYTE_KIND:
10903 COMPARE(Py_UCS1, Py_UCS2);
10904 break;
10905 case PyUnicode_4BYTE_KIND:
10906 COMPARE(Py_UCS1, Py_UCS4);
10907 break;
10908 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010909 Py_UNREACHABLE();
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010910 }
10911 break;
10912 }
10913 case PyUnicode_2BYTE_KIND:
10914 {
10915 switch(kind2) {
10916 case PyUnicode_1BYTE_KIND:
10917 COMPARE(Py_UCS2, Py_UCS1);
10918 break;
10919 case PyUnicode_2BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020010920 {
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010921 COMPARE(Py_UCS2, Py_UCS2);
10922 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020010923 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010924 case PyUnicode_4BYTE_KIND:
10925 COMPARE(Py_UCS2, Py_UCS4);
10926 break;
10927 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010928 Py_UNREACHABLE();
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010929 }
10930 break;
10931 }
10932 case PyUnicode_4BYTE_KIND:
10933 {
10934 switch(kind2) {
10935 case PyUnicode_1BYTE_KIND:
10936 COMPARE(Py_UCS4, Py_UCS1);
10937 break;
10938 case PyUnicode_2BYTE_KIND:
10939 COMPARE(Py_UCS4, Py_UCS2);
10940 break;
10941 case PyUnicode_4BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020010942 {
10943#if defined(HAVE_WMEMCMP) && SIZEOF_WCHAR_T == 4
10944 int cmp = wmemcmp((wchar_t *)data1, (wchar_t *)data2, len);
10945 /* normalize result of wmemcmp() into the range [-1; 1] */
10946 if (cmp < 0)
10947 return -1;
10948 if (cmp > 0)
10949 return 1;
10950#else
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010951 COMPARE(Py_UCS4, Py_UCS4);
Victor Stinnercd777ea2013-04-08 22:43:44 +020010952#endif
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010953 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020010954 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010955 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010956 Py_UNREACHABLE();
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010957 }
10958 break;
10959 }
10960 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010961 Py_UNREACHABLE();
Marc-André Lemburge5034372000-08-08 08:04:29 +000010962 }
10963
Victor Stinner770e19e2012-10-04 22:59:45 +020010964 if (len1 == len2)
10965 return 0;
10966 if (len1 < len2)
10967 return -1;
10968 else
10969 return 1;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010970
10971#undef COMPARE
Marc-André Lemburge5034372000-08-08 08:04:29 +000010972}
10973
Benjamin Peterson621b4302016-09-09 13:54:34 -070010974static int
Victor Stinnere5567ad2012-10-23 02:48:49 +020010975unicode_compare_eq(PyObject *str1, PyObject *str2)
10976{
10977 int kind;
10978 void *data1, *data2;
10979 Py_ssize_t len;
10980 int cmp;
10981
Victor Stinnere5567ad2012-10-23 02:48:49 +020010982 len = PyUnicode_GET_LENGTH(str1);
10983 if (PyUnicode_GET_LENGTH(str2) != len)
10984 return 0;
10985 kind = PyUnicode_KIND(str1);
10986 if (PyUnicode_KIND(str2) != kind)
10987 return 0;
10988 data1 = PyUnicode_DATA(str1);
10989 data2 = PyUnicode_DATA(str2);
10990
10991 cmp = memcmp(data1, data2, len * kind);
10992 return (cmp == 0);
10993}
10994
10995
Alexander Belopolsky40018472011-02-26 01:02:56 +000010996int
10997PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010998{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010999 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
11000 if (PyUnicode_READY(left) == -1 ||
11001 PyUnicode_READY(right) == -1)
11002 return -1;
Victor Stinnerf0c7b2a2013-11-04 11:27:14 +010011003
11004 /* a string is equal to itself */
11005 if (left == right)
11006 return 0;
11007
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011008 return unicode_compare(left, right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011009 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +000011010 PyErr_Format(PyExc_TypeError,
11011 "Can't compare %.100s and %.100s",
11012 left->ob_type->tp_name,
11013 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011014 return -1;
11015}
11016
Martin v. Löwis5b222132007-06-10 09:51:05 +000011017int
11018PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
11019{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011020 Py_ssize_t i;
11021 int kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011022 Py_UCS4 chr;
Serhiy Storchaka419967b2016-12-06 00:13:34 +020011023 const unsigned char *ustr = (const unsigned char *)str;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011024
Victor Stinner910337b2011-10-03 03:20:16 +020011025 assert(_PyUnicode_CHECK(uni));
Serhiy Storchaka419967b2016-12-06 00:13:34 +020011026 if (!PyUnicode_IS_READY(uni)) {
11027 const wchar_t *ws = _PyUnicode_WSTR(uni);
11028 /* Compare Unicode string and source character set string */
11029 for (i = 0; (chr = ws[i]) && ustr[i]; i++) {
11030 if (chr != ustr[i])
11031 return (chr < ustr[i]) ? -1 : 1;
11032 }
11033 /* This check keeps Python strings that end in '\0' from comparing equal
11034 to C strings identical up to that point. */
11035 if (_PyUnicode_WSTR_LENGTH(uni) != i || chr)
11036 return 1; /* uni is longer */
11037 if (ustr[i])
11038 return -1; /* str is longer */
11039 return 0;
11040 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011041 kind = PyUnicode_KIND(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010011042 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnera6b9b072013-10-30 18:27:13 +010011043 const void *data = PyUnicode_1BYTE_DATA(uni);
Victor Stinnere1b15922013-11-03 13:53:12 +010011044 size_t len1 = (size_t)PyUnicode_GET_LENGTH(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010011045 size_t len, len2 = strlen(str);
11046 int cmp;
11047
11048 len = Py_MIN(len1, len2);
11049 cmp = memcmp(data, str, len);
Victor Stinner21ea21e2013-11-04 11:28:26 +010011050 if (cmp != 0) {
11051 if (cmp < 0)
11052 return -1;
11053 else
11054 return 1;
11055 }
Victor Stinner602f7cf2013-10-29 23:31:50 +010011056 if (len1 > len2)
11057 return 1; /* uni is longer */
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011058 if (len1 < len2)
Victor Stinner602f7cf2013-10-29 23:31:50 +010011059 return -1; /* str is longer */
11060 return 0;
11061 }
11062 else {
11063 void *data = PyUnicode_DATA(uni);
11064 /* Compare Unicode string and source character set string */
11065 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
Victor Stinner12174a52014-08-15 23:17:38 +020011066 if (chr != (unsigned char)str[i])
Victor Stinner602f7cf2013-10-29 23:31:50 +010011067 return (chr < (unsigned char)(str[i])) ? -1 : 1;
11068 /* This check keeps Python strings that end in '\0' from comparing equal
11069 to C strings identical up to that point. */
11070 if (PyUnicode_GET_LENGTH(uni) != i || chr)
11071 return 1; /* uni is longer */
11072 if (str[i])
11073 return -1; /* str is longer */
11074 return 0;
11075 }
Martin v. Löwis5b222132007-06-10 09:51:05 +000011076}
11077
Serhiy Storchakaf4934ea2016-11-16 10:17:58 +020011078static int
11079non_ready_unicode_equal_to_ascii_string(PyObject *unicode, const char *str)
11080{
11081 size_t i, len;
11082 const wchar_t *p;
11083 len = (size_t)_PyUnicode_WSTR_LENGTH(unicode);
11084 if (strlen(str) != len)
11085 return 0;
11086 p = _PyUnicode_WSTR(unicode);
11087 assert(p);
11088 for (i = 0; i < len; i++) {
11089 unsigned char c = (unsigned char)str[i];
Serhiy Storchaka292dd1b2016-11-16 16:12:34 +020011090 if (c >= 128 || p[i] != (wchar_t)c)
Serhiy Storchakaf4934ea2016-11-16 10:17:58 +020011091 return 0;
11092 }
11093 return 1;
11094}
11095
11096int
11097_PyUnicode_EqualToASCIIString(PyObject *unicode, const char *str)
11098{
11099 size_t len;
11100 assert(_PyUnicode_CHECK(unicode));
Serhiy Storchakaa83a6a32016-11-16 20:02:44 +020011101 assert(str);
11102#ifndef NDEBUG
11103 for (const char *p = str; *p; p++) {
11104 assert((unsigned char)*p < 128);
11105 }
11106#endif
Serhiy Storchakaf4934ea2016-11-16 10:17:58 +020011107 if (PyUnicode_READY(unicode) == -1) {
11108 /* Memory error or bad data */
11109 PyErr_Clear();
11110 return non_ready_unicode_equal_to_ascii_string(unicode, str);
11111 }
11112 if (!PyUnicode_IS_ASCII(unicode))
11113 return 0;
11114 len = (size_t)PyUnicode_GET_LENGTH(unicode);
11115 return strlen(str) == len &&
11116 memcmp(PyUnicode_1BYTE_DATA(unicode), str, len) == 0;
11117}
11118
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011119int
11120_PyUnicode_EqualToASCIIId(PyObject *left, _Py_Identifier *right)
11121{
11122 PyObject *right_uni;
11123 Py_hash_t hash;
11124
11125 assert(_PyUnicode_CHECK(left));
11126 assert(right->string);
Serhiy Storchakaa83a6a32016-11-16 20:02:44 +020011127#ifndef NDEBUG
11128 for (const char *p = right->string; *p; p++) {
11129 assert((unsigned char)*p < 128);
11130 }
11131#endif
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011132
11133 if (PyUnicode_READY(left) == -1) {
11134 /* memory error or bad data */
11135 PyErr_Clear();
11136 return non_ready_unicode_equal_to_ascii_string(left, right->string);
11137 }
11138
11139 if (!PyUnicode_IS_ASCII(left))
11140 return 0;
11141
11142 right_uni = _PyUnicode_FromId(right); /* borrowed */
11143 if (right_uni == NULL) {
11144 /* memory error or bad data */
11145 PyErr_Clear();
11146 return _PyUnicode_EqualToASCIIString(left, right->string);
11147 }
11148
11149 if (left == right_uni)
11150 return 1;
11151
11152 if (PyUnicode_CHECK_INTERNED(left))
11153 return 0;
11154
INADA Naoki7cc95f52018-01-28 02:07:09 +090011155 assert(_PyUnicode_HASH(right_uni) != -1);
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011156 hash = _PyUnicode_HASH(left);
11157 if (hash != -1 && hash != _PyUnicode_HASH(right_uni))
11158 return 0;
11159
11160 return unicode_compare_eq(left, right_uni);
11161}
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000011162
Alexander Belopolsky40018472011-02-26 01:02:56 +000011163PyObject *
11164PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011165{
11166 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011167
Victor Stinnere5567ad2012-10-23 02:48:49 +020011168 if (!PyUnicode_Check(left) || !PyUnicode_Check(right))
11169 Py_RETURN_NOTIMPLEMENTED;
11170
11171 if (PyUnicode_READY(left) == -1 ||
11172 PyUnicode_READY(right) == -1)
11173 return NULL;
11174
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010011175 if (left == right) {
11176 switch (op) {
11177 case Py_EQ:
11178 case Py_LE:
11179 case Py_GE:
11180 /* a string is equal to itself */
stratakise8b19652017-11-02 11:32:54 +010011181 Py_RETURN_TRUE;
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010011182 case Py_NE:
11183 case Py_LT:
11184 case Py_GT:
stratakise8b19652017-11-02 11:32:54 +010011185 Py_RETURN_FALSE;
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010011186 default:
11187 PyErr_BadArgument();
11188 return NULL;
11189 }
11190 }
11191 else if (op == Py_EQ || op == Py_NE) {
Victor Stinnere5567ad2012-10-23 02:48:49 +020011192 result = unicode_compare_eq(left, right);
Victor Stinnerc8bc5372013-11-04 11:08:10 +010011193 result ^= (op == Py_NE);
stratakise8b19652017-11-02 11:32:54 +010011194 return PyBool_FromLong(result);
Victor Stinnere5567ad2012-10-23 02:48:49 +020011195 }
11196 else {
Victor Stinner90db9c42012-10-04 21:53:50 +020011197 result = unicode_compare(left, right);
stratakise8b19652017-11-02 11:32:54 +010011198 Py_RETURN_RICHCOMPARE(result, 0, op);
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011199 }
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011200}
11201
Alexander Belopolsky40018472011-02-26 01:02:56 +000011202int
Raymond Hettingerac2ef652015-07-04 16:04:44 -070011203_PyUnicode_EQ(PyObject *aa, PyObject *bb)
11204{
11205 return unicode_eq(aa, bb);
11206}
11207
11208int
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011209PyUnicode_Contains(PyObject *str, PyObject *substr)
Guido van Rossum403d68b2000-03-13 15:55:09 +000011210{
Victor Stinner77282cb2013-04-14 19:22:47 +020011211 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011212 void *buf1, *buf2;
11213 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011214 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011215
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011216 if (!PyUnicode_Check(substr)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011217 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020011218 "'in <string>' requires string as left operand, not %.100s",
11219 Py_TYPE(substr)->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011220 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011221 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011222 if (PyUnicode_READY(substr) == -1)
Thomas Wouters477c8d52006-05-27 19:21:47 +000011223 return -1;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011224 if (ensure_unicode(str) < 0)
11225 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011226
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011227 kind1 = PyUnicode_KIND(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011228 kind2 = PyUnicode_KIND(substr);
11229 if (kind1 < kind2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011230 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011231 len1 = PyUnicode_GET_LENGTH(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011232 len2 = PyUnicode_GET_LENGTH(substr);
11233 if (len1 < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011234 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011235 buf1 = PyUnicode_DATA(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011236 buf2 = PyUnicode_DATA(substr);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011237 if (len2 == 1) {
11238 Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
11239 result = findchar((const char *)buf1, kind1, len1, ch, 1) != -1;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011240 return result;
11241 }
11242 if (kind2 != kind1) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011243 buf2 = _PyUnicode_AsKind(substr, kind1);
11244 if (!buf2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011245 return -1;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011246 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011247
Victor Stinner77282cb2013-04-14 19:22:47 +020011248 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011249 case PyUnicode_1BYTE_KIND:
11250 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
11251 break;
11252 case PyUnicode_2BYTE_KIND:
11253 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
11254 break;
11255 case PyUnicode_4BYTE_KIND:
11256 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
11257 break;
11258 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011259 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011260 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011261
Victor Stinner77282cb2013-04-14 19:22:47 +020011262 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011263 PyMem_Free(buf2);
11264
Guido van Rossum403d68b2000-03-13 15:55:09 +000011265 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011266}
11267
Guido van Rossumd57fd912000-03-10 22:53:23 +000011268/* Concat to string or Unicode object giving a new Unicode object. */
11269
Alexander Belopolsky40018472011-02-26 01:02:56 +000011270PyObject *
11271PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011272{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011273 PyObject *result;
Victor Stinner127226b2011-10-13 01:12:34 +020011274 Py_UCS4 maxchar, maxchar2;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011275 Py_ssize_t left_len, right_len, new_len;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011276
Serhiy Storchaka004e03f2017-03-19 19:38:42 +020011277 if (ensure_unicode(left) < 0)
11278 return NULL;
11279
11280 if (!PyUnicode_Check(right)) {
11281 PyErr_Format(PyExc_TypeError,
11282 "can only concatenate str (not \"%.200s\") to str",
11283 right->ob_type->tp_name);
11284 return NULL;
11285 }
11286 if (PyUnicode_READY(right) < 0)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011287 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011288
11289 /* Shortcuts */
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011290 if (left == unicode_empty)
11291 return PyUnicode_FromObject(right);
11292 if (right == unicode_empty)
11293 return PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011294
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011295 left_len = PyUnicode_GET_LENGTH(left);
11296 right_len = PyUnicode_GET_LENGTH(right);
11297 if (left_len > PY_SSIZE_T_MAX - right_len) {
Victor Stinner488fa492011-12-12 00:01:39 +010011298 PyErr_SetString(PyExc_OverflowError,
11299 "strings are too large to concat");
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011300 return NULL;
Victor Stinner488fa492011-12-12 00:01:39 +010011301 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011302 new_len = left_len + right_len;
Victor Stinner488fa492011-12-12 00:01:39 +010011303
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011304 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11305 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Benjamin Peterson7e303732013-06-10 09:19:46 -070011306 maxchar = Py_MAX(maxchar, maxchar2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011307
Guido van Rossumd57fd912000-03-10 22:53:23 +000011308 /* Concat the two Unicode strings */
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011309 result = PyUnicode_New(new_len, maxchar);
11310 if (result == NULL)
11311 return NULL;
11312 _PyUnicode_FastCopyCharacters(result, 0, left, 0, left_len);
11313 _PyUnicode_FastCopyCharacters(result, left_len, right, 0, right_len);
11314 assert(_PyUnicode_CheckConsistency(result, 1));
11315 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011316}
11317
Walter Dörwald1ab83302007-05-18 17:15:44 +000011318void
Victor Stinner23e56682011-10-03 03:54:37 +020011319PyUnicode_Append(PyObject **p_left, PyObject *right)
Walter Dörwald1ab83302007-05-18 17:15:44 +000011320{
Victor Stinner23e56682011-10-03 03:54:37 +020011321 PyObject *left, *res;
Victor Stinner488fa492011-12-12 00:01:39 +010011322 Py_UCS4 maxchar, maxchar2;
11323 Py_ssize_t left_len, right_len, new_len;
Victor Stinner23e56682011-10-03 03:54:37 +020011324
11325 if (p_left == NULL) {
11326 if (!PyErr_Occurred())
11327 PyErr_BadInternalCall();
Benjamin Peterson14339b62009-01-31 16:36:08 +000011328 return;
11329 }
Victor Stinner23e56682011-10-03 03:54:37 +020011330 left = *p_left;
Victor Stinnerf0335102013-04-14 19:13:03 +020011331 if (right == NULL || left == NULL
11332 || !PyUnicode_Check(left) || !PyUnicode_Check(right)) {
Victor Stinner23e56682011-10-03 03:54:37 +020011333 if (!PyErr_Occurred())
11334 PyErr_BadInternalCall();
11335 goto error;
11336 }
11337
Benjamin Petersonbac79492012-01-14 13:34:47 -050011338 if (PyUnicode_READY(left) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011339 goto error;
Benjamin Petersonbac79492012-01-14 13:34:47 -050011340 if (PyUnicode_READY(right) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011341 goto error;
11342
Victor Stinner488fa492011-12-12 00:01:39 +010011343 /* Shortcuts */
11344 if (left == unicode_empty) {
11345 Py_DECREF(left);
11346 Py_INCREF(right);
11347 *p_left = right;
11348 return;
11349 }
11350 if (right == unicode_empty)
11351 return;
11352
11353 left_len = PyUnicode_GET_LENGTH(left);
11354 right_len = PyUnicode_GET_LENGTH(right);
11355 if (left_len > PY_SSIZE_T_MAX - right_len) {
11356 PyErr_SetString(PyExc_OverflowError,
11357 "strings are too large to concat");
11358 goto error;
11359 }
11360 new_len = left_len + right_len;
11361
11362 if (unicode_modifiable(left)
11363 && PyUnicode_CheckExact(right)
11364 && PyUnicode_KIND(right) <= PyUnicode_KIND(left)
Victor Stinnerb0923652011-10-04 01:17:31 +020011365 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
11366 to change the structure size, but characters are stored just after
Georg Brandl7597add2011-10-05 16:36:47 +020011367 the structure, and so it requires to move all characters which is
Victor Stinnerb0923652011-10-04 01:17:31 +020011368 not so different than duplicating the string. */
Victor Stinner488fa492011-12-12 00:01:39 +010011369 && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
11370 {
11371 /* append inplace */
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011372 if (unicode_resize(p_left, new_len) != 0)
Victor Stinner488fa492011-12-12 00:01:39 +010011373 goto error;
Victor Stinnerf0335102013-04-14 19:13:03 +020011374
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011375 /* copy 'right' into the newly allocated area of 'left' */
11376 _PyUnicode_FastCopyCharacters(*p_left, left_len, right, 0, right_len);
Victor Stinner23e56682011-10-03 03:54:37 +020011377 }
Victor Stinner488fa492011-12-12 00:01:39 +010011378 else {
11379 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11380 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Benjamin Peterson7e303732013-06-10 09:19:46 -070011381 maxchar = Py_MAX(maxchar, maxchar2);
Victor Stinner23e56682011-10-03 03:54:37 +020011382
Victor Stinner488fa492011-12-12 00:01:39 +010011383 /* Concat the two Unicode strings */
11384 res = PyUnicode_New(new_len, maxchar);
11385 if (res == NULL)
11386 goto error;
Victor Stinnerd3f08822012-05-29 12:57:52 +020011387 _PyUnicode_FastCopyCharacters(res, 0, left, 0, left_len);
11388 _PyUnicode_FastCopyCharacters(res, left_len, right, 0, right_len);
Victor Stinner488fa492011-12-12 00:01:39 +010011389 Py_DECREF(left);
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011390 *p_left = res;
Victor Stinner488fa492011-12-12 00:01:39 +010011391 }
11392 assert(_PyUnicode_CheckConsistency(*p_left, 1));
Victor Stinner23e56682011-10-03 03:54:37 +020011393 return;
11394
11395error:
Victor Stinner488fa492011-12-12 00:01:39 +010011396 Py_CLEAR(*p_left);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011397}
11398
11399void
11400PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
11401{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011402 PyUnicode_Append(pleft, right);
11403 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011404}
11405
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011406/*
11407Wraps stringlib_parse_args_finds() and additionally ensures that the
11408first argument is a unicode object.
11409*/
11410
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070011411static inline int
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011412parse_args_finds_unicode(const char * function_name, PyObject *args,
11413 PyObject **substring,
11414 Py_ssize_t *start, Py_ssize_t *end)
11415{
11416 if(stringlib_parse_args_finds(function_name, args, substring,
11417 start, end)) {
11418 if (ensure_unicode(*substring) < 0)
11419 return 0;
11420 return 1;
11421 }
11422 return 0;
11423}
11424
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011425PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011426 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011427\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011428Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011429string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011430interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011431
11432static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011433unicode_count(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011434{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011435 PyObject *substring = NULL; /* initialize to fix a compiler warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +000011436 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011437 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011438 PyObject *result;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011439 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011440 void *buf1, *buf2;
11441 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011442
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011443 if (!parse_args_finds_unicode("count", args, &substring, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000011444 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000011445
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011446 kind1 = PyUnicode_KIND(self);
11447 kind2 = PyUnicode_KIND(substring);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011448 if (kind1 < kind2)
Benjamin Petersonb63f49f2012-05-03 18:31:07 -040011449 return PyLong_FromLong(0);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011450
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011451 len1 = PyUnicode_GET_LENGTH(self);
11452 len2 = PyUnicode_GET_LENGTH(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011453 ADJUST_INDICES(start, end, len1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011454 if (end - start < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011455 return PyLong_FromLong(0);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011456
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011457 buf1 = PyUnicode_DATA(self);
11458 buf2 = PyUnicode_DATA(substring);
11459 if (kind2 != kind1) {
11460 buf2 = _PyUnicode_AsKind(substring, kind1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011461 if (!buf2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011462 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011463 }
11464 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011465 case PyUnicode_1BYTE_KIND:
11466 iresult = ucs1lib_count(
11467 ((Py_UCS1*)buf1) + start, end - start,
11468 buf2, len2, PY_SSIZE_T_MAX
11469 );
11470 break;
11471 case PyUnicode_2BYTE_KIND:
11472 iresult = ucs2lib_count(
11473 ((Py_UCS2*)buf1) + start, end - start,
11474 buf2, len2, PY_SSIZE_T_MAX
11475 );
11476 break;
11477 case PyUnicode_4BYTE_KIND:
11478 iresult = ucs4lib_count(
11479 ((Py_UCS4*)buf1) + start, end - start,
11480 buf2, len2, PY_SSIZE_T_MAX
11481 );
11482 break;
11483 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011484 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011485 }
11486
11487 result = PyLong_FromSsize_t(iresult);
11488
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011489 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011490 PyMem_Free(buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011491
Guido van Rossumd57fd912000-03-10 22:53:23 +000011492 return result;
11493}
11494
INADA Naoki3ae20562017-01-16 20:41:20 +090011495/*[clinic input]
11496str.encode as unicode_encode
11497
11498 encoding: str(c_default="NULL") = 'utf-8'
11499 The encoding in which to encode the string.
11500 errors: str(c_default="NULL") = 'strict'
11501 The error handling scheme to use for encoding errors.
11502 The default is 'strict' meaning that encoding errors raise a
11503 UnicodeEncodeError. Other possible values are 'ignore', 'replace' and
11504 'xmlcharrefreplace' as well as any other name registered with
11505 codecs.register_error that can handle UnicodeEncodeErrors.
11506
11507Encode the string using the codec registered for encoding.
11508[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011509
11510static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090011511unicode_encode_impl(PyObject *self, const char *encoding, const char *errors)
INADA Naoki15f94592017-01-16 21:49:13 +090011512/*[clinic end generated code: output=bf78b6e2a9470e3c input=f0a9eb293d08fe02]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011513{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011514 return PyUnicode_AsEncodedString(self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +000011515}
11516
INADA Naoki3ae20562017-01-16 20:41:20 +090011517/*[clinic input]
11518str.expandtabs as unicode_expandtabs
Guido van Rossumd57fd912000-03-10 22:53:23 +000011519
INADA Naoki3ae20562017-01-16 20:41:20 +090011520 tabsize: int = 8
11521
11522Return a copy where all tab characters are expanded using spaces.
11523
11524If tabsize is not given, a tab size of 8 characters is assumed.
11525[clinic start generated code]*/
11526
11527static PyObject *
11528unicode_expandtabs_impl(PyObject *self, int tabsize)
11529/*[clinic end generated code: output=3457c5dcee26928f input=8a01914034af4c85]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011530{
Antoine Pitroue71d5742011-10-04 15:55:09 +020011531 Py_ssize_t i, j, line_pos, src_len, incr;
11532 Py_UCS4 ch;
11533 PyObject *u;
11534 void *src_data, *dest_data;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011535 int kind;
Antoine Pitroue19aa382011-10-04 16:04:01 +020011536 int found;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011537
Antoine Pitrou22425222011-10-04 19:10:51 +020011538 if (PyUnicode_READY(self) == -1)
11539 return NULL;
11540
Thomas Wouters7e474022000-07-16 12:04:32 +000011541 /* First pass: determine size of output string */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011542 src_len = PyUnicode_GET_LENGTH(self);
11543 i = j = line_pos = 0;
11544 kind = PyUnicode_KIND(self);
11545 src_data = PyUnicode_DATA(self);
Antoine Pitroue19aa382011-10-04 16:04:01 +020011546 found = 0;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011547 for (; i < src_len; i++) {
11548 ch = PyUnicode_READ(kind, src_data, i);
11549 if (ch == '\t') {
Antoine Pitroue19aa382011-10-04 16:04:01 +020011550 found = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000011551 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011552 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
Benjamin Peterson29060642009-01-31 22:14:21 +000011553 if (j > PY_SSIZE_T_MAX - incr)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011554 goto overflow;
11555 line_pos += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011556 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011557 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011558 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011559 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000011560 if (j > PY_SSIZE_T_MAX - 1)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011561 goto overflow;
11562 line_pos++;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011563 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011564 if (ch == '\n' || ch == '\r')
11565 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011566 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011567 }
Victor Stinnerc4b49542011-12-11 22:44:26 +010011568 if (!found)
11569 return unicode_result_unchanged(self);
Guido van Rossumcd16bf62007-06-13 18:07:49 +000011570
Guido van Rossumd57fd912000-03-10 22:53:23 +000011571 /* Second pass: create output string and fill it */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011572 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011573 if (!u)
11574 return NULL;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011575 dest_data = PyUnicode_DATA(u);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011576
Antoine Pitroue71d5742011-10-04 15:55:09 +020011577 i = j = line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011578
Antoine Pitroue71d5742011-10-04 15:55:09 +020011579 for (; i < src_len; i++) {
11580 ch = PyUnicode_READ(kind, src_data, i);
11581 if (ch == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000011582 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011583 incr = tabsize - (line_pos % tabsize);
11584 line_pos += incr;
Victor Stinner59423e32018-11-26 13:40:01 +010011585 unicode_fill(kind, dest_data, ' ', j, incr);
Victor Stinnerda79e632012-02-22 13:37:04 +010011586 j += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011587 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011588 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011589 else {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011590 line_pos++;
11591 PyUnicode_WRITE(kind, dest_data, j, ch);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011592 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011593 if (ch == '\n' || ch == '\r')
11594 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011595 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011596 }
11597 assert (j == PyUnicode_GET_LENGTH(u));
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010011598 return unicode_result(u);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011599
Antoine Pitroue71d5742011-10-04 15:55:09 +020011600 overflow:
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011601 PyErr_SetString(PyExc_OverflowError, "new string is too long");
11602 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011603}
11604
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011605PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011606 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011607\n\
11608Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080011609such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011610arguments start and end are interpreted as in slice notation.\n\
11611\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011612Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011613
11614static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011615unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011616{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011617 /* initialize variables to prevent gcc warning */
11618 PyObject *substring = NULL;
11619 Py_ssize_t start = 0;
11620 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011621 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011622
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011623 if (!parse_args_finds_unicode("find", args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011624 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011625
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011626 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011627 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011628
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011629 result = any_find_slice(self, substring, start, end, 1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011630
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011631 if (result == -2)
11632 return NULL;
11633
Christian Heimes217cfd12007-12-02 14:31:20 +000011634 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011635}
11636
11637static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011638unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011639{
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011640 void *data;
11641 enum PyUnicode_Kind kind;
11642 Py_UCS4 ch;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011643
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +030011644 if (!PyUnicode_Check(self)) {
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011645 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011646 return NULL;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011647 }
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +030011648 if (PyUnicode_READY(self) == -1) {
11649 return NULL;
11650 }
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011651 if (index < 0 || index >= PyUnicode_GET_LENGTH(self)) {
11652 PyErr_SetString(PyExc_IndexError, "string index out of range");
11653 return NULL;
11654 }
11655 kind = PyUnicode_KIND(self);
11656 data = PyUnicode_DATA(self);
11657 ch = PyUnicode_READ(kind, data, index);
Victor Stinner985a82a2014-01-03 12:53:47 +010011658 return unicode_char(ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011659}
11660
Guido van Rossumc2504932007-09-18 19:42:40 +000011661/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +010011662 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +000011663static Py_hash_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011664unicode_hash(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011665{
Gregory P. Smith27cbcd62012-12-10 18:15:46 -080011666 Py_uhash_t x; /* Unsigned for defined overflow behavior. */
Guido van Rossumc2504932007-09-18 19:42:40 +000011667
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011668#ifdef Py_DEBUG
Benjamin Peterson69e97272012-02-21 11:08:50 -050011669 assert(_Py_HashSecret_Initialized);
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011670#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011671 if (_PyUnicode_HASH(self) != -1)
11672 return _PyUnicode_HASH(self);
11673 if (PyUnicode_READY(self) == -1)
11674 return -1;
animalizea1d14252019-01-02 20:16:06 +080011675
Christian Heimes985ecdc2013-11-20 11:46:18 +010011676 x = _Py_HashBytes(PyUnicode_DATA(self),
11677 PyUnicode_GET_LENGTH(self) * PyUnicode_KIND(self));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011678 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +000011679 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011680}
11681
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011682PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011683 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011684\n\
oldkaa0735f2018-02-02 16:52:55 +080011685Return the lowest index in S where substring sub is found,\n\
Lisa Roach43ba8862017-04-04 22:36:22 -070011686such that sub is contained within S[start:end]. Optional\n\
11687arguments start and end are interpreted as in slice notation.\n\
11688\n\
11689Raises ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011690
11691static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011692unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011693{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011694 /* initialize variables to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +000011695 Py_ssize_t result;
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011696 PyObject *substring = NULL;
11697 Py_ssize_t start = 0;
11698 Py_ssize_t end = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011699
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011700 if (!parse_args_finds_unicode("index", args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011701 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011702
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011703 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011704 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011705
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011706 result = any_find_slice(self, substring, start, end, 1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011707
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011708 if (result == -2)
11709 return NULL;
11710
Guido van Rossumd57fd912000-03-10 22:53:23 +000011711 if (result < 0) {
11712 PyErr_SetString(PyExc_ValueError, "substring not found");
11713 return NULL;
11714 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011715
Christian Heimes217cfd12007-12-02 14:31:20 +000011716 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011717}
11718
INADA Naoki3ae20562017-01-16 20:41:20 +090011719/*[clinic input]
INADA Naokia49ac992018-01-27 14:06:21 +090011720str.isascii as unicode_isascii
11721
11722Return True if all characters in the string are ASCII, False otherwise.
11723
11724ASCII characters have code points in the range U+0000-U+007F.
11725Empty string is ASCII too.
11726[clinic start generated code]*/
11727
11728static PyObject *
11729unicode_isascii_impl(PyObject *self)
11730/*[clinic end generated code: output=c5910d64b5a8003f input=5a43cbc6399621d5]*/
11731{
11732 if (PyUnicode_READY(self) == -1) {
11733 return NULL;
11734 }
11735 return PyBool_FromLong(PyUnicode_IS_ASCII(self));
11736}
11737
11738/*[clinic input]
INADA Naoki3ae20562017-01-16 20:41:20 +090011739str.islower as unicode_islower
Guido van Rossumd57fd912000-03-10 22:53:23 +000011740
INADA Naoki3ae20562017-01-16 20:41:20 +090011741Return True if the string is a lowercase string, False otherwise.
11742
11743A string is lowercase if all cased characters in the string are lowercase and
11744there is at least one cased character in the string.
11745[clinic start generated code]*/
11746
11747static PyObject *
11748unicode_islower_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011749/*[clinic end generated code: output=dbd41995bd005b81 input=acec65ac6821ae47]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011750{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011751 Py_ssize_t i, length;
11752 int kind;
11753 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011754 int cased;
11755
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011756 if (PyUnicode_READY(self) == -1)
11757 return NULL;
11758 length = PyUnicode_GET_LENGTH(self);
11759 kind = PyUnicode_KIND(self);
11760 data = PyUnicode_DATA(self);
11761
Guido van Rossumd57fd912000-03-10 22:53:23 +000011762 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011763 if (length == 1)
11764 return PyBool_FromLong(
11765 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011766
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011767 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011768 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011769 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011770
Guido van Rossumd57fd912000-03-10 22:53:23 +000011771 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011772 for (i = 0; i < length; i++) {
11773 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011774
Benjamin Peterson29060642009-01-31 22:14:21 +000011775 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011776 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000011777 else if (!cased && Py_UNICODE_ISLOWER(ch))
11778 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011779 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011780 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011781}
11782
INADA Naoki3ae20562017-01-16 20:41:20 +090011783/*[clinic input]
11784str.isupper as unicode_isupper
Guido van Rossumd57fd912000-03-10 22:53:23 +000011785
INADA Naoki3ae20562017-01-16 20:41:20 +090011786Return True if the string is an uppercase string, False otherwise.
11787
11788A string is uppercase if all cased characters in the string are uppercase and
11789there is at least one cased character in the string.
11790[clinic start generated code]*/
11791
11792static PyObject *
11793unicode_isupper_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011794/*[clinic end generated code: output=049209c8e7f15f59 input=e9b1feda5d17f2d3]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011795{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011796 Py_ssize_t i, length;
11797 int kind;
11798 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011799 int cased;
11800
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011801 if (PyUnicode_READY(self) == -1)
11802 return NULL;
11803 length = PyUnicode_GET_LENGTH(self);
11804 kind = PyUnicode_KIND(self);
11805 data = PyUnicode_DATA(self);
11806
Guido van Rossumd57fd912000-03-10 22:53:23 +000011807 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011808 if (length == 1)
11809 return PyBool_FromLong(
11810 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011811
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011812 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011813 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011814 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011815
Guido van Rossumd57fd912000-03-10 22:53:23 +000011816 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011817 for (i = 0; i < length; i++) {
11818 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011819
Benjamin Peterson29060642009-01-31 22:14:21 +000011820 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011821 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000011822 else if (!cased && Py_UNICODE_ISUPPER(ch))
11823 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011824 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011825 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011826}
11827
INADA Naoki3ae20562017-01-16 20:41:20 +090011828/*[clinic input]
11829str.istitle as unicode_istitle
Guido van Rossumd57fd912000-03-10 22:53:23 +000011830
INADA Naoki3ae20562017-01-16 20:41:20 +090011831Return True if the string is a title-cased string, False otherwise.
11832
11833In a title-cased string, upper- and title-case characters may only
11834follow uncased characters and lowercase characters only cased ones.
11835[clinic start generated code]*/
11836
11837static PyObject *
11838unicode_istitle_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011839/*[clinic end generated code: output=e9bf6eb91f5d3f0e input=98d32bd2e1f06f8c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011840{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011841 Py_ssize_t i, length;
11842 int kind;
11843 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011844 int cased, previous_is_cased;
11845
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011846 if (PyUnicode_READY(self) == -1)
11847 return NULL;
11848 length = PyUnicode_GET_LENGTH(self);
11849 kind = PyUnicode_KIND(self);
11850 data = PyUnicode_DATA(self);
11851
Guido van Rossumd57fd912000-03-10 22:53:23 +000011852 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011853 if (length == 1) {
11854 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11855 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
11856 (Py_UNICODE_ISUPPER(ch) != 0));
11857 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011858
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011859 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011860 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011861 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011862
Guido van Rossumd57fd912000-03-10 22:53:23 +000011863 cased = 0;
11864 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011865 for (i = 0; i < length; i++) {
11866 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011867
Benjamin Peterson29060642009-01-31 22:14:21 +000011868 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
11869 if (previous_is_cased)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011870 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000011871 previous_is_cased = 1;
11872 cased = 1;
11873 }
11874 else if (Py_UNICODE_ISLOWER(ch)) {
11875 if (!previous_is_cased)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011876 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000011877 previous_is_cased = 1;
11878 cased = 1;
11879 }
11880 else
11881 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011882 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011883 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011884}
11885
INADA Naoki3ae20562017-01-16 20:41:20 +090011886/*[clinic input]
11887str.isspace as unicode_isspace
Guido van Rossumd57fd912000-03-10 22:53:23 +000011888
INADA Naoki3ae20562017-01-16 20:41:20 +090011889Return True if the string is a whitespace string, False otherwise.
11890
11891A string is whitespace if all characters in the string are whitespace and there
11892is at least one character in the string.
11893[clinic start generated code]*/
11894
11895static PyObject *
11896unicode_isspace_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011897/*[clinic end generated code: output=163a63bfa08ac2b9 input=fe462cb74f8437d8]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011898{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011899 Py_ssize_t i, length;
11900 int kind;
11901 void *data;
11902
11903 if (PyUnicode_READY(self) == -1)
11904 return NULL;
11905 length = PyUnicode_GET_LENGTH(self);
11906 kind = PyUnicode_KIND(self);
11907 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011908
Guido van Rossumd57fd912000-03-10 22:53:23 +000011909 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011910 if (length == 1)
11911 return PyBool_FromLong(
11912 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011913
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011914 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011915 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011916 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011917
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011918 for (i = 0; i < length; i++) {
11919 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011920 if (!Py_UNICODE_ISSPACE(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011921 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011922 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011923 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011924}
11925
INADA Naoki3ae20562017-01-16 20:41:20 +090011926/*[clinic input]
11927str.isalpha as unicode_isalpha
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011928
INADA Naoki3ae20562017-01-16 20:41:20 +090011929Return True if the string is an alphabetic string, False otherwise.
11930
11931A string is alphabetic if all characters in the string are alphabetic and there
11932is at least one character in the string.
11933[clinic start generated code]*/
11934
11935static PyObject *
11936unicode_isalpha_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011937/*[clinic end generated code: output=cc81b9ac3883ec4f input=d0fd18a96cbca5eb]*/
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011938{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011939 Py_ssize_t i, length;
11940 int kind;
11941 void *data;
11942
11943 if (PyUnicode_READY(self) == -1)
11944 return NULL;
11945 length = PyUnicode_GET_LENGTH(self);
11946 kind = PyUnicode_KIND(self);
11947 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011948
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011949 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011950 if (length == 1)
11951 return PyBool_FromLong(
11952 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011953
11954 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011955 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011956 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011957
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011958 for (i = 0; i < length; i++) {
11959 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011960 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011961 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011962 Py_RETURN_TRUE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011963}
11964
INADA Naoki3ae20562017-01-16 20:41:20 +090011965/*[clinic input]
11966str.isalnum as unicode_isalnum
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011967
INADA Naoki3ae20562017-01-16 20:41:20 +090011968Return True if the string is an alpha-numeric string, False otherwise.
11969
11970A string is alpha-numeric if all characters in the string are alpha-numeric and
11971there is at least one character in the string.
11972[clinic start generated code]*/
11973
11974static PyObject *
11975unicode_isalnum_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011976/*[clinic end generated code: output=a5a23490ffc3660c input=5c6579bf2e04758c]*/
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011977{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011978 int kind;
11979 void *data;
11980 Py_ssize_t len, i;
11981
11982 if (PyUnicode_READY(self) == -1)
11983 return NULL;
11984
11985 kind = PyUnicode_KIND(self);
11986 data = PyUnicode_DATA(self);
11987 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011988
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011989 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011990 if (len == 1) {
11991 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11992 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
11993 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011994
11995 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011996 if (len == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011997 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011998
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011999 for (i = 0; i < len; i++) {
12000 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030012001 if (!Py_UNICODE_ISALNUM(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012002 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012003 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012004 Py_RETURN_TRUE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012005}
12006
INADA Naoki3ae20562017-01-16 20:41:20 +090012007/*[clinic input]
12008str.isdecimal as unicode_isdecimal
Guido van Rossumd57fd912000-03-10 22:53:23 +000012009
INADA Naoki3ae20562017-01-16 20:41:20 +090012010Return True if the string is a decimal string, False otherwise.
12011
12012A string is a decimal string if all characters in the string are decimal and
12013there is at least one character in the string.
12014[clinic start generated code]*/
12015
12016static PyObject *
12017unicode_isdecimal_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012018/*[clinic end generated code: output=fb2dcdb62d3fc548 input=336bc97ab4c8268f]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012019{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012020 Py_ssize_t i, length;
12021 int kind;
12022 void *data;
12023
12024 if (PyUnicode_READY(self) == -1)
12025 return NULL;
12026 length = PyUnicode_GET_LENGTH(self);
12027 kind = PyUnicode_KIND(self);
12028 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012029
Guido van Rossumd57fd912000-03-10 22:53:23 +000012030 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012031 if (length == 1)
12032 return PyBool_FromLong(
12033 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012034
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012035 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012036 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012037 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012038
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012039 for (i = 0; i < length; i++) {
12040 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012041 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012042 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012043 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012044}
12045
INADA Naoki3ae20562017-01-16 20:41:20 +090012046/*[clinic input]
12047str.isdigit as unicode_isdigit
Guido van Rossumd57fd912000-03-10 22:53:23 +000012048
INADA Naoki3ae20562017-01-16 20:41:20 +090012049Return True if the string is a digit string, False otherwise.
12050
12051A string is a digit string if all characters in the string are digits and there
12052is at least one character in the string.
12053[clinic start generated code]*/
12054
12055static PyObject *
12056unicode_isdigit_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012057/*[clinic end generated code: output=10a6985311da6858 input=901116c31deeea4c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012058{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012059 Py_ssize_t i, length;
12060 int kind;
12061 void *data;
12062
12063 if (PyUnicode_READY(self) == -1)
12064 return NULL;
12065 length = PyUnicode_GET_LENGTH(self);
12066 kind = PyUnicode_KIND(self);
12067 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012068
Guido van Rossumd57fd912000-03-10 22:53:23 +000012069 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012070 if (length == 1) {
12071 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
12072 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
12073 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012074
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012075 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012076 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012077 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012078
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012079 for (i = 0; i < length; i++) {
12080 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012081 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012082 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012083 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012084}
12085
INADA Naoki3ae20562017-01-16 20:41:20 +090012086/*[clinic input]
12087str.isnumeric as unicode_isnumeric
Guido van Rossumd57fd912000-03-10 22:53:23 +000012088
INADA Naoki3ae20562017-01-16 20:41:20 +090012089Return True if the string is a numeric string, False otherwise.
12090
12091A string is numeric if all characters in the string are numeric and there is at
12092least one character in the string.
12093[clinic start generated code]*/
12094
12095static PyObject *
12096unicode_isnumeric_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012097/*[clinic end generated code: output=9172a32d9013051a input=722507db976f826c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012098{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012099 Py_ssize_t i, length;
12100 int kind;
12101 void *data;
12102
12103 if (PyUnicode_READY(self) == -1)
12104 return NULL;
12105 length = PyUnicode_GET_LENGTH(self);
12106 kind = PyUnicode_KIND(self);
12107 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012108
Guido van Rossumd57fd912000-03-10 22:53:23 +000012109 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012110 if (length == 1)
12111 return PyBool_FromLong(
12112 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012113
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012114 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012115 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012116 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012117
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012118 for (i = 0; i < length; i++) {
12119 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012120 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012121 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012122 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012123}
12124
Martin v. Löwis47383402007-08-15 07:32:56 +000012125int
12126PyUnicode_IsIdentifier(PyObject *self)
12127{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012128 int kind;
12129 void *data;
12130 Py_ssize_t i;
Ezio Melotti93e7afc2011-08-22 14:08:38 +030012131 Py_UCS4 first;
Martin v. Löwis47383402007-08-15 07:32:56 +000012132
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012133 if (PyUnicode_READY(self) == -1) {
12134 Py_FatalError("identifier not ready");
Benjamin Peterson29060642009-01-31 22:14:21 +000012135 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012136 }
12137
12138 /* Special case for empty strings */
12139 if (PyUnicode_GET_LENGTH(self) == 0)
12140 return 0;
12141 kind = PyUnicode_KIND(self);
12142 data = PyUnicode_DATA(self);
Martin v. Löwis47383402007-08-15 07:32:56 +000012143
12144 /* PEP 3131 says that the first character must be in
12145 XID_Start and subsequent characters in XID_Continue,
12146 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000012147 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000012148 letters, digits, underscore). However, given the current
12149 definition of XID_Start and XID_Continue, it is sufficient
12150 to check just for these, except that _ must be allowed
12151 as starting an identifier. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012152 first = PyUnicode_READ(kind, data, 0);
Benjamin Petersonf413b802011-08-12 22:17:18 -050012153 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
Martin v. Löwis47383402007-08-15 07:32:56 +000012154 return 0;
12155
Benjamin Peterson9c6e6a02011-09-28 08:09:05 -040012156 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012157 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000012158 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +000012159 return 1;
12160}
12161
INADA Naoki3ae20562017-01-16 20:41:20 +090012162/*[clinic input]
12163str.isidentifier as unicode_isidentifier
Martin v. Löwis47383402007-08-15 07:32:56 +000012164
INADA Naoki3ae20562017-01-16 20:41:20 +090012165Return True if the string is a valid Python identifier, False otherwise.
12166
Sanyam Khuranaffc5a142018-10-08 12:23:32 +053012167Call keyword.iskeyword(s) to test whether string s is a reserved identifier,
Emanuele Gaifasfc8205c2018-10-08 12:44:47 +020012168such as "def" or "class".
INADA Naoki3ae20562017-01-16 20:41:20 +090012169[clinic start generated code]*/
12170
12171static PyObject *
12172unicode_isidentifier_impl(PyObject *self)
Emanuele Gaifasfc8205c2018-10-08 12:44:47 +020012173/*[clinic end generated code: output=fe585a9666572905 input=2d807a104f21c0c5]*/
Martin v. Löwis47383402007-08-15 07:32:56 +000012174{
12175 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
12176}
12177
INADA Naoki3ae20562017-01-16 20:41:20 +090012178/*[clinic input]
12179str.isprintable as unicode_isprintable
Georg Brandl559e5d72008-06-11 18:37:52 +000012180
INADA Naoki3ae20562017-01-16 20:41:20 +090012181Return True if the string is printable, False otherwise.
12182
12183A string is printable if all of its characters are considered printable in
12184repr() or if it is empty.
12185[clinic start generated code]*/
12186
12187static PyObject *
12188unicode_isprintable_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012189/*[clinic end generated code: output=3ab9626cd32dd1a0 input=98a0e1c2c1813209]*/
Georg Brandl559e5d72008-06-11 18:37:52 +000012190{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012191 Py_ssize_t i, length;
12192 int kind;
12193 void *data;
12194
12195 if (PyUnicode_READY(self) == -1)
12196 return NULL;
12197 length = PyUnicode_GET_LENGTH(self);
12198 kind = PyUnicode_KIND(self);
12199 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000012200
12201 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012202 if (length == 1)
12203 return PyBool_FromLong(
12204 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000012205
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012206 for (i = 0; i < length; i++) {
12207 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000012208 Py_RETURN_FALSE;
12209 }
12210 }
12211 Py_RETURN_TRUE;
12212}
12213
INADA Naoki3ae20562017-01-16 20:41:20 +090012214/*[clinic input]
12215str.join as unicode_join
Guido van Rossumd57fd912000-03-10 22:53:23 +000012216
INADA Naoki3ae20562017-01-16 20:41:20 +090012217 iterable: object
12218 /
12219
12220Concatenate any number of strings.
12221
Martin Panter91a88662017-01-24 00:30:06 +000012222The string whose method is called is inserted in between each given string.
INADA Naoki3ae20562017-01-16 20:41:20 +090012223The result is returned as a new string.
12224
12225Example: '.'.join(['ab', 'pq', 'rs']) -> 'ab.pq.rs'
12226[clinic start generated code]*/
12227
12228static PyObject *
12229unicode_join(PyObject *self, PyObject *iterable)
Martin Panter91a88662017-01-24 00:30:06 +000012230/*[clinic end generated code: output=6857e7cecfe7bf98 input=2f70422bfb8fa189]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012231{
INADA Naoki3ae20562017-01-16 20:41:20 +090012232 return PyUnicode_Join(self, iterable);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012233}
12234
Martin v. Löwis18e16552006-02-15 17:27:45 +000012235static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012236unicode_length(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012237{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012238 if (PyUnicode_READY(self) == -1)
12239 return -1;
12240 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012241}
12242
INADA Naoki3ae20562017-01-16 20:41:20 +090012243/*[clinic input]
12244str.ljust as unicode_ljust
12245
12246 width: Py_ssize_t
12247 fillchar: Py_UCS4 = ' '
12248 /
12249
12250Return a left-justified string of length width.
12251
12252Padding is done using the specified fill character (default is a space).
12253[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012254
12255static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012256unicode_ljust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
12257/*[clinic end generated code: output=1cce0e0e0a0b84b3 input=3ab599e335e60a32]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012258{
Benjamin Petersonbac79492012-01-14 13:34:47 -050012259 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012260 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012261
Victor Stinnerc4b49542011-12-11 22:44:26 +010012262 if (PyUnicode_GET_LENGTH(self) >= width)
12263 return unicode_result_unchanged(self);
12264
12265 return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012266}
12267
INADA Naoki3ae20562017-01-16 20:41:20 +090012268/*[clinic input]
12269str.lower as unicode_lower
Guido van Rossumd57fd912000-03-10 22:53:23 +000012270
INADA Naoki3ae20562017-01-16 20:41:20 +090012271Return a copy of the string converted to lowercase.
12272[clinic start generated code]*/
12273
12274static PyObject *
12275unicode_lower_impl(PyObject *self)
12276/*[clinic end generated code: output=84ef9ed42efad663 input=60a2984b8beff23a]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012277{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050012278 if (PyUnicode_READY(self) == -1)
12279 return NULL;
12280 if (PyUnicode_IS_ASCII(self))
12281 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010012282 return case_operation(self, do_lower);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012283}
12284
/* Strip types: which end(s) of the string a strip operation works on. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Method names, indexed by the strip type constants above. */
static const char *stripfuncnames[] = {
    "lstrip",
    "rstrip",
    "strip",
};

/* Method name that belongs to strip type i. */
#define STRIPNAME(i) (stripfuncnames[i])
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012293
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012294/* externally visible for str.strip(unicode) */
12295PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012296_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012297{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012298 void *data;
12299 int kind;
12300 Py_ssize_t i, j, len;
12301 BLOOM_MASK sepmask;
Victor Stinnerb3a60142013-04-09 22:19:21 +020012302 Py_ssize_t seplen;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012303
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012304 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
12305 return NULL;
12306
12307 kind = PyUnicode_KIND(self);
12308 data = PyUnicode_DATA(self);
12309 len = PyUnicode_GET_LENGTH(self);
Victor Stinnerb3a60142013-04-09 22:19:21 +020012310 seplen = PyUnicode_GET_LENGTH(sepobj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012311 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
12312 PyUnicode_DATA(sepobj),
Victor Stinnerb3a60142013-04-09 22:19:21 +020012313 seplen);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012314
Benjamin Peterson14339b62009-01-31 16:36:08 +000012315 i = 0;
12316 if (striptype != RIGHTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020012317 while (i < len) {
12318 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12319 if (!BLOOM(sepmask, ch))
12320 break;
12321 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12322 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000012323 i++;
12324 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012325 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012326
Benjamin Peterson14339b62009-01-31 16:36:08 +000012327 j = len;
12328 if (striptype != LEFTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020012329 j--;
12330 while (j >= i) {
12331 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12332 if (!BLOOM(sepmask, ch))
12333 break;
12334 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12335 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000012336 j--;
Victor Stinnerb3a60142013-04-09 22:19:21 +020012337 }
12338
Benjamin Peterson29060642009-01-31 22:14:21 +000012339 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012340 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012341
Victor Stinner7931d9a2011-11-04 00:22:48 +010012342 return PyUnicode_Substring(self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012343}
12344
12345PyObject*
12346PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
12347{
12348 unsigned char *data;
12349 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020012350 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012351
Victor Stinnerde636f32011-10-01 03:55:54 +020012352 if (PyUnicode_READY(self) == -1)
12353 return NULL;
12354
Victor Stinner684d5fd2012-05-03 02:32:34 +020012355 length = PyUnicode_GET_LENGTH(self);
12356 end = Py_MIN(end, length);
Victor Stinnerde636f32011-10-01 03:55:54 +020012357
Victor Stinner684d5fd2012-05-03 02:32:34 +020012358 if (start == 0 && end == length)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012359 return unicode_result_unchanged(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012360
Victor Stinnerde636f32011-10-01 03:55:54 +020012361 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020012362 PyErr_SetString(PyExc_IndexError, "string index out of range");
12363 return NULL;
12364 }
Serhiy Storchaka678db842013-01-26 12:16:36 +020012365 if (start >= length || end < start)
12366 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner12bab6d2011-10-01 01:53:49 +020012367
Victor Stinner684d5fd2012-05-03 02:32:34 +020012368 length = end - start;
Victor Stinnerb9275c12011-10-05 14:01:42 +020012369 if (PyUnicode_IS_ASCII(self)) {
Victor Stinnerb9275c12011-10-05 14:01:42 +020012370 data = PyUnicode_1BYTE_DATA(self);
Victor Stinnerd3f08822012-05-29 12:57:52 +020012371 return _PyUnicode_FromASCII((char*)(data + start), length);
Victor Stinnerb9275c12011-10-05 14:01:42 +020012372 }
12373 else {
12374 kind = PyUnicode_KIND(self);
12375 data = PyUnicode_1BYTE_DATA(self);
12376 return PyUnicode_FromKindAndData(kind,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012377 data + kind * start,
Victor Stinnerb9275c12011-10-05 14:01:42 +020012378 length);
12379 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012380}
Guido van Rossumd57fd912000-03-10 22:53:23 +000012381
12382static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012383do_strip(PyObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012384{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012385 Py_ssize_t len, i, j;
12386
12387 if (PyUnicode_READY(self) == -1)
12388 return NULL;
12389
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012390 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012391
Victor Stinnercc7af722013-04-09 22:39:24 +020012392 if (PyUnicode_IS_ASCII(self)) {
12393 Py_UCS1 *data = PyUnicode_1BYTE_DATA(self);
12394
12395 i = 0;
12396 if (striptype != RIGHTSTRIP) {
12397 while (i < len) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020012398 Py_UCS1 ch = data[i];
Victor Stinnercc7af722013-04-09 22:39:24 +020012399 if (!_Py_ascii_whitespace[ch])
12400 break;
12401 i++;
12402 }
12403 }
12404
12405 j = len;
12406 if (striptype != LEFTSTRIP) {
12407 j--;
12408 while (j >= i) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020012409 Py_UCS1 ch = data[j];
Victor Stinnercc7af722013-04-09 22:39:24 +020012410 if (!_Py_ascii_whitespace[ch])
12411 break;
12412 j--;
12413 }
12414 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012415 }
12416 }
Victor Stinnercc7af722013-04-09 22:39:24 +020012417 else {
12418 int kind = PyUnicode_KIND(self);
12419 void *data = PyUnicode_DATA(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012420
Victor Stinnercc7af722013-04-09 22:39:24 +020012421 i = 0;
12422 if (striptype != RIGHTSTRIP) {
12423 while (i < len) {
12424 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12425 if (!Py_UNICODE_ISSPACE(ch))
12426 break;
12427 i++;
12428 }
Victor Stinner9c79e412013-04-09 22:21:08 +020012429 }
Victor Stinnercc7af722013-04-09 22:39:24 +020012430
12431 j = len;
12432 if (striptype != LEFTSTRIP) {
12433 j--;
12434 while (j >= i) {
12435 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12436 if (!Py_UNICODE_ISSPACE(ch))
12437 break;
12438 j--;
12439 }
12440 j++;
12441 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012442 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012443
Victor Stinner7931d9a2011-11-04 00:22:48 +010012444 return PyUnicode_Substring(self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012445}
12446
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012447
12448static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012449do_argstrip(PyObject *self, int striptype, PyObject *sep)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012450{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012451 if (sep != NULL && sep != Py_None) {
12452 if (PyUnicode_Check(sep))
12453 return _PyUnicode_XStrip(self, striptype, sep);
12454 else {
12455 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000012456 "%s arg must be None or str",
12457 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000012458 return NULL;
12459 }
12460 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012461
Benjamin Peterson14339b62009-01-31 16:36:08 +000012462 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012463}
12464
12465
INADA Naoki3ae20562017-01-16 20:41:20 +090012466/*[clinic input]
12467str.strip as unicode_strip
12468
12469 chars: object = None
12470 /
12471
Victor Stinner0c4a8282017-01-17 02:21:47 +010012472Return a copy of the string with leading and trailing whitespace removed.
INADA Naoki3ae20562017-01-16 20:41:20 +090012473
12474If chars is given and not None, remove characters in chars instead.
12475[clinic start generated code]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012476
12477static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012478unicode_strip_impl(PyObject *self, PyObject *chars)
Victor Stinner0c4a8282017-01-17 02:21:47 +010012479/*[clinic end generated code: output=ca19018454345d57 input=eefe24a1059c352b]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012480{
INADA Naoki3ae20562017-01-16 20:41:20 +090012481 return do_argstrip(self, BOTHSTRIP, chars);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012482}
12483
12484
INADA Naoki3ae20562017-01-16 20:41:20 +090012485/*[clinic input]
12486str.lstrip as unicode_lstrip
12487
12488 chars: object = NULL
12489 /
12490
12491Return a copy of the string with leading whitespace removed.
12492
12493If chars is given and not None, remove characters in chars instead.
12494[clinic start generated code]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012495
12496static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012497unicode_lstrip_impl(PyObject *self, PyObject *chars)
12498/*[clinic end generated code: output=3b43683251f79ca7 input=9e56f3c45f5ff4c3]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012499{
INADA Naoki3ae20562017-01-16 20:41:20 +090012500 return do_argstrip(self, LEFTSTRIP, chars);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012501}
12502
12503
INADA Naoki3ae20562017-01-16 20:41:20 +090012504/*[clinic input]
12505str.rstrip as unicode_rstrip
12506
12507 chars: object = NULL
12508 /
12509
12510Return a copy of the string with trailing whitespace removed.
12511
12512If chars is given and not None, remove characters in chars instead.
12513[clinic start generated code]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012514
12515static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012516unicode_rstrip_impl(PyObject *self, PyObject *chars)
12517/*[clinic end generated code: output=4a59230017cc3b7a input=ac89d0219cb411ee]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012518{
INADA Naoki3ae20562017-01-16 20:41:20 +090012519 return do_argstrip(self, RIGHTSTRIP, chars);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012520}
12521
12522
Guido van Rossumd57fd912000-03-10 22:53:23 +000012523static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012524unicode_repeat(PyObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012525{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012526 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012527 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012528
Serhiy Storchaka05997252013-01-26 12:14:02 +020012529 if (len < 1)
12530 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +000012531
Victor Stinnerc4b49542011-12-11 22:44:26 +010012532 /* no repeat, return original string */
12533 if (len == 1)
12534 return unicode_result_unchanged(str);
Tim Peters8f422462000-09-09 06:13:41 +000012535
Benjamin Petersonbac79492012-01-14 13:34:47 -050012536 if (PyUnicode_READY(str) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012537 return NULL;
12538
Victor Stinnerc759f3e2011-10-01 03:09:58 +020012539 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020012540 PyErr_SetString(PyExc_OverflowError,
12541 "repeated string is too long");
12542 return NULL;
12543 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012544 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012545
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012546 u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012547 if (!u)
12548 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012549 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012550
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012551 if (PyUnicode_GET_LENGTH(str) == 1) {
12552 const int kind = PyUnicode_KIND(str);
12553 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
Victor Stinner73f53b52011-12-18 03:26:31 +010012554 if (kind == PyUnicode_1BYTE_KIND) {
12555 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012556 memset(to, (unsigned char)fill_char, len);
Victor Stinner73f53b52011-12-18 03:26:31 +010012557 }
12558 else if (kind == PyUnicode_2BYTE_KIND) {
12559 Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012560 for (n = 0; n < len; ++n)
Victor Stinner73f53b52011-12-18 03:26:31 +010012561 ucs2[n] = fill_char;
12562 } else {
12563 Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u);
12564 assert(kind == PyUnicode_4BYTE_KIND);
12565 for (n = 0; n < len; ++n)
12566 ucs4[n] = fill_char;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012567 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012568 }
12569 else {
12570 /* number of characters copied this far */
12571 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012572 const Py_ssize_t char_size = PyUnicode_KIND(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012573 char *to = (char *) PyUnicode_DATA(u);
Christian Heimesf051e432016-09-13 20:22:02 +020012574 memcpy(to, PyUnicode_DATA(str),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012575 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000012576 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012577 n = (done <= nchars-done) ? done : nchars-done;
Christian Heimesf051e432016-09-13 20:22:02 +020012578 memcpy(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012579 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000012580 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012581 }
12582
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012583 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012584 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012585}
12586
Alexander Belopolsky40018472011-02-26 01:02:56 +000012587PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012588PyUnicode_Replace(PyObject *str,
12589 PyObject *substr,
12590 PyObject *replstr,
Alexander Belopolsky40018472011-02-26 01:02:56 +000012591 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012592{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012593 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0 ||
12594 ensure_unicode(replstr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012595 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012596 return replace(str, substr, replstr, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012597}
12598
INADA Naoki3ae20562017-01-16 20:41:20 +090012599/*[clinic input]
12600str.replace as unicode_replace
Guido van Rossumd57fd912000-03-10 22:53:23 +000012601
INADA Naoki3ae20562017-01-16 20:41:20 +090012602 old: unicode
12603 new: unicode
12604 count: Py_ssize_t = -1
12605 Maximum number of occurrences to replace.
12606 -1 (the default value) means replace all occurrences.
12607 /
12608
12609Return a copy with all occurrences of substring old replaced by new.
12610
12611If the optional argument count is given, only the first count occurrences are
12612replaced.
12613[clinic start generated code]*/
12614
12615static PyObject *
12616unicode_replace_impl(PyObject *self, PyObject *old, PyObject *new,
12617 Py_ssize_t count)
12618/*[clinic end generated code: output=b63f1a8b5eebf448 input=147d12206276ebeb]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012619{
Benjamin Peterson22a29702012-01-02 09:00:30 -060012620 if (PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000012621 return NULL;
INADA Naoki3ae20562017-01-16 20:41:20 +090012622 return replace(self, old, new, count);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012623}
12624
Alexander Belopolsky40018472011-02-26 01:02:56 +000012625static PyObject *
12626unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012627{
Walter Dörwald79e913e2007-05-12 11:08:06 +000012628 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012629 Py_ssize_t isize;
12630 Py_ssize_t osize, squote, dquote, i, o;
12631 Py_UCS4 max, quote;
Victor Stinner55c08782013-04-14 18:45:39 +020012632 int ikind, okind, unchanged;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012633 void *idata, *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000012634
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012635 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000012636 return NULL;
12637
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012638 isize = PyUnicode_GET_LENGTH(unicode);
12639 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012640
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012641 /* Compute length of output, quote characters, and
12642 maximum character */
Victor Stinner55c08782013-04-14 18:45:39 +020012643 osize = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012644 max = 127;
12645 squote = dquote = 0;
12646 ikind = PyUnicode_KIND(unicode);
12647 for (i = 0; i < isize; i++) {
12648 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Benjamin Peterson736b8012014-09-29 23:02:15 -040012649 Py_ssize_t incr = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012650 switch (ch) {
Benjamin Peterson736b8012014-09-29 23:02:15 -040012651 case '\'': squote++; break;
12652 case '"': dquote++; break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012653 case '\\': case '\t': case '\r': case '\n':
Benjamin Peterson736b8012014-09-29 23:02:15 -040012654 incr = 2;
12655 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012656 default:
12657 /* Fast-path ASCII */
12658 if (ch < ' ' || ch == 0x7f)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012659 incr = 4; /* \xHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012660 else if (ch < 0x7f)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012661 ;
12662 else if (Py_UNICODE_ISPRINTABLE(ch))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012663 max = ch > max ? ch : max;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012664 else if (ch < 0x100)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012665 incr = 4; /* \xHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012666 else if (ch < 0x10000)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012667 incr = 6; /* \uHHHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012668 else
Benjamin Peterson736b8012014-09-29 23:02:15 -040012669 incr = 10; /* \uHHHHHHHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012670 }
Benjamin Peterson736b8012014-09-29 23:02:15 -040012671 if (osize > PY_SSIZE_T_MAX - incr) {
12672 PyErr_SetString(PyExc_OverflowError,
12673 "string is too long to generate repr");
12674 return NULL;
12675 }
12676 osize += incr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012677 }
12678
12679 quote = '\'';
Victor Stinner55c08782013-04-14 18:45:39 +020012680 unchanged = (osize == isize);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012681 if (squote) {
Victor Stinner55c08782013-04-14 18:45:39 +020012682 unchanged = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012683 if (dquote)
12684 /* Both squote and dquote present. Use squote,
12685 and escape them */
12686 osize += squote;
12687 else
12688 quote = '"';
12689 }
Victor Stinner55c08782013-04-14 18:45:39 +020012690 osize += 2; /* quotes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012691
12692 repr = PyUnicode_New(osize, max);
12693 if (repr == NULL)
12694 return NULL;
12695 okind = PyUnicode_KIND(repr);
12696 odata = PyUnicode_DATA(repr);
12697
12698 PyUnicode_WRITE(okind, odata, 0, quote);
12699 PyUnicode_WRITE(okind, odata, osize-1, quote);
Victor Stinner55c08782013-04-14 18:45:39 +020012700 if (unchanged) {
12701 _PyUnicode_FastCopyCharacters(repr, 1,
12702 unicode, 0,
12703 isize);
12704 }
12705 else {
12706 for (i = 0, o = 1; i < isize; i++) {
12707 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012708
Victor Stinner55c08782013-04-14 18:45:39 +020012709 /* Escape quotes and backslashes */
12710 if ((ch == quote) || (ch == '\\')) {
Kristján Valur Jónsson55e5dc82012-06-06 21:58:08 +000012711 PyUnicode_WRITE(okind, odata, o++, '\\');
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012712 PyUnicode_WRITE(okind, odata, o++, ch);
Victor Stinner55c08782013-04-14 18:45:39 +020012713 continue;
12714 }
12715
12716 /* Map special whitespace to '\t', \n', '\r' */
12717 if (ch == '\t') {
12718 PyUnicode_WRITE(okind, odata, o++, '\\');
12719 PyUnicode_WRITE(okind, odata, o++, 't');
12720 }
12721 else if (ch == '\n') {
12722 PyUnicode_WRITE(okind, odata, o++, '\\');
12723 PyUnicode_WRITE(okind, odata, o++, 'n');
12724 }
12725 else if (ch == '\r') {
12726 PyUnicode_WRITE(okind, odata, o++, '\\');
12727 PyUnicode_WRITE(okind, odata, o++, 'r');
12728 }
12729
12730 /* Map non-printable US ASCII to '\xhh' */
12731 else if (ch < ' ' || ch == 0x7F) {
12732 PyUnicode_WRITE(okind, odata, o++, '\\');
12733 PyUnicode_WRITE(okind, odata, o++, 'x');
12734 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12735 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12736 }
12737
12738 /* Copy ASCII characters as-is */
12739 else if (ch < 0x7F) {
12740 PyUnicode_WRITE(okind, odata, o++, ch);
12741 }
12742
12743 /* Non-ASCII characters */
12744 else {
12745 /* Map Unicode whitespace and control characters
12746 (categories Z* and C* except ASCII space)
12747 */
12748 if (!Py_UNICODE_ISPRINTABLE(ch)) {
12749 PyUnicode_WRITE(okind, odata, o++, '\\');
12750 /* Map 8-bit characters to '\xhh' */
12751 if (ch <= 0xff) {
12752 PyUnicode_WRITE(okind, odata, o++, 'x');
12753 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12754 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12755 }
12756 /* Map 16-bit characters to '\uxxxx' */
12757 else if (ch <= 0xffff) {
12758 PyUnicode_WRITE(okind, odata, o++, 'u');
12759 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12760 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12761 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12762 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12763 }
12764 /* Map 21-bit characters to '\U00xxxxxx' */
12765 else {
12766 PyUnicode_WRITE(okind, odata, o++, 'U');
12767 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
12768 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
12769 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
12770 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
12771 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12772 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12773 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12774 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12775 }
12776 }
12777 /* Copy characters as-is */
12778 else {
12779 PyUnicode_WRITE(okind, odata, o++, ch);
12780 }
Georg Brandl559e5d72008-06-11 18:37:52 +000012781 }
12782 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000012783 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012784 /* Closing quote already added at the beginning */
Victor Stinner05d11892011-10-06 01:13:58 +020012785 assert(_PyUnicode_CheckConsistency(repr, 1));
Walter Dörwald79e913e2007-05-12 11:08:06 +000012786 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012787}
12788
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012789PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012790 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012791\n\
12792Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080012793such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012794arguments start and end are interpreted as in slice notation.\n\
12795\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012796Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012797
12798static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012799unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012800{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010012801 /* initialize variables to prevent gcc warning */
12802 PyObject *substring = NULL;
12803 Py_ssize_t start = 0;
12804 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012805 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012806
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030012807 if (!parse_args_finds_unicode("rfind", args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012808 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012809
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012810 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012811 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012812
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012813 result = any_find_slice(self, substring, start, end, -1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012814
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012815 if (result == -2)
12816 return NULL;
12817
Christian Heimes217cfd12007-12-02 14:31:20 +000012818 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012819}
12820
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012821PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012822 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012823\n\
Lisa Roach43ba8862017-04-04 22:36:22 -070012824Return the highest index in S where substring sub is found,\n\
12825such that sub is contained within S[start:end]. Optional\n\
12826arguments start and end are interpreted as in slice notation.\n\
12827\n\
12828Raises ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012829
12830static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012831unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012832{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010012833 /* initialize variables to prevent gcc warning */
12834 PyObject *substring = NULL;
12835 Py_ssize_t start = 0;
12836 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012837 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012838
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030012839 if (!parse_args_finds_unicode("rindex", args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012840 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012841
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012842 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012843 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012844
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012845 result = any_find_slice(self, substring, start, end, -1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012846
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012847 if (result == -2)
12848 return NULL;
12849
Guido van Rossumd57fd912000-03-10 22:53:23 +000012850 if (result < 0) {
12851 PyErr_SetString(PyExc_ValueError, "substring not found");
12852 return NULL;
12853 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012854
Christian Heimes217cfd12007-12-02 14:31:20 +000012855 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012856}
12857
INADA Naoki3ae20562017-01-16 20:41:20 +090012858/*[clinic input]
12859str.rjust as unicode_rjust
12860
12861 width: Py_ssize_t
12862 fillchar: Py_UCS4 = ' '
12863 /
12864
12865Return a right-justified string of length width.
12866
12867Padding is done using the specified fill character (default is a space).
12868[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012869
12870static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012871unicode_rjust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
12872/*[clinic end generated code: output=804a1a57fbe8d5cf input=d05f550b5beb1f72]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012873{
Benjamin Petersonbac79492012-01-14 13:34:47 -050012874 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012875 return NULL;
12876
Victor Stinnerc4b49542011-12-11 22:44:26 +010012877 if (PyUnicode_GET_LENGTH(self) >= width)
12878 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012879
Victor Stinnerc4b49542011-12-11 22:44:26 +010012880 return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012881}
12882
Alexander Belopolsky40018472011-02-26 01:02:56 +000012883PyObject *
12884PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012885{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012886 if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012887 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012888
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012889 return split(s, sep, maxsplit);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012890}
12891
INADA Naoki3ae20562017-01-16 20:41:20 +090012892/*[clinic input]
12893str.split as unicode_split
Guido van Rossumd57fd912000-03-10 22:53:23 +000012894
INADA Naoki3ae20562017-01-16 20:41:20 +090012895 sep: object = None
12896 The delimiter according to which to split the string.
12897 None (the default value) means split according to any whitespace,
12898 and discard empty strings from the result.
12899 maxsplit: Py_ssize_t = -1
12900 Maximum number of splits to do.
12901 -1 (the default value) means no limit.
12902
12903Return a list of the words in the string, using sep as the delimiter string.
12904[clinic start generated code]*/
12905
12906static PyObject *
12907unicode_split_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
12908/*[clinic end generated code: output=3a65b1db356948dc input=606e750488a82359]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012909{
INADA Naoki3ae20562017-01-16 20:41:20 +090012910 if (sep == Py_None)
12911 return split(self, NULL, maxsplit);
12912 if (PyUnicode_Check(sep))
12913 return split(self, sep, maxsplit);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012914
Victor Stinner998b8062018-09-12 00:23:25 +020012915 PyErr_Format(PyExc_TypeError,
12916 "must be str or None, not %.100s",
12917 Py_TYPE(sep)->tp_name);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012918 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012919}
12920
Thomas Wouters477c8d52006-05-27 19:21:47 +000012921PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012922PyUnicode_Partition(PyObject *str_obj, PyObject *sep_obj)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012923{
Thomas Wouters477c8d52006-05-27 19:21:47 +000012924 PyObject* out;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012925 int kind1, kind2;
12926 void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012927 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012928
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012929 if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012930 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012931
Victor Stinner14f8f022011-10-05 20:58:25 +020012932 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012933 kind2 = PyUnicode_KIND(sep_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012934 len1 = PyUnicode_GET_LENGTH(str_obj);
12935 len2 = PyUnicode_GET_LENGTH(sep_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012936 if (kind1 < kind2 || len1 < len2) {
12937 _Py_INCREF_UNICODE_EMPTY();
12938 if (!unicode_empty)
12939 out = NULL;
12940 else {
12941 out = PyTuple_Pack(3, str_obj, unicode_empty, unicode_empty);
12942 Py_DECREF(unicode_empty);
12943 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012944 return out;
12945 }
12946 buf1 = PyUnicode_DATA(str_obj);
12947 buf2 = PyUnicode_DATA(sep_obj);
12948 if (kind2 != kind1) {
12949 buf2 = _PyUnicode_AsKind(sep_obj, kind1);
12950 if (!buf2)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012951 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012952 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012953
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012954 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012955 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012956 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12957 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12958 else
12959 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012960 break;
12961 case PyUnicode_2BYTE_KIND:
12962 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12963 break;
12964 case PyUnicode_4BYTE_KIND:
12965 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12966 break;
12967 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070012968 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012969 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012970
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012971 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012972 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012973
12974 return out;
12975}
12976
12977
12978PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012979PyUnicode_RPartition(PyObject *str_obj, PyObject *sep_obj)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012980{
Thomas Wouters477c8d52006-05-27 19:21:47 +000012981 PyObject* out;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012982 int kind1, kind2;
12983 void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012984 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012985
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012986 if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012987 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012988
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012989 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012990 kind2 = PyUnicode_KIND(sep_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012991 len1 = PyUnicode_GET_LENGTH(str_obj);
12992 len2 = PyUnicode_GET_LENGTH(sep_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012993 if (kind1 < kind2 || len1 < len2) {
12994 _Py_INCREF_UNICODE_EMPTY();
12995 if (!unicode_empty)
12996 out = NULL;
12997 else {
12998 out = PyTuple_Pack(3, unicode_empty, unicode_empty, str_obj);
12999 Py_DECREF(unicode_empty);
13000 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013001 return out;
13002 }
13003 buf1 = PyUnicode_DATA(str_obj);
13004 buf2 = PyUnicode_DATA(sep_obj);
13005 if (kind2 != kind1) {
13006 buf2 = _PyUnicode_AsKind(sep_obj, kind1);
13007 if (!buf2)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013008 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013009 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013010
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013011 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013012 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020013013 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
13014 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13015 else
13016 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013017 break;
13018 case PyUnicode_2BYTE_KIND:
13019 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13020 break;
13021 case PyUnicode_4BYTE_KIND:
13022 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13023 break;
13024 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070013025 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013026 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000013027
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013028 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013029 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013030
13031 return out;
13032}
13033
INADA Naoki3ae20562017-01-16 20:41:20 +090013034/*[clinic input]
13035str.partition as unicode_partition
Thomas Wouters477c8d52006-05-27 19:21:47 +000013036
INADA Naoki3ae20562017-01-16 20:41:20 +090013037 sep: object
13038 /
13039
13040Partition the string into three parts using the given separator.
13041
13042This will search for the separator in the string. If the separator is found,
13043returns a 3-tuple containing the part before the separator, the separator
13044itself, and the part after it.
13045
13046If the separator is not found, returns a 3-tuple containing the original string
13047and two empty strings.
13048[clinic start generated code]*/
13049
13050static PyObject *
13051unicode_partition(PyObject *self, PyObject *sep)
13052/*[clinic end generated code: output=e4ced7bd253ca3c4 input=f29b8d06c63e50be]*/
Thomas Wouters477c8d52006-05-27 19:21:47 +000013053{
INADA Naoki3ae20562017-01-16 20:41:20 +090013054 return PyUnicode_Partition(self, sep);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013055}
13056
INADA Naoki3ae20562017-01-16 20:41:20 +090013057/*[clinic input]
13058str.rpartition as unicode_rpartition = str.partition
Thomas Wouters477c8d52006-05-27 19:21:47 +000013059
INADA Naoki3ae20562017-01-16 20:41:20 +090013060Partition the string into three parts using the given separator.
13061
Serhiy Storchakaa2314282017-10-29 02:11:54 +030013062This will search for the separator in the string, starting at the end. If
INADA Naoki3ae20562017-01-16 20:41:20 +090013063the separator is found, returns a 3-tuple containing the part before the
13064separator, the separator itself, and the part after it.
13065
13066If the separator is not found, returns a 3-tuple containing two empty strings
13067and the original string.
13068[clinic start generated code]*/
13069
13070static PyObject *
13071unicode_rpartition(PyObject *self, PyObject *sep)
Serhiy Storchakaa2314282017-10-29 02:11:54 +030013072/*[clinic end generated code: output=1aa13cf1156572aa input=c4b7db3ef5cf336a]*/
Thomas Wouters477c8d52006-05-27 19:21:47 +000013073{
INADA Naoki3ae20562017-01-16 20:41:20 +090013074 return PyUnicode_RPartition(self, sep);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013075}
13076
Alexander Belopolsky40018472011-02-26 01:02:56 +000013077PyObject *
13078PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013079{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013080 if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
Benjamin Peterson14339b62009-01-31 16:36:08 +000013081 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013082
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013083 return rsplit(s, sep, maxsplit);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013084}
13085
INADA Naoki3ae20562017-01-16 20:41:20 +090013086/*[clinic input]
13087str.rsplit as unicode_rsplit = str.split
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013088
INADA Naoki3ae20562017-01-16 20:41:20 +090013089Return a list of the words in the string, using sep as the delimiter string.
13090
13091Splits are done starting at the end of the string and working to the front.
13092[clinic start generated code]*/
13093
13094static PyObject *
13095unicode_rsplit_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
13096/*[clinic end generated code: output=c2b815c63bcabffc input=12ad4bf57dd35f15]*/
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013097{
INADA Naoki3ae20562017-01-16 20:41:20 +090013098 if (sep == Py_None)
13099 return rsplit(self, NULL, maxsplit);
13100 if (PyUnicode_Check(sep))
13101 return rsplit(self, sep, maxsplit);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013102
Victor Stinner998b8062018-09-12 00:23:25 +020013103 PyErr_Format(PyExc_TypeError,
13104 "must be str or None, not %.100s",
13105 Py_TYPE(sep)->tp_name);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013106 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013107}
13108
INADA Naoki3ae20562017-01-16 20:41:20 +090013109/*[clinic input]
13110str.splitlines as unicode_splitlines
Guido van Rossumd57fd912000-03-10 22:53:23 +000013111
Serhiy Storchaka202fda52017-03-12 10:10:47 +020013112 keepends: bool(accept={int}) = False
INADA Naoki3ae20562017-01-16 20:41:20 +090013113
13114Return a list of the lines in the string, breaking at line boundaries.
13115
13116Line breaks are not included in the resulting list unless keepends is given and
13117true.
13118[clinic start generated code]*/
13119
13120static PyObject *
13121unicode_splitlines_impl(PyObject *self, int keepends)
Serhiy Storchaka202fda52017-03-12 10:10:47 +020013122/*[clinic end generated code: output=f664dcdad153ec40 input=b508e180459bdd8b]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013123{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013124 return PyUnicode_Splitlines(self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013125}
13126
13127static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000013128PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013129{
Victor Stinnerc4b49542011-12-11 22:44:26 +010013130 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013131}
13132
INADA Naoki3ae20562017-01-16 20:41:20 +090013133/*[clinic input]
13134str.swapcase as unicode_swapcase
Guido van Rossumd57fd912000-03-10 22:53:23 +000013135
INADA Naoki3ae20562017-01-16 20:41:20 +090013136Convert uppercase characters to lowercase and lowercase characters to uppercase.
13137[clinic start generated code]*/
13138
13139static PyObject *
13140unicode_swapcase_impl(PyObject *self)
13141/*[clinic end generated code: output=5d28966bf6d7b2af input=3f3ef96d5798a7bb]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013142{
Benjamin Petersoneea48462012-01-16 14:28:50 -050013143 if (PyUnicode_READY(self) == -1)
13144 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010013145 return case_operation(self, do_swapcase);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013146}
13147
Larry Hastings61272b72014-01-07 12:41:53 -080013148/*[clinic input]
Georg Brandlceee0772007-11-27 23:48:05 +000013149
Larry Hastings31826802013-10-19 00:09:25 -070013150@staticmethod
13151str.maketrans as unicode_maketrans
13152
13153 x: object
13154
13155 y: unicode=NULL
13156
13157 z: unicode=NULL
13158
13159 /
13160
13161Return a translation table usable for str.translate().
13162
13163If there is only one argument, it must be a dictionary mapping Unicode
13164ordinals (integers) or characters to Unicode ordinals, strings or None.
13165Character keys will be then converted to ordinals.
13166If there are two arguments, they must be strings of equal length, and
13167in the resulting dictionary, each character in x will be mapped to the
13168character at the same position in y. If there is a third argument, it
13169must be a string, whose characters will be mapped to None in the result.
Larry Hastings61272b72014-01-07 12:41:53 -080013170[clinic start generated code]*/
Larry Hastings31826802013-10-19 00:09:25 -070013171
Larry Hastings31826802013-10-19 00:09:25 -070013172static PyObject *
Larry Hastings5c661892014-01-24 06:17:25 -080013173unicode_maketrans_impl(PyObject *x, PyObject *y, PyObject *z)
Serhiy Storchaka1009bf12015-04-03 23:53:51 +030013174/*[clinic end generated code: output=a925c89452bd5881 input=7bfbf529a293c6c5]*/
Larry Hastings31826802013-10-19 00:09:25 -070013175{
Georg Brandlceee0772007-11-27 23:48:05 +000013176 PyObject *new = NULL, *key, *value;
13177 Py_ssize_t i = 0;
13178 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013179
Georg Brandlceee0772007-11-27 23:48:05 +000013180 new = PyDict_New();
13181 if (!new)
13182 return NULL;
13183 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013184 int x_kind, y_kind, z_kind;
13185 void *x_data, *y_data, *z_data;
13186
Georg Brandlceee0772007-11-27 23:48:05 +000013187 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000013188 if (!PyUnicode_Check(x)) {
13189 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
13190 "be a string if there is a second argument");
13191 goto err;
13192 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013193 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013194 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
13195 "arguments must have equal length");
13196 goto err;
13197 }
13198 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013199 x_kind = PyUnicode_KIND(x);
13200 y_kind = PyUnicode_KIND(y);
13201 x_data = PyUnicode_DATA(x);
13202 y_data = PyUnicode_DATA(y);
13203 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
13204 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060013205 if (!key)
Georg Brandlceee0772007-11-27 23:48:05 +000013206 goto err;
Benjamin Peterson822c7902011-12-20 13:32:50 -060013207 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060013208 if (!value) {
13209 Py_DECREF(key);
13210 goto err;
13211 }
Georg Brandlceee0772007-11-27 23:48:05 +000013212 res = PyDict_SetItem(new, key, value);
13213 Py_DECREF(key);
13214 Py_DECREF(value);
13215 if (res < 0)
13216 goto err;
13217 }
13218 /* create entries for deleting chars in z */
13219 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013220 z_kind = PyUnicode_KIND(z);
13221 z_data = PyUnicode_DATA(z);
Victor Stinnerc4f281e2011-10-11 22:11:42 +020013222 for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013223 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000013224 if (!key)
13225 goto err;
13226 res = PyDict_SetItem(new, key, Py_None);
13227 Py_DECREF(key);
13228 if (res < 0)
13229 goto err;
13230 }
13231 }
13232 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013233 int kind;
13234 void *data;
13235
Georg Brandlceee0772007-11-27 23:48:05 +000013236 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000013237 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013238 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
13239 "to maketrans it must be a dict");
13240 goto err;
13241 }
13242 /* copy entries into the new dict, converting string keys to int keys */
13243 while (PyDict_Next(x, &i, &key, &value)) {
13244 if (PyUnicode_Check(key)) {
13245 /* convert string keys to integer keys */
13246 PyObject *newkey;
Victor Stinnerc4f281e2011-10-11 22:11:42 +020013247 if (PyUnicode_GET_LENGTH(key) != 1) {
Georg Brandlceee0772007-11-27 23:48:05 +000013248 PyErr_SetString(PyExc_ValueError, "string keys in translate "
13249 "table must be of length 1");
13250 goto err;
13251 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013252 kind = PyUnicode_KIND(key);
13253 data = PyUnicode_DATA(key);
13254 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000013255 if (!newkey)
13256 goto err;
13257 res = PyDict_SetItem(new, newkey, value);
13258 Py_DECREF(newkey);
13259 if (res < 0)
13260 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000013261 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013262 /* just keep integer keys */
13263 if (PyDict_SetItem(new, key, value) < 0)
13264 goto err;
13265 } else {
13266 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
13267 "be strings or integers");
13268 goto err;
13269 }
13270 }
13271 }
13272 return new;
13273 err:
13274 Py_DECREF(new);
13275 return NULL;
13276}
13277
INADA Naoki3ae20562017-01-16 20:41:20 +090013278/*[clinic input]
13279str.translate as unicode_translate
Guido van Rossumd57fd912000-03-10 22:53:23 +000013280
INADA Naoki3ae20562017-01-16 20:41:20 +090013281 table: object
13282 Translation table, which must be a mapping of Unicode ordinals to
13283 Unicode ordinals, strings, or None.
13284 /
13285
13286Replace each character in the string using the given translation table.
13287
13288The table must implement lookup/indexing via __getitem__, for instance a
13289dictionary or list. If this operation raises LookupError, the character is
13290left untouched. Characters mapped to None are deleted.
13291[clinic start generated code]*/
13292
13293static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013294unicode_translate(PyObject *self, PyObject *table)
INADA Naoki3ae20562017-01-16 20:41:20 +090013295/*[clinic end generated code: output=3cb448ff2fd96bf3 input=6d38343db63d8eb0]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013296{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013297 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013298}
13299
INADA Naoki3ae20562017-01-16 20:41:20 +090013300/*[clinic input]
13301str.upper as unicode_upper
Guido van Rossumd57fd912000-03-10 22:53:23 +000013302
INADA Naoki3ae20562017-01-16 20:41:20 +090013303Return a copy of the string converted to uppercase.
13304[clinic start generated code]*/
13305
13306static PyObject *
13307unicode_upper_impl(PyObject *self)
13308/*[clinic end generated code: output=1b7ddd16bbcdc092 input=db3d55682dfe2e6c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013309{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050013310 if (PyUnicode_READY(self) == -1)
13311 return NULL;
13312 if (PyUnicode_IS_ASCII(self))
13313 return ascii_upper_or_lower(self, 0);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010013314 return case_operation(self, do_upper);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013315}
13316
INADA Naoki3ae20562017-01-16 20:41:20 +090013317/*[clinic input]
13318str.zfill as unicode_zfill
13319
13320 width: Py_ssize_t
13321 /
13322
13323Pad a numeric string with zeros on the left, to fill a field of the given width.
13324
13325The string is never truncated.
13326[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013327
13328static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090013329unicode_zfill_impl(PyObject *self, Py_ssize_t width)
INADA Naoki15f94592017-01-16 21:49:13 +090013330/*[clinic end generated code: output=e13fb6bdf8e3b9df input=c6b2f772c6f27799]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013331{
Martin v. Löwis18e16552006-02-15 17:27:45 +000013332 Py_ssize_t fill;
Victor Stinner9310abb2011-10-05 00:59:23 +020013333 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013334 int kind;
13335 void *data;
13336 Py_UCS4 chr;
13337
Benjamin Petersonbac79492012-01-14 13:34:47 -050013338 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010013339 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013340
Victor Stinnerc4b49542011-12-11 22:44:26 +010013341 if (PyUnicode_GET_LENGTH(self) >= width)
13342 return unicode_result_unchanged(self);
13343
13344 fill = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013345
13346 u = pad(self, fill, 0, '0');
13347
Walter Dörwald068325e2002-04-15 13:36:47 +000013348 if (u == NULL)
13349 return NULL;
13350
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013351 kind = PyUnicode_KIND(u);
13352 data = PyUnicode_DATA(u);
13353 chr = PyUnicode_READ(kind, data, fill);
13354
13355 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000013356 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013357 PyUnicode_WRITE(kind, data, 0, chr);
13358 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000013359 }
13360
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013361 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +010013362 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013363}
Guido van Rossumd57fd912000-03-10 22:53:23 +000013364
#if 0
/* Compiled out by the surrounding "#if 0".  Forwards to
   PyUnicode_TransformDecimalAndSpaceToASCII(). */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *converted = PyUnicode_TransformDecimalAndSpaceToASCII(self);

    return converted;
}
#endif
13372
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013373PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013374 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013375\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013376Return True if S starts with the specified prefix, False otherwise.\n\
13377With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013378With optional end, stop comparing S at that position.\n\
13379prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013380
13381static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013382unicode_startswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013383 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013384{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013385 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013386 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013387 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013388 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013389 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013390
Jesus Ceaac451502011-04-20 17:09:23 +020013391 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013392 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013393 if (PyTuple_Check(subobj)) {
13394 Py_ssize_t i;
13395 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013396 substring = PyTuple_GET_ITEM(subobj, i);
13397 if (!PyUnicode_Check(substring)) {
13398 PyErr_Format(PyExc_TypeError,
13399 "tuple for startswith must only contain str, "
Victor Stinner998b8062018-09-12 00:23:25 +020013400 "not %.100s",
13401 Py_TYPE(substring)->tp_name);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013402 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013403 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013404 result = tailmatch(self, substring, start, end, -1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013405 if (result == -1)
13406 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013407 if (result) {
13408 Py_RETURN_TRUE;
13409 }
13410 }
13411 /* nothing matched */
13412 Py_RETURN_FALSE;
13413 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013414 if (!PyUnicode_Check(subobj)) {
13415 PyErr_Format(PyExc_TypeError,
13416 "startswith first arg must be str or "
Victor Stinner998b8062018-09-12 00:23:25 +020013417 "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013418 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013419 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013420 result = tailmatch(self, subobj, start, end, -1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013421 if (result == -1)
13422 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013423 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013424}
13425
13426
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013427PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013428 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013429\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013430Return True if S ends with the specified suffix, False otherwise.\n\
13431With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013432With optional end, stop comparing S at that position.\n\
13433suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013434
13435static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013436unicode_endswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013437 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013438{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013439 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013440 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013441 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013442 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013443 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013444
Jesus Ceaac451502011-04-20 17:09:23 +020013445 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013446 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013447 if (PyTuple_Check(subobj)) {
13448 Py_ssize_t i;
13449 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013450 substring = PyTuple_GET_ITEM(subobj, i);
13451 if (!PyUnicode_Check(substring)) {
13452 PyErr_Format(PyExc_TypeError,
13453 "tuple for endswith must only contain str, "
Victor Stinner998b8062018-09-12 00:23:25 +020013454 "not %.100s",
13455 Py_TYPE(substring)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013456 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013457 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013458 result = tailmatch(self, substring, start, end, +1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013459 if (result == -1)
13460 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013461 if (result) {
13462 Py_RETURN_TRUE;
13463 }
13464 }
13465 Py_RETURN_FALSE;
13466 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013467 if (!PyUnicode_Check(subobj)) {
13468 PyErr_Format(PyExc_TypeError,
13469 "endswith first arg must be str or "
Victor Stinner998b8062018-09-12 00:23:25 +020013470 "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013471 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013472 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013473 result = tailmatch(self, subobj, start, end, +1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013474 if (result == -1)
13475 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013476 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013477}
13478
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070013479static inline void
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013480_PyUnicodeWriter_Update(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013481{
Victor Stinnereb36fda2015-10-03 01:55:51 +020013482 writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer);
13483 writer->data = PyUnicode_DATA(writer->buffer);
13484
13485 if (!writer->readonly) {
13486 writer->kind = PyUnicode_KIND(writer->buffer);
Victor Stinner8f674cc2013-04-17 23:02:17 +020013487 writer->size = PyUnicode_GET_LENGTH(writer->buffer);
Victor Stinnereb36fda2015-10-03 01:55:51 +020013488 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013489 else {
Victor Stinnereb36fda2015-10-03 01:55:51 +020013490 /* use a value smaller than PyUnicode_1BYTE_KIND() so
13491 _PyUnicodeWriter_PrepareKind() will copy the buffer. */
13492 writer->kind = PyUnicode_WCHAR_KIND;
13493 assert(writer->kind <= PyUnicode_1BYTE_KIND);
13494
Victor Stinner8f674cc2013-04-17 23:02:17 +020013495 /* Copy-on-write mode: set buffer size to 0 so
13496 * _PyUnicodeWriter_Prepare() will copy (and enlarge) the buffer on
13497 * next write. */
13498 writer->size = 0;
13499 }
Victor Stinner202fdca2012-05-07 12:47:02 +020013500}
13501
Victor Stinnerd3f08822012-05-29 12:57:52 +020013502void
Victor Stinner8f674cc2013-04-17 23:02:17 +020013503_PyUnicodeWriter_Init(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013504{
Victor Stinnerd3f08822012-05-29 12:57:52 +020013505 memset(writer, 0, sizeof(*writer));
Victor Stinnereb36fda2015-10-03 01:55:51 +020013506
13507 /* ASCII is the bare minimum */
Victor Stinner8f674cc2013-04-17 23:02:17 +020013508 writer->min_char = 127;
Victor Stinnereb36fda2015-10-03 01:55:51 +020013509
13510 /* use a value smaller than PyUnicode_1BYTE_KIND() so
13511 _PyUnicodeWriter_PrepareKind() will copy the buffer. */
13512 writer->kind = PyUnicode_WCHAR_KIND;
13513 assert(writer->kind <= PyUnicode_1BYTE_KIND);
Victor Stinner202fdca2012-05-07 12:47:02 +020013514}
13515
Inada Naoki770847a2019-06-24 12:30:24 +090013516// Initialize _PyUnicodeWriter with initial buffer
13517static inline void
13518_PyUnicodeWriter_InitWithBuffer(_PyUnicodeWriter *writer, PyObject *buffer)
13519{
13520 memset(writer, 0, sizeof(*writer));
13521 writer->buffer = buffer;
13522 _PyUnicodeWriter_Update(writer);
13523 writer->min_length = writer->size;
13524}
13525
Victor Stinnerd3f08822012-05-29 12:57:52 +020013526int
13527_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
13528 Py_ssize_t length, Py_UCS4 maxchar)
Victor Stinner202fdca2012-05-07 12:47:02 +020013529{
13530 Py_ssize_t newlen;
13531 PyObject *newbuffer;
13532
Victor Stinner2740e462016-09-06 16:58:36 -070013533 assert(maxchar <= MAX_UNICODE);
13534
Victor Stinnerca9381e2015-09-22 00:58:32 +020013535 /* ensure that the _PyUnicodeWriter_Prepare macro was used */
Victor Stinner61744742015-09-22 01:01:17 +020013536 assert((maxchar > writer->maxchar && length >= 0)
13537 || length > 0);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013538
Victor Stinner202fdca2012-05-07 12:47:02 +020013539 if (length > PY_SSIZE_T_MAX - writer->pos) {
13540 PyErr_NoMemory();
13541 return -1;
13542 }
13543 newlen = writer->pos + length;
13544
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070013545 maxchar = Py_MAX(maxchar, writer->min_char);
Victor Stinner8f674cc2013-04-17 23:02:17 +020013546
Victor Stinnerd3f08822012-05-29 12:57:52 +020013547 if (writer->buffer == NULL) {
Victor Stinner8f674cc2013-04-17 23:02:17 +020013548 assert(!writer->readonly);
Victor Stinner6989ba02013-11-18 21:08:39 +010013549 if (writer->overallocate
13550 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13551 /* overallocate to limit the number of realloc() */
13552 newlen += newlen / OVERALLOCATE_FACTOR;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013553 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013554 if (newlen < writer->min_length)
13555 newlen = writer->min_length;
13556
Victor Stinnerd3f08822012-05-29 12:57:52 +020013557 writer->buffer = PyUnicode_New(newlen, maxchar);
13558 if (writer->buffer == NULL)
13559 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013560 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013561 else if (newlen > writer->size) {
Victor Stinner6989ba02013-11-18 21:08:39 +010013562 if (writer->overallocate
13563 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13564 /* overallocate to limit the number of realloc() */
13565 newlen += newlen / OVERALLOCATE_FACTOR;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013566 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013567 if (newlen < writer->min_length)
13568 newlen = writer->min_length;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013569
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013570 if (maxchar > writer->maxchar || writer->readonly) {
Victor Stinner202fdca2012-05-07 12:47:02 +020013571 /* resize + widen */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +030013572 maxchar = Py_MAX(maxchar, writer->maxchar);
Victor Stinner202fdca2012-05-07 12:47:02 +020013573 newbuffer = PyUnicode_New(newlen, maxchar);
13574 if (newbuffer == NULL)
13575 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013576 _PyUnicode_FastCopyCharacters(newbuffer, 0,
13577 writer->buffer, 0, writer->pos);
Victor Stinner202fdca2012-05-07 12:47:02 +020013578 Py_DECREF(writer->buffer);
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013579 writer->readonly = 0;
Victor Stinner202fdca2012-05-07 12:47:02 +020013580 }
13581 else {
13582 newbuffer = resize_compact(writer->buffer, newlen);
13583 if (newbuffer == NULL)
13584 return -1;
13585 }
13586 writer->buffer = newbuffer;
Victor Stinner202fdca2012-05-07 12:47:02 +020013587 }
13588 else if (maxchar > writer->maxchar) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013589 assert(!writer->readonly);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013590 newbuffer = PyUnicode_New(writer->size, maxchar);
13591 if (newbuffer == NULL)
Victor Stinner202fdca2012-05-07 12:47:02 +020013592 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013593 _PyUnicode_FastCopyCharacters(newbuffer, 0,
13594 writer->buffer, 0, writer->pos);
Serhiy Storchaka57a01d32016-04-10 18:05:40 +030013595 Py_SETREF(writer->buffer, newbuffer);
Victor Stinner202fdca2012-05-07 12:47:02 +020013596 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013597 _PyUnicodeWriter_Update(writer);
Victor Stinner202fdca2012-05-07 12:47:02 +020013598 return 0;
Victor Stinner6989ba02013-11-18 21:08:39 +010013599
13600#undef OVERALLOCATE_FACTOR
Victor Stinner202fdca2012-05-07 12:47:02 +020013601}
13602
Victor Stinnerca9381e2015-09-22 00:58:32 +020013603int
13604_PyUnicodeWriter_PrepareKindInternal(_PyUnicodeWriter *writer,
13605 enum PyUnicode_Kind kind)
13606{
13607 Py_UCS4 maxchar;
13608
13609 /* ensure that the _PyUnicodeWriter_PrepareKind macro was used */
13610 assert(writer->kind < kind);
13611
13612 switch (kind)
13613 {
13614 case PyUnicode_1BYTE_KIND: maxchar = 0xff; break;
13615 case PyUnicode_2BYTE_KIND: maxchar = 0xffff; break;
13616 case PyUnicode_4BYTE_KIND: maxchar = 0x10ffff; break;
13617 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070013618 Py_UNREACHABLE();
Victor Stinnerca9381e2015-09-22 00:58:32 +020013619 }
13620
13621 return _PyUnicodeWriter_PrepareInternal(writer, 0, maxchar);
13622}
13623
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070013624static inline int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020013625_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch)
Victor Stinnera0dd0212013-04-11 22:09:04 +020013626{
Victor Stinner2740e462016-09-06 16:58:36 -070013627 assert(ch <= MAX_UNICODE);
Victor Stinnera0dd0212013-04-11 22:09:04 +020013628 if (_PyUnicodeWriter_Prepare(writer, 1, ch) < 0)
13629 return -1;
13630 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, ch);
13631 writer->pos++;
13632 return 0;
13633}
13634
13635int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020013636_PyUnicodeWriter_WriteChar(_PyUnicodeWriter *writer, Py_UCS4 ch)
13637{
13638 return _PyUnicodeWriter_WriteCharInline(writer, ch);
13639}
13640
13641int
Victor Stinnerd3f08822012-05-29 12:57:52 +020013642_PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str)
13643{
13644 Py_UCS4 maxchar;
13645 Py_ssize_t len;
13646
13647 if (PyUnicode_READY(str) == -1)
13648 return -1;
13649 len = PyUnicode_GET_LENGTH(str);
13650 if (len == 0)
13651 return 0;
13652 maxchar = PyUnicode_MAX_CHAR_VALUE(str);
13653 if (maxchar > writer->maxchar || len > writer->size - writer->pos) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013654 if (writer->buffer == NULL && !writer->overallocate) {
Victor Stinner1912b392015-03-26 09:37:23 +010013655 assert(_PyUnicode_CheckConsistency(str, 1));
Victor Stinner8f674cc2013-04-17 23:02:17 +020013656 writer->readonly = 1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013657 Py_INCREF(str);
13658 writer->buffer = str;
13659 _PyUnicodeWriter_Update(writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013660 writer->pos += len;
13661 return 0;
13662 }
13663 if (_PyUnicodeWriter_PrepareInternal(writer, len, maxchar) == -1)
13664 return -1;
13665 }
13666 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13667 str, 0, len);
13668 writer->pos += len;
13669 return 0;
13670}
13671
Victor Stinnere215d962012-10-06 23:03:36 +020013672int
Victor Stinnercfc4c132013-04-03 01:48:39 +020013673_PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter *writer, PyObject *str,
13674 Py_ssize_t start, Py_ssize_t end)
13675{
13676 Py_UCS4 maxchar;
13677 Py_ssize_t len;
13678
13679 if (PyUnicode_READY(str) == -1)
13680 return -1;
13681
13682 assert(0 <= start);
13683 assert(end <= PyUnicode_GET_LENGTH(str));
13684 assert(start <= end);
13685
13686 if (end == 0)
13687 return 0;
13688
13689 if (start == 0 && end == PyUnicode_GET_LENGTH(str))
13690 return _PyUnicodeWriter_WriteStr(writer, str);
13691
13692 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
13693 maxchar = _PyUnicode_FindMaxChar(str, start, end);
13694 else
13695 maxchar = writer->maxchar;
13696 len = end - start;
13697
13698 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) < 0)
13699 return -1;
13700
13701 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13702 str, start, len);
13703 writer->pos += len;
13704 return 0;
13705}
13706
13707int
Victor Stinner4a587072013-11-19 12:54:53 +010013708_PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter *writer,
13709 const char *ascii, Py_ssize_t len)
13710{
13711 if (len == -1)
13712 len = strlen(ascii);
13713
13714 assert(ucs1lib_find_max_char((Py_UCS1*)ascii, (Py_UCS1*)ascii + len) < 128);
13715
13716 if (writer->buffer == NULL && !writer->overallocate) {
13717 PyObject *str;
13718
13719 str = _PyUnicode_FromASCII(ascii, len);
13720 if (str == NULL)
13721 return -1;
13722
13723 writer->readonly = 1;
13724 writer->buffer = str;
13725 _PyUnicodeWriter_Update(writer);
13726 writer->pos += len;
13727 return 0;
13728 }
13729
13730 if (_PyUnicodeWriter_Prepare(writer, len, 127) == -1)
13731 return -1;
13732
13733 switch (writer->kind)
13734 {
13735 case PyUnicode_1BYTE_KIND:
13736 {
13737 const Py_UCS1 *str = (const Py_UCS1 *)ascii;
13738 Py_UCS1 *data = writer->data;
13739
Christian Heimesf051e432016-09-13 20:22:02 +020013740 memcpy(data + writer->pos, str, len);
Victor Stinner4a587072013-11-19 12:54:53 +010013741 break;
13742 }
13743 case PyUnicode_2BYTE_KIND:
13744 {
13745 _PyUnicode_CONVERT_BYTES(
13746 Py_UCS1, Py_UCS2,
13747 ascii, ascii + len,
13748 (Py_UCS2 *)writer->data + writer->pos);
13749 break;
13750 }
13751 case PyUnicode_4BYTE_KIND:
13752 {
13753 _PyUnicode_CONVERT_BYTES(
13754 Py_UCS1, Py_UCS4,
13755 ascii, ascii + len,
13756 (Py_UCS4 *)writer->data + writer->pos);
13757 break;
13758 }
13759 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070013760 Py_UNREACHABLE();
Victor Stinner4a587072013-11-19 12:54:53 +010013761 }
13762
13763 writer->pos += len;
13764 return 0;
13765}
13766
13767int
13768_PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter *writer,
13769 const char *str, Py_ssize_t len)
Victor Stinnere215d962012-10-06 23:03:36 +020013770{
13771 Py_UCS4 maxchar;
13772
13773 maxchar = ucs1lib_find_max_char((Py_UCS1*)str, (Py_UCS1*)str + len);
13774 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) == -1)
13775 return -1;
13776 unicode_write_cstr(writer->buffer, writer->pos, str, len);
13777 writer->pos += len;
13778 return 0;
13779}
13780
Victor Stinnerd3f08822012-05-29 12:57:52 +020013781PyObject *
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013782_PyUnicodeWriter_Finish(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013783{
Victor Stinner15a0bd32013-07-08 22:29:55 +020013784 PyObject *str;
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030013785
Victor Stinnerd3f08822012-05-29 12:57:52 +020013786 if (writer->pos == 0) {
Victor Stinner9e6b4d72013-07-09 00:37:24 +020013787 Py_CLEAR(writer->buffer);
Serhiy Storchaka678db842013-01-26 12:16:36 +020013788 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3f08822012-05-29 12:57:52 +020013789 }
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030013790
13791 str = writer->buffer;
13792 writer->buffer = NULL;
13793
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013794 if (writer->readonly) {
Victor Stinner9e6b4d72013-07-09 00:37:24 +020013795 assert(PyUnicode_GET_LENGTH(str) == writer->pos);
13796 return str;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013797 }
Victor Stinner6c2cdae2015-10-12 13:29:43 +020013798
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030013799 if (PyUnicode_GET_LENGTH(str) != writer->pos) {
13800 PyObject *str2;
13801 str2 = resize_compact(str, writer->pos);
13802 if (str2 == NULL) {
13803 Py_DECREF(str);
13804 return NULL;
Victor Stinner6c2cdae2015-10-12 13:29:43 +020013805 }
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030013806 str = str2;
Victor Stinner6c2cdae2015-10-12 13:29:43 +020013807 }
13808
Victor Stinner15a0bd32013-07-08 22:29:55 +020013809 assert(_PyUnicode_CheckConsistency(str, 1));
13810 return unicode_result_ready(str);
Victor Stinner202fdca2012-05-07 12:47:02 +020013811}
13812
Victor Stinnerd3f08822012-05-29 12:57:52 +020013813void
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013814_PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013815{
13816 Py_CLEAR(writer->buffer);
13817}
13818
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013819#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000013820
13821PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013822 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000013823\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013824Return a formatted version of S, using substitutions from args and kwargs.\n\
13825The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000013826
Eric Smith27bbca62010-11-04 17:06:58 +000013827PyDoc_STRVAR(format_map__doc__,
13828 "S.format_map(mapping) -> str\n\
13829\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013830Return a formatted version of S, using substitutions from mapping.\n\
13831The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000013832
INADA Naoki3ae20562017-01-16 20:41:20 +090013833/*[clinic input]
13834str.__format__ as unicode___format__
13835
13836 format_spec: unicode
13837 /
13838
13839Return a formatted version of the string as described by format_spec.
13840[clinic start generated code]*/
13841
Eric Smith4a7d76d2008-05-30 18:10:19 +000013842static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090013843unicode___format___impl(PyObject *self, PyObject *format_spec)
INADA Naoki15f94592017-01-16 21:49:13 +090013844/*[clinic end generated code: output=45fceaca6d2ba4c8 input=5e135645d167a214]*/
Eric Smith4a7d76d2008-05-30 18:10:19 +000013845{
Victor Stinnerd3f08822012-05-29 12:57:52 +020013846 _PyUnicodeWriter writer;
13847 int ret;
Eric Smith4a7d76d2008-05-30 18:10:19 +000013848
Victor Stinnerd3f08822012-05-29 12:57:52 +020013849 if (PyUnicode_READY(self) == -1)
13850 return NULL;
Victor Stinner8f674cc2013-04-17 23:02:17 +020013851 _PyUnicodeWriter_Init(&writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013852 ret = _PyUnicode_FormatAdvancedWriter(&writer,
13853 self, format_spec, 0,
13854 PyUnicode_GET_LENGTH(format_spec));
13855 if (ret == -1) {
13856 _PyUnicodeWriter_Dealloc(&writer);
13857 return NULL;
13858 }
13859 return _PyUnicodeWriter_Finish(&writer);
Eric Smith4a7d76d2008-05-30 18:10:19 +000013860}
13861
INADA Naoki3ae20562017-01-16 20:41:20 +090013862/*[clinic input]
13863str.__sizeof__ as unicode_sizeof
13864
13865Return the size of the string in memory, in bytes.
13866[clinic start generated code]*/
Eric Smith8c663262007-08-25 02:26:07 +000013867
13868static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090013869unicode_sizeof_impl(PyObject *self)
13870/*[clinic end generated code: output=6dbc2f5a408b6d4f input=6dd011c108e33fb0]*/
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013871{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013872 Py_ssize_t size;
13873
13874 /* If it's a compact object, account for base structure +
13875 character data. */
INADA Naoki3ae20562017-01-16 20:41:20 +090013876 if (PyUnicode_IS_COMPACT_ASCII(self))
13877 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(self) + 1;
13878 else if (PyUnicode_IS_COMPACT(self))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013879 size = sizeof(PyCompactUnicodeObject) +
INADA Naoki3ae20562017-01-16 20:41:20 +090013880 (PyUnicode_GET_LENGTH(self) + 1) * PyUnicode_KIND(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013881 else {
13882 /* If it is a two-block object, account for base object, and
13883 for character block if present. */
13884 size = sizeof(PyUnicodeObject);
INADA Naoki3ae20562017-01-16 20:41:20 +090013885 if (_PyUnicode_DATA_ANY(self))
13886 size += (PyUnicode_GET_LENGTH(self) + 1) *
13887 PyUnicode_KIND(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013888 }
13889 /* If the wstr pointer is present, account for it unless it is shared
Victor Stinnera3be6132011-10-03 02:16:37 +020013890 with the data pointer. Check if the data is not shared. */
INADA Naoki3ae20562017-01-16 20:41:20 +090013891 if (_PyUnicode_HAS_WSTR_MEMORY(self))
13892 size += (PyUnicode_WSTR_LENGTH(self) + 1) * sizeof(wchar_t);
13893 if (_PyUnicode_HAS_UTF8_MEMORY(self))
13894 size += PyUnicode_UTF8_LENGTH(self) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013895
13896 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013897}
13898
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013899static PyObject *
Siddhesh Poyarekar55edd0c2018-04-30 00:29:33 +053013900unicode_getnewargs(PyObject *v, PyObject *Py_UNUSED(ignored))
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013901{
Victor Stinnerbf6e5602011-12-12 01:53:47 +010013902 PyObject *copy = _PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013903 if (!copy)
13904 return NULL;
13905 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013906}
13907
Guido van Rossumd57fd912000-03-10 22:53:23 +000013908static PyMethodDef unicode_methods[] = {
INADA Naoki3ae20562017-01-16 20:41:20 +090013909 UNICODE_ENCODE_METHODDEF
13910 UNICODE_REPLACE_METHODDEF
13911 UNICODE_SPLIT_METHODDEF
13912 UNICODE_RSPLIT_METHODDEF
13913 UNICODE_JOIN_METHODDEF
13914 UNICODE_CAPITALIZE_METHODDEF
13915 UNICODE_CASEFOLD_METHODDEF
13916 UNICODE_TITLE_METHODDEF
13917 UNICODE_CENTER_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013918 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090013919 UNICODE_EXPANDTABS_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013920 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090013921 UNICODE_PARTITION_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013922 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090013923 UNICODE_LJUST_METHODDEF
13924 UNICODE_LOWER_METHODDEF
13925 UNICODE_LSTRIP_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013926 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
13927 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090013928 UNICODE_RJUST_METHODDEF
13929 UNICODE_RSTRIP_METHODDEF
13930 UNICODE_RPARTITION_METHODDEF
13931 UNICODE_SPLITLINES_METHODDEF
13932 UNICODE_STRIP_METHODDEF
13933 UNICODE_SWAPCASE_METHODDEF
13934 UNICODE_TRANSLATE_METHODDEF
13935 UNICODE_UPPER_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013936 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
13937 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
INADA Naokia49ac992018-01-27 14:06:21 +090013938 UNICODE_ISASCII_METHODDEF
INADA Naoki3ae20562017-01-16 20:41:20 +090013939 UNICODE_ISLOWER_METHODDEF
13940 UNICODE_ISUPPER_METHODDEF
13941 UNICODE_ISTITLE_METHODDEF
13942 UNICODE_ISSPACE_METHODDEF
13943 UNICODE_ISDECIMAL_METHODDEF
13944 UNICODE_ISDIGIT_METHODDEF
13945 UNICODE_ISNUMERIC_METHODDEF
13946 UNICODE_ISALPHA_METHODDEF
13947 UNICODE_ISALNUM_METHODDEF
13948 UNICODE_ISIDENTIFIER_METHODDEF
13949 UNICODE_ISPRINTABLE_METHODDEF
13950 UNICODE_ZFILL_METHODDEF
Serhiy Storchaka62be7422018-11-27 13:27:31 +020013951 {"format", (PyCFunction)(void(*)(void)) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +000013952 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090013953 UNICODE___FORMAT___METHODDEF
Larry Hastings31826802013-10-19 00:09:25 -070013954 UNICODE_MAKETRANS_METHODDEF
INADA Naoki3ae20562017-01-16 20:41:20 +090013955 UNICODE_SIZEOF_METHODDEF
Walter Dörwald068325e2002-04-15 13:36:47 +000013956#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000013957 /* These methods are just used for debugging the implementation. */
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000013958 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000013959#endif
13960
Siddhesh Poyarekar55edd0c2018-04-30 00:29:33 +053013961 {"__getnewargs__", unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000013962 {NULL, NULL}
13963};
13964
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013965static PyObject *
13966unicode_mod(PyObject *v, PyObject *w)
13967{
Brian Curtindfc80e32011-08-10 20:28:54 -050013968 if (!PyUnicode_Check(v))
13969 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000013970 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013971}
13972
13973static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013974 0, /*nb_add*/
13975 0, /*nb_subtract*/
13976 0, /*nb_multiply*/
13977 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013978};
13979
Guido van Rossumd57fd912000-03-10 22:53:23 +000013980static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013981 (lenfunc) unicode_length, /* sq_length */
13982 PyUnicode_Concat, /* sq_concat */
13983 (ssizeargfunc) unicode_repeat, /* sq_repeat */
13984 (ssizeargfunc) unicode_getitem, /* sq_item */
13985 0, /* sq_slice */
13986 0, /* sq_ass_item */
13987 0, /* sq_ass_slice */
13988 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013989};
13990
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013991static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013992unicode_subscript(PyObject* self, PyObject* item)
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013993{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013994 if (PyUnicode_READY(self) == -1)
13995 return NULL;
13996
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000013997 if (PyIndex_Check(item)) {
13998 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013999 if (i == -1 && PyErr_Occurred())
14000 return NULL;
14001 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014002 i += PyUnicode_GET_LENGTH(self);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014003 return unicode_getitem(self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014004 } else if (PySlice_Check(item)) {
Zackery Spytz14514d92019-05-17 01:13:03 -060014005 Py_ssize_t start, stop, step, slicelength, i;
14006 size_t cur;
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014007 PyObject *result;
14008 void *src_data, *dest_data;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014009 int src_kind, dest_kind;
Victor Stinnerc80d6d22011-10-05 14:13:28 +020014010 Py_UCS4 ch, max_char, kind_limit;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014011
Serhiy Storchakab879fe82017-04-08 09:53:51 +030014012 if (PySlice_Unpack(item, &start, &stop, &step) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014013 return NULL;
14014 }
Serhiy Storchakab879fe82017-04-08 09:53:51 +030014015 slicelength = PySlice_AdjustIndices(PyUnicode_GET_LENGTH(self),
14016 &start, &stop, step);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014017
14018 if (slicelength <= 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020014019 _Py_RETURN_UNICODE_EMPTY();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014020 } else if (start == 0 && step == 1 &&
Victor Stinnerc4b49542011-12-11 22:44:26 +010014021 slicelength == PyUnicode_GET_LENGTH(self)) {
14022 return unicode_result_unchanged(self);
Thomas Woutersed03b412007-08-28 21:37:11 +000014023 } else if (step == 1) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014024 return PyUnicode_Substring(self,
Victor Stinner12bab6d2011-10-01 01:53:49 +020014025 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014026 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014027 /* General case */
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014028 src_kind = PyUnicode_KIND(self);
14029 src_data = PyUnicode_DATA(self);
Victor Stinner55c99112011-10-13 01:17:06 +020014030 if (!PyUnicode_IS_ASCII(self)) {
14031 kind_limit = kind_maxchar_limit(src_kind);
14032 max_char = 0;
14033 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
14034 ch = PyUnicode_READ(src_kind, src_data, cur);
14035 if (ch > max_char) {
14036 max_char = ch;
14037 if (max_char >= kind_limit)
14038 break;
14039 }
Victor Stinnerc80d6d22011-10-05 14:13:28 +020014040 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014041 }
Victor Stinner55c99112011-10-13 01:17:06 +020014042 else
14043 max_char = 127;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014044 result = PyUnicode_New(slicelength, max_char);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014045 if (result == NULL)
14046 return NULL;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014047 dest_kind = PyUnicode_KIND(result);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014048 dest_data = PyUnicode_DATA(result);
14049
14050 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014051 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
14052 PyUnicode_WRITE(dest_kind, dest_data, i, ch);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014053 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020014054 assert(_PyUnicode_CheckConsistency(result, 1));
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014055 return result;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014056 } else {
14057 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
14058 return NULL;
14059 }
14060}
14061
14062static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014063 (lenfunc)unicode_length, /* mp_length */
14064 (binaryfunc)unicode_subscript, /* mp_subscript */
14065 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014066};
14067
Guido van Rossumd57fd912000-03-10 22:53:23 +000014068
Guido van Rossumd57fd912000-03-10 22:53:23 +000014069/* Helpers for PyUnicode_Format() */
14070
Victor Stinnera47082312012-10-04 02:19:54 +020014071struct unicode_formatter_t {
14072 PyObject *args;
14073 int args_owned;
14074 Py_ssize_t arglen, argidx;
14075 PyObject *dict;
14076
14077 enum PyUnicode_Kind fmtkind;
14078 Py_ssize_t fmtcnt, fmtpos;
14079 void *fmtdata;
14080 PyObject *fmtstr;
14081
14082 _PyUnicodeWriter writer;
14083};
14084
14085struct unicode_format_arg_t {
14086 Py_UCS4 ch;
14087 int flags;
14088 Py_ssize_t width;
14089 int prec;
14090 int sign;
14091};
14092
Guido van Rossumd57fd912000-03-10 22:53:23 +000014093static PyObject *
Victor Stinnera47082312012-10-04 02:19:54 +020014094unicode_format_getnextarg(struct unicode_formatter_t *ctx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014095{
Victor Stinnera47082312012-10-04 02:19:54 +020014096 Py_ssize_t argidx = ctx->argidx;
14097
14098 if (argidx < ctx->arglen) {
14099 ctx->argidx++;
14100 if (ctx->arglen < 0)
14101 return ctx->args;
Benjamin Peterson29060642009-01-31 22:14:21 +000014102 else
Victor Stinnera47082312012-10-04 02:19:54 +020014103 return PyTuple_GetItem(ctx->args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014104 }
14105 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000014106 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000014107 return NULL;
14108}
14109
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014110/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000014111
Victor Stinnera47082312012-10-04 02:19:54 +020014112/* Format a float into the writer if the writer is not NULL, or into *p_output
14113 otherwise.
14114
14115 Return 0 on success, raise an exception and return -1 on error. */
Victor Stinnerd3f08822012-05-29 12:57:52 +020014116static int
Victor Stinnera47082312012-10-04 02:19:54 +020014117formatfloat(PyObject *v, struct unicode_format_arg_t *arg,
14118 PyObject **p_output,
14119 _PyUnicodeWriter *writer)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014120{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014121 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014122 double x;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014123 Py_ssize_t len;
Victor Stinnera47082312012-10-04 02:19:54 +020014124 int prec;
14125 int dtoa_flags;
Tim Petersced69f82003-09-16 20:30:58 +000014126
Guido van Rossumd57fd912000-03-10 22:53:23 +000014127 x = PyFloat_AsDouble(v);
14128 if (x == -1.0 && PyErr_Occurred())
Victor Stinnerd3f08822012-05-29 12:57:52 +020014129 return -1;
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014130
Victor Stinnera47082312012-10-04 02:19:54 +020014131 prec = arg->prec;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014132 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000014133 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000014134
Victor Stinnera47082312012-10-04 02:19:54 +020014135 if (arg->flags & F_ALT)
14136 dtoa_flags = Py_DTSF_ALT;
14137 else
14138 dtoa_flags = 0;
14139 p = PyOS_double_to_string(x, arg->ch, prec, dtoa_flags, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014140 if (p == NULL)
Victor Stinnerd3f08822012-05-29 12:57:52 +020014141 return -1;
14142 len = strlen(p);
14143 if (writer) {
Victor Stinner4a587072013-11-19 12:54:53 +010014144 if (_PyUnicodeWriter_WriteASCIIString(writer, p, len) < 0) {
Christian Heimesf4f99392012-09-10 11:48:41 +020014145 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020014146 return -1;
Christian Heimesf4f99392012-09-10 11:48:41 +020014147 }
Victor Stinnerd3f08822012-05-29 12:57:52 +020014148 }
14149 else
14150 *p_output = _PyUnicode_FromASCII(p, len);
Eric Smith0923d1d2009-04-16 20:16:10 +000014151 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020014152 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014153}
14154
Victor Stinnerd0880d52012-04-27 23:40:13 +020014155/* formatlong() emulates the format codes d, u, o, x and X, and
14156 * the F_ALT flag, for Python's long (unbounded) ints. It's not used for
14157 * Python's regular ints.
14158 * Return value: a new PyUnicodeObject*, or NULL if error.
14159 * The output string is of the form
14160 * "-"? ("0x" | "0X")? digit+
14161 * "0x"/"0X" are present only for x and X conversions, with F_ALT
14162 * set in flags. The case of hex digits will be correct,
14163 * There will be at least prec digits, zero-filled on the left if
14164 * necessary to get that many.
14165 * val object to be converted
14166 * flags bitmask of format flags; only F_ALT is looked at
14167 * prec minimum number of digits; 0-fill on left if needed
14168 * type a character in [duoxX]; u acts the same as d
14169 *
14170 * CAUTION: o, x and X conversions on regular ints can never
14171 * produce a '-' sign, but can for Python's unbounded ints.
14172 */
Ethan Furmanb95b5612015-01-23 20:05:18 -080014173PyObject *
14174_PyUnicode_FormatLong(PyObject *val, int alt, int prec, int type)
Tim Peters38fd5b62000-09-21 05:43:11 +000014175{
Victor Stinnerd0880d52012-04-27 23:40:13 +020014176 PyObject *result = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014177 char *buf;
Victor Stinnerd0880d52012-04-27 23:40:13 +020014178 Py_ssize_t i;
14179 int sign; /* 1 if '-', else 0 */
14180 int len; /* number of characters */
14181 Py_ssize_t llen;
14182 int numdigits; /* len == numnondigits + numdigits */
14183 int numnondigits = 0;
Tim Peters38fd5b62000-09-21 05:43:11 +000014184
Victor Stinnerd0880d52012-04-27 23:40:13 +020014185 /* Avoid exceeding SSIZE_T_MAX */
14186 if (prec > INT_MAX-3) {
14187 PyErr_SetString(PyExc_OverflowError,
14188 "precision too large");
Benjamin Peterson14339b62009-01-31 16:36:08 +000014189 return NULL;
Victor Stinnerd0880d52012-04-27 23:40:13 +020014190 }
14191
14192 assert(PyLong_Check(val));
14193
14194 switch (type) {
Victor Stinner621ef3d2012-10-02 00:33:47 +020014195 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070014196 Py_UNREACHABLE();
Victor Stinnerd0880d52012-04-27 23:40:13 +020014197 case 'd':
Victor Stinner621ef3d2012-10-02 00:33:47 +020014198 case 'i':
Victor Stinnerd0880d52012-04-27 23:40:13 +020014199 case 'u':
Ethan Furmanfb137212013-08-31 10:18:55 -070014200 /* int and int subclasses should print numerically when a numeric */
14201 /* format code is used (see issue18780) */
14202 result = PyNumber_ToBase(val, 10);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014203 break;
14204 case 'o':
14205 numnondigits = 2;
14206 result = PyNumber_ToBase(val, 8);
14207 break;
14208 case 'x':
14209 case 'X':
14210 numnondigits = 2;
14211 result = PyNumber_ToBase(val, 16);
14212 break;
Victor Stinnerd0880d52012-04-27 23:40:13 +020014213 }
14214 if (!result)
14215 return NULL;
14216
14217 assert(unicode_modifiable(result));
14218 assert(PyUnicode_IS_READY(result));
14219 assert(PyUnicode_IS_ASCII(result));
14220
14221 /* To modify the string in-place, there can only be one reference. */
14222 if (Py_REFCNT(result) != 1) {
Christian Heimesd47802e2013-06-29 21:33:36 +020014223 Py_DECREF(result);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014224 PyErr_BadInternalCall();
14225 return NULL;
14226 }
14227 buf = PyUnicode_DATA(result);
14228 llen = PyUnicode_GET_LENGTH(result);
14229 if (llen > INT_MAX) {
Christian Heimesd47802e2013-06-29 21:33:36 +020014230 Py_DECREF(result);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014231 PyErr_SetString(PyExc_ValueError,
Ethan Furmanb95b5612015-01-23 20:05:18 -080014232 "string too large in _PyUnicode_FormatLong");
Victor Stinnerd0880d52012-04-27 23:40:13 +020014233 return NULL;
14234 }
14235 len = (int)llen;
14236 sign = buf[0] == '-';
14237 numnondigits += sign;
14238 numdigits = len - numnondigits;
14239 assert(numdigits > 0);
14240
14241 /* Get rid of base marker unless F_ALT */
Ethan Furmanb95b5612015-01-23 20:05:18 -080014242 if (((alt) == 0 &&
Victor Stinnerd0880d52012-04-27 23:40:13 +020014243 (type == 'o' || type == 'x' || type == 'X'))) {
14244 assert(buf[sign] == '0');
14245 assert(buf[sign+1] == 'x' || buf[sign+1] == 'X' ||
14246 buf[sign+1] == 'o');
14247 numnondigits -= 2;
14248 buf += 2;
14249 len -= 2;
14250 if (sign)
14251 buf[0] = '-';
14252 assert(len == numnondigits + numdigits);
14253 assert(numdigits > 0);
14254 }
14255
14256 /* Fill with leading zeroes to meet minimum width. */
14257 if (prec > numdigits) {
14258 PyObject *r1 = PyBytes_FromStringAndSize(NULL,
14259 numnondigits + prec);
14260 char *b1;
14261 if (!r1) {
14262 Py_DECREF(result);
14263 return NULL;
14264 }
14265 b1 = PyBytes_AS_STRING(r1);
14266 for (i = 0; i < numnondigits; ++i)
14267 *b1++ = *buf++;
14268 for (i = 0; i < prec - numdigits; i++)
14269 *b1++ = '0';
14270 for (i = 0; i < numdigits; i++)
14271 *b1++ = *buf++;
14272 *b1 = '\0';
14273 Py_DECREF(result);
14274 result = r1;
14275 buf = PyBytes_AS_STRING(result);
14276 len = numnondigits + prec;
14277 }
14278
14279 /* Fix up case for hex conversions. */
14280 if (type == 'X') {
14281 /* Need to convert all lower case letters to upper case.
14282 and need to convert 0x to 0X (and -0x to -0X). */
14283 for (i = 0; i < len; i++)
14284 if (buf[i] >= 'a' && buf[i] <= 'x')
14285 buf[i] -= 'a'-'A';
14286 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014287 if (!PyUnicode_Check(result)
14288 || buf != PyUnicode_DATA(result)) {
Victor Stinnerd0880d52012-04-27 23:40:13 +020014289 PyObject *unicode;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014290 unicode = _PyUnicode_FromASCII(buf, len);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014291 Py_DECREF(result);
14292 result = unicode;
14293 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014294 else if (len != PyUnicode_GET_LENGTH(result)) {
14295 if (PyUnicode_Resize(&result, len) < 0)
14296 Py_CLEAR(result);
14297 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000014298 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000014299}
14300
Ethan Furmandf3ed242014-01-05 06:50:30 -080014301/* Format an integer or a float as an integer.
Victor Stinner621ef3d2012-10-02 00:33:47 +020014302 * Return 1 if the number has been formatted into the writer,
Victor Stinnera47082312012-10-04 02:19:54 +020014303 * 0 if the number has been formatted into *p_output
Victor Stinner621ef3d2012-10-02 00:33:47 +020014304 * -1 and raise an exception on error */
14305static int
Victor Stinnera47082312012-10-04 02:19:54 +020014306mainformatlong(PyObject *v,
14307 struct unicode_format_arg_t *arg,
14308 PyObject **p_output,
14309 _PyUnicodeWriter *writer)
Victor Stinner621ef3d2012-10-02 00:33:47 +020014310{
14311 PyObject *iobj, *res;
Victor Stinnera47082312012-10-04 02:19:54 +020014312 char type = (char)arg->ch;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014313
14314 if (!PyNumber_Check(v))
14315 goto wrongtype;
14316
Ethan Furman9ab74802014-03-21 06:38:46 -070014317 /* make sure number is a type of integer for o, x, and X */
Victor Stinner621ef3d2012-10-02 00:33:47 +020014318 if (!PyLong_Check(v)) {
Ethan Furmandf3ed242014-01-05 06:50:30 -080014319 if (type == 'o' || type == 'x' || type == 'X') {
14320 iobj = PyNumber_Index(v);
14321 if (iobj == NULL) {
Ethan Furman9ab74802014-03-21 06:38:46 -070014322 if (PyErr_ExceptionMatches(PyExc_TypeError))
14323 goto wrongtype;
Ethan Furman38d872e2014-03-19 08:38:52 -070014324 return -1;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014325 }
14326 }
14327 else {
14328 iobj = PyNumber_Long(v);
14329 if (iobj == NULL ) {
14330 if (PyErr_ExceptionMatches(PyExc_TypeError))
14331 goto wrongtype;
14332 return -1;
14333 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014334 }
14335 assert(PyLong_Check(iobj));
14336 }
14337 else {
14338 iobj = v;
14339 Py_INCREF(iobj);
14340 }
14341
14342 if (PyLong_CheckExact(v)
Victor Stinnera47082312012-10-04 02:19:54 +020014343 && arg->width == -1 && arg->prec == -1
14344 && !(arg->flags & (F_SIGN | F_BLANK))
14345 && type != 'X')
Victor Stinner621ef3d2012-10-02 00:33:47 +020014346 {
14347 /* Fast path */
Victor Stinnera47082312012-10-04 02:19:54 +020014348 int alternate = arg->flags & F_ALT;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014349 int base;
14350
Victor Stinnera47082312012-10-04 02:19:54 +020014351 switch(type)
Victor Stinner621ef3d2012-10-02 00:33:47 +020014352 {
14353 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070014354 Py_UNREACHABLE();
Victor Stinner621ef3d2012-10-02 00:33:47 +020014355 case 'd':
14356 case 'i':
14357 case 'u':
14358 base = 10;
14359 break;
14360 case 'o':
14361 base = 8;
14362 break;
14363 case 'x':
14364 case 'X':
14365 base = 16;
14366 break;
14367 }
14368
Victor Stinnerc89d28f2012-10-02 12:54:07 +020014369 if (_PyLong_FormatWriter(writer, v, base, alternate) == -1) {
14370 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014371 return -1;
Victor Stinnerc89d28f2012-10-02 12:54:07 +020014372 }
14373 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014374 return 1;
14375 }
14376
Ethan Furmanb95b5612015-01-23 20:05:18 -080014377 res = _PyUnicode_FormatLong(iobj, arg->flags & F_ALT, arg->prec, type);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014378 Py_DECREF(iobj);
14379 if (res == NULL)
14380 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014381 *p_output = res;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014382 return 0;
14383
14384wrongtype:
Ethan Furman9ab74802014-03-21 06:38:46 -070014385 switch(type)
14386 {
14387 case 'o':
14388 case 'x':
14389 case 'X':
14390 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020014391 "%%%c format: an integer is required, "
14392 "not %.200s",
14393 type, Py_TYPE(v)->tp_name);
Ethan Furman9ab74802014-03-21 06:38:46 -070014394 break;
14395 default:
14396 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020014397 "%%%c format: a number is required, "
14398 "not %.200s",
14399 type, Py_TYPE(v)->tp_name);
Ethan Furman9ab74802014-03-21 06:38:46 -070014400 break;
14401 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014402 return -1;
14403}
14404
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014405static Py_UCS4
14406formatchar(PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014407{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000014408 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000014409 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014410 if (PyUnicode_GET_LENGTH(v) == 1) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014411 return PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000014412 }
Benjamin Peterson29060642009-01-31 22:14:21 +000014413 goto onError;
14414 }
14415 else {
Ethan Furmandf3ed242014-01-05 06:50:30 -080014416 PyObject *iobj;
Benjamin Peterson29060642009-01-31 22:14:21 +000014417 long x;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014418 /* make sure number is a type of integer */
14419 if (!PyLong_Check(v)) {
14420 iobj = PyNumber_Index(v);
14421 if (iobj == NULL) {
Ethan Furman38d872e2014-03-19 08:38:52 -070014422 goto onError;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014423 }
Xiang Zhangea1cf872016-12-22 15:30:47 +080014424 x = PyLong_AsLong(iobj);
Ethan Furmandf3ed242014-01-05 06:50:30 -080014425 Py_DECREF(iobj);
14426 }
Xiang Zhangea1cf872016-12-22 15:30:47 +080014427 else {
14428 x = PyLong_AsLong(v);
14429 }
Benjamin Peterson29060642009-01-31 22:14:21 +000014430 if (x == -1 && PyErr_Occurred())
14431 goto onError;
14432
Victor Stinner8faf8212011-12-08 22:14:11 +010014433 if (x < 0 || x > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014434 PyErr_SetString(PyExc_OverflowError,
14435 "%c arg not in range(0x110000)");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014436 return (Py_UCS4) -1;
Benjamin Peterson29060642009-01-31 22:14:21 +000014437 }
14438
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014439 return (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014440 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000014441
Benjamin Peterson29060642009-01-31 22:14:21 +000014442 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000014443 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000014444 "%c requires int or char");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014445 return (Py_UCS4) -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014446}
14447
Victor Stinnera47082312012-10-04 02:19:54 +020014448/* Parse options of an argument: flags, width, precision.
14449 Handle also "%(name)" syntax.
14450
14451 Return 0 if the argument has been formatted into arg->str.
14452 Return 1 if the argument has been written into ctx->writer,
14453 Raise an exception and return -1 on error. */
14454static int
14455unicode_format_arg_parse(struct unicode_formatter_t *ctx,
14456 struct unicode_format_arg_t *arg)
14457{
14458#define FORMAT_READ(ctx) \
14459 PyUnicode_READ((ctx)->fmtkind, (ctx)->fmtdata, (ctx)->fmtpos)
14460
14461 PyObject *v;
14462
Victor Stinnera47082312012-10-04 02:19:54 +020014463 if (arg->ch == '(') {
14464 /* Get argument value from a dictionary. Example: "%(name)s". */
14465 Py_ssize_t keystart;
14466 Py_ssize_t keylen;
14467 PyObject *key;
14468 int pcount = 1;
14469
14470 if (ctx->dict == NULL) {
14471 PyErr_SetString(PyExc_TypeError,
14472 "format requires a mapping");
14473 return -1;
14474 }
14475 ++ctx->fmtpos;
14476 --ctx->fmtcnt;
14477 keystart = ctx->fmtpos;
14478 /* Skip over balanced parentheses */
14479 while (pcount > 0 && --ctx->fmtcnt >= 0) {
14480 arg->ch = FORMAT_READ(ctx);
14481 if (arg->ch == ')')
14482 --pcount;
14483 else if (arg->ch == '(')
14484 ++pcount;
14485 ctx->fmtpos++;
14486 }
14487 keylen = ctx->fmtpos - keystart - 1;
14488 if (ctx->fmtcnt < 0 || pcount > 0) {
14489 PyErr_SetString(PyExc_ValueError,
14490 "incomplete format key");
14491 return -1;
14492 }
14493 key = PyUnicode_Substring(ctx->fmtstr,
14494 keystart, keystart + keylen);
14495 if (key == NULL)
14496 return -1;
14497 if (ctx->args_owned) {
Victor Stinnera47082312012-10-04 02:19:54 +020014498 ctx->args_owned = 0;
Serhiy Storchaka191321d2015-12-27 15:41:34 +020014499 Py_DECREF(ctx->args);
Victor Stinnera47082312012-10-04 02:19:54 +020014500 }
14501 ctx->args = PyObject_GetItem(ctx->dict, key);
14502 Py_DECREF(key);
14503 if (ctx->args == NULL)
14504 return -1;
14505 ctx->args_owned = 1;
14506 ctx->arglen = -1;
14507 ctx->argidx = -2;
14508 }
14509
14510 /* Parse flags. Example: "%+i" => flags=F_SIGN. */
Victor Stinnera47082312012-10-04 02:19:54 +020014511 while (--ctx->fmtcnt >= 0) {
14512 arg->ch = FORMAT_READ(ctx);
14513 ctx->fmtpos++;
14514 switch (arg->ch) {
14515 case '-': arg->flags |= F_LJUST; continue;
14516 case '+': arg->flags |= F_SIGN; continue;
14517 case ' ': arg->flags |= F_BLANK; continue;
14518 case '#': arg->flags |= F_ALT; continue;
14519 case '0': arg->flags |= F_ZERO; continue;
14520 }
14521 break;
14522 }
14523
14524 /* Parse width. Example: "%10s" => width=10 */
Victor Stinnera47082312012-10-04 02:19:54 +020014525 if (arg->ch == '*') {
14526 v = unicode_format_getnextarg(ctx);
14527 if (v == NULL)
14528 return -1;
14529 if (!PyLong_Check(v)) {
14530 PyErr_SetString(PyExc_TypeError,
14531 "* wants int");
14532 return -1;
14533 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020014534 arg->width = PyLong_AsSsize_t(v);
Victor Stinnera47082312012-10-04 02:19:54 +020014535 if (arg->width == -1 && PyErr_Occurred())
14536 return -1;
14537 if (arg->width < 0) {
14538 arg->flags |= F_LJUST;
14539 arg->width = -arg->width;
14540 }
14541 if (--ctx->fmtcnt >= 0) {
14542 arg->ch = FORMAT_READ(ctx);
14543 ctx->fmtpos++;
14544 }
14545 }
14546 else if (arg->ch >= '0' && arg->ch <= '9') {
14547 arg->width = arg->ch - '0';
14548 while (--ctx->fmtcnt >= 0) {
14549 arg->ch = FORMAT_READ(ctx);
14550 ctx->fmtpos++;
14551 if (arg->ch < '0' || arg->ch > '9')
14552 break;
14553 /* Since arg->ch is unsigned, the RHS would end up as unsigned,
14554 mixing signed and unsigned comparison. Since arg->ch is between
14555 '0' and '9', casting to int is safe. */
14556 if (arg->width > (PY_SSIZE_T_MAX - ((int)arg->ch - '0')) / 10) {
14557 PyErr_SetString(PyExc_ValueError,
14558 "width too big");
14559 return -1;
14560 }
14561 arg->width = arg->width*10 + (arg->ch - '0');
14562 }
14563 }
14564
14565 /* Parse precision. Example: "%.3f" => prec=3 */
Victor Stinnera47082312012-10-04 02:19:54 +020014566 if (arg->ch == '.') {
14567 arg->prec = 0;
14568 if (--ctx->fmtcnt >= 0) {
14569 arg->ch = FORMAT_READ(ctx);
14570 ctx->fmtpos++;
14571 }
14572 if (arg->ch == '*') {
14573 v = unicode_format_getnextarg(ctx);
14574 if (v == NULL)
14575 return -1;
14576 if (!PyLong_Check(v)) {
14577 PyErr_SetString(PyExc_TypeError,
14578 "* wants int");
14579 return -1;
14580 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020014581 arg->prec = _PyLong_AsInt(v);
Victor Stinnera47082312012-10-04 02:19:54 +020014582 if (arg->prec == -1 && PyErr_Occurred())
14583 return -1;
14584 if (arg->prec < 0)
14585 arg->prec = 0;
14586 if (--ctx->fmtcnt >= 0) {
14587 arg->ch = FORMAT_READ(ctx);
14588 ctx->fmtpos++;
14589 }
14590 }
14591 else if (arg->ch >= '0' && arg->ch <= '9') {
14592 arg->prec = arg->ch - '0';
14593 while (--ctx->fmtcnt >= 0) {
14594 arg->ch = FORMAT_READ(ctx);
14595 ctx->fmtpos++;
14596 if (arg->ch < '0' || arg->ch > '9')
14597 break;
14598 if (arg->prec > (INT_MAX - ((int)arg->ch - '0')) / 10) {
14599 PyErr_SetString(PyExc_ValueError,
Victor Stinner3921e902012-10-06 23:05:00 +020014600 "precision too big");
Victor Stinnera47082312012-10-04 02:19:54 +020014601 return -1;
14602 }
14603 arg->prec = arg->prec*10 + (arg->ch - '0');
14604 }
14605 }
14606 }
14607
14608 /* Ignore "h", "l" and "L" format prefix (ex: "%hi" or "%ls") */
14609 if (ctx->fmtcnt >= 0) {
14610 if (arg->ch == 'h' || arg->ch == 'l' || arg->ch == 'L') {
14611 if (--ctx->fmtcnt >= 0) {
14612 arg->ch = FORMAT_READ(ctx);
14613 ctx->fmtpos++;
14614 }
14615 }
14616 }
14617 if (ctx->fmtcnt < 0) {
14618 PyErr_SetString(PyExc_ValueError,
14619 "incomplete format");
14620 return -1;
14621 }
14622 return 0;
14623
14624#undef FORMAT_READ
14625}
14626
14627/* Format one argument. Supported conversion specifiers:
14628
14629 - "s", "r", "a": any type
Ethan Furmandf3ed242014-01-05 06:50:30 -080014630 - "i", "d", "u": int or float
14631 - "o", "x", "X": int
Victor Stinnera47082312012-10-04 02:19:54 +020014632 - "e", "E", "f", "F", "g", "G": float
14633 - "c": int or str (1 character)
14634
Victor Stinner8dbd4212012-12-04 09:30:24 +010014635 When possible, the output is written directly into the Unicode writer
14636 (ctx->writer). A string is created when padding is required.
14637
Victor Stinnera47082312012-10-04 02:19:54 +020014638 Return 0 if the argument has been formatted into *p_str,
14639 1 if the argument has been written into ctx->writer,
Victor Stinner8dbd4212012-12-04 09:30:24 +010014640 -1 on error. */
Victor Stinnera47082312012-10-04 02:19:54 +020014641static int
14642unicode_format_arg_format(struct unicode_formatter_t *ctx,
14643 struct unicode_format_arg_t *arg,
14644 PyObject **p_str)
14645{
14646 PyObject *v;
14647 _PyUnicodeWriter *writer = &ctx->writer;
14648
14649 if (ctx->fmtcnt == 0)
14650 ctx->writer.overallocate = 0;
14651
Victor Stinnera47082312012-10-04 02:19:54 +020014652 v = unicode_format_getnextarg(ctx);
14653 if (v == NULL)
14654 return -1;
14655
Victor Stinnera47082312012-10-04 02:19:54 +020014656
14657 switch (arg->ch) {
Victor Stinnera47082312012-10-04 02:19:54 +020014658 case 's':
14659 case 'r':
14660 case 'a':
14661 if (PyLong_CheckExact(v) && arg->width == -1 && arg->prec == -1) {
14662 /* Fast path */
14663 if (_PyLong_FormatWriter(writer, v, 10, arg->flags & F_ALT) == -1)
14664 return -1;
14665 return 1;
14666 }
14667
14668 if (PyUnicode_CheckExact(v) && arg->ch == 's') {
14669 *p_str = v;
14670 Py_INCREF(*p_str);
14671 }
14672 else {
14673 if (arg->ch == 's')
14674 *p_str = PyObject_Str(v);
14675 else if (arg->ch == 'r')
14676 *p_str = PyObject_Repr(v);
14677 else
14678 *p_str = PyObject_ASCII(v);
14679 }
14680 break;
14681
14682 case 'i':
14683 case 'd':
14684 case 'u':
14685 case 'o':
14686 case 'x':
14687 case 'X':
14688 {
14689 int ret = mainformatlong(v, arg, p_str, writer);
14690 if (ret != 0)
14691 return ret;
14692 arg->sign = 1;
14693 break;
14694 }
14695
14696 case 'e':
14697 case 'E':
14698 case 'f':
14699 case 'F':
14700 case 'g':
14701 case 'G':
14702 if (arg->width == -1 && arg->prec == -1
14703 && !(arg->flags & (F_SIGN | F_BLANK)))
14704 {
14705 /* Fast path */
14706 if (formatfloat(v, arg, NULL, writer) == -1)
14707 return -1;
14708 return 1;
14709 }
14710
14711 arg->sign = 1;
14712 if (formatfloat(v, arg, p_str, NULL) == -1)
14713 return -1;
14714 break;
14715
14716 case 'c':
14717 {
14718 Py_UCS4 ch = formatchar(v);
14719 if (ch == (Py_UCS4) -1)
14720 return -1;
14721 if (arg->width == -1 && arg->prec == -1) {
14722 /* Fast path */
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020014723 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0)
Victor Stinnera47082312012-10-04 02:19:54 +020014724 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014725 return 1;
14726 }
14727 *p_str = PyUnicode_FromOrdinal(ch);
14728 break;
14729 }
14730
14731 default:
14732 PyErr_Format(PyExc_ValueError,
14733 "unsupported format character '%c' (0x%x) "
Victor Stinnera33bce02014-07-04 22:47:46 +020014734 "at index %zd",
Victor Stinnera47082312012-10-04 02:19:54 +020014735 (31<=arg->ch && arg->ch<=126) ? (char)arg->ch : '?',
14736 (int)arg->ch,
14737 ctx->fmtpos - 1);
14738 return -1;
14739 }
14740 if (*p_str == NULL)
14741 return -1;
14742 assert (PyUnicode_Check(*p_str));
14743 return 0;
14744}
14745
/* Write the already formatted argument str into ctx->writer, applying the
   sign, width, precision, fill and justification options stored in arg.

   arg->width and arg->sign are modified while the output is laid out.

   Return 0 on success, -1 if readying str or writing to / preparing the
   writer fails. */
static int
unicode_format_arg_output(struct unicode_formatter_t *ctx,
                          struct unicode_format_arg_t *arg,
                          PyObject *str)
{
    Py_ssize_t len;
    enum PyUnicode_Kind kind;
    void *pbuf;
    Py_ssize_t pindex;
    Py_UCS4 signchar;
    Py_ssize_t buflen;
    Py_UCS4 maxchar;
    Py_ssize_t sublen;
    _PyUnicodeWriter *writer = &ctx->writer;
    Py_UCS4 fill;

    /* Zero padding is only used when arg->sign is set */
    fill = ' ';
    if (arg->sign && arg->flags & F_ZERO)
        fill = '0';

    if (PyUnicode_READY(str) == -1)
        return -1;

    len = PyUnicode_GET_LENGTH(str);
    if ((arg->width == -1 || arg->width <= len)
        && (arg->prec == -1 || arg->prec >= len)
        && !(arg->flags & (F_SIGN | F_BLANK)))
    {
        /* Fast path: no padding, no truncation and no sign to add */
        if (_PyUnicodeWriter_WriteStr(writer, str) == -1)
            return -1;
        return 0;
    }

    /* Truncate the string for "s", "r" and "a" formats
       if the precision is set */
    if (arg->ch == 's' || arg->ch == 'r' || arg->ch == 'a') {
        if (arg->prec >= 0 && len > arg->prec)
            len = arg->prec;
    }

    /* Adjust sign and width */
    kind = PyUnicode_KIND(str);
    pbuf = PyUnicode_DATA(str);
    pindex = 0;
    signchar = '\0';
    if (arg->sign) {
        Py_UCS4 ch = PyUnicode_READ(kind, pbuf, pindex);
        if (ch == '-' || ch == '+') {
            /* The sign is written separately: skip it in str */
            signchar = ch;
            len--;
            pindex++;
        }
        else if (arg->flags & F_SIGN)
            signchar = '+';
        else if (arg->flags & F_BLANK)
            signchar = ' ';
        else
            /* No sign character has to be written */
            arg->sign = 0;
    }
    if (arg->width < len)
        arg->width = len;

    /* Prepare the writer */
    maxchar = writer->maxchar;
    /* The fill character only counts when left padding will be written */
    if (!(arg->flags & F_LJUST)) {
        if (arg->sign) {
            if ((arg->width-1) > len)
                maxchar = Py_MAX(maxchar, fill);
        }
        else {
            if (arg->width > len)
                maxchar = Py_MAX(maxchar, fill);
        }
    }
    if (PyUnicode_MAX_CHAR_VALUE(str) > maxchar) {
        Py_UCS4 strmaxchar = _PyUnicode_FindMaxChar(str, 0, pindex+len);
        maxchar = Py_MAX(maxchar, strmaxchar);
    }

    /* Reserve one extra character for the sign when the width leaves
       no room for it */
    buflen = arg->width;
    if (arg->sign && len == arg->width)
        buflen++;
    if (_PyUnicodeWriter_Prepare(writer, buflen, maxchar) == -1)
        return -1;

    /* Write the sign if needed */
    if (arg->sign) {
        /* With a non-space fill, the sign comes before the padding */
        if (fill != ' ') {
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
            writer->pos += 1;
        }
        /* The sign uses one column of the field width */
        if (arg->width > len)
            arg->width--;
    }

    /* Write the numeric prefix for "x", "X" and "o" formats
       if the alternate form is used.
       For example, write "0x" for the "%#x" format. */
    if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
        assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
        assert(PyUnicode_READ(kind, pbuf, pindex + 1) == arg->ch);
        /* With a non-space fill, the prefix comes before the padding;
           otherwise it is written after the padding, further below */
        if (fill != ' ') {
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
            writer->pos += 2;
            pindex += 2;
        }
        /* The two prefix characters are accounted for separately from
           the characters copied from str */
        arg->width -= 2;
        if (arg->width < 0)
            arg->width = 0;
        len -= 2;
    }

    /* Pad left with the fill character if needed */
    if (arg->width > len && !(arg->flags & F_LJUST)) {
        sublen = arg->width - len;
        unicode_fill(writer->kind, writer->data, fill, writer->pos, sublen);
        writer->pos += sublen;
        /* Nothing is left to pad on the right */
        arg->width = len;
    }

    /* If padding with spaces: write sign if needed and/or numeric prefix if
       the alternate form is used */
    if (fill == ' ') {
        if (arg->sign) {
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
            writer->pos += 1;
        }
        if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
            assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
            assert(PyUnicode_READ(kind, pbuf, pindex+1) == arg->ch);
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
            writer->pos += 2;
            pindex += 2;
        }
    }

    /* Write characters */
    if (len) {
        _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
                                      str, pindex, len);
        writer->pos += len;
    }

    /* Pad right with spaces if needed (the fill character is only used
       for left padding) */
    if (arg->width > len) {
        sublen = arg->width - len;
        unicode_fill(writer->kind, writer->data, ' ', writer->pos, sublen);
        writer->pos += sublen;
    }
    return 0;
}
14900
14901/* Helper of PyUnicode_Format(): format one arg.
14902 Return 0 on success, raise an exception and return -1 on error. */
14903static int
14904unicode_format_arg(struct unicode_formatter_t *ctx)
14905{
14906 struct unicode_format_arg_t arg;
14907 PyObject *str;
14908 int ret;
14909
Victor Stinner8dbd4212012-12-04 09:30:24 +010014910 arg.ch = PyUnicode_READ(ctx->fmtkind, ctx->fmtdata, ctx->fmtpos);
Serhiy Storchaka9f8ad3f2017-03-08 05:51:19 +020014911 if (arg.ch == '%') {
14912 ctx->fmtpos++;
14913 ctx->fmtcnt--;
14914 if (_PyUnicodeWriter_WriteCharInline(&ctx->writer, '%') < 0)
14915 return -1;
14916 return 0;
14917 }
Victor Stinner8dbd4212012-12-04 09:30:24 +010014918 arg.flags = 0;
14919 arg.width = -1;
14920 arg.prec = -1;
14921 arg.sign = 0;
14922 str = NULL;
14923
Victor Stinnera47082312012-10-04 02:19:54 +020014924 ret = unicode_format_arg_parse(ctx, &arg);
14925 if (ret == -1)
14926 return -1;
14927
14928 ret = unicode_format_arg_format(ctx, &arg, &str);
14929 if (ret == -1)
14930 return -1;
14931
14932 if (ret != 1) {
14933 ret = unicode_format_arg_output(ctx, &arg, str);
14934 Py_DECREF(str);
14935 if (ret == -1)
14936 return -1;
14937 }
14938
Serhiy Storchaka9f8ad3f2017-03-08 05:51:19 +020014939 if (ctx->dict && (ctx->argidx < ctx->arglen)) {
Victor Stinnera47082312012-10-04 02:19:54 +020014940 PyErr_SetString(PyExc_TypeError,
14941 "not all arguments converted during string formatting");
14942 return -1;
14943 }
14944 return 0;
14945}
14946
Alexander Belopolsky40018472011-02-26 01:02:56 +000014947PyObject *
14948PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014949{
Victor Stinnera47082312012-10-04 02:19:54 +020014950 struct unicode_formatter_t ctx;
Tim Petersced69f82003-09-16 20:30:58 +000014951
Guido van Rossumd57fd912000-03-10 22:53:23 +000014952 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014953 PyErr_BadInternalCall();
14954 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014955 }
Victor Stinnera47082312012-10-04 02:19:54 +020014956
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030014957 if (ensure_unicode(format) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000014958 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030014959
14960 ctx.fmtstr = format;
Victor Stinnera47082312012-10-04 02:19:54 +020014961 ctx.fmtdata = PyUnicode_DATA(ctx.fmtstr);
14962 ctx.fmtkind = PyUnicode_KIND(ctx.fmtstr);
14963 ctx.fmtcnt = PyUnicode_GET_LENGTH(ctx.fmtstr);
14964 ctx.fmtpos = 0;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020014965
Victor Stinner8f674cc2013-04-17 23:02:17 +020014966 _PyUnicodeWriter_Init(&ctx.writer);
14967 ctx.writer.min_length = ctx.fmtcnt + 100;
14968 ctx.writer.overallocate = 1;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020014969
Guido van Rossumd57fd912000-03-10 22:53:23 +000014970 if (PyTuple_Check(args)) {
Victor Stinnera47082312012-10-04 02:19:54 +020014971 ctx.arglen = PyTuple_Size(args);
14972 ctx.argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014973 }
14974 else {
Victor Stinnera47082312012-10-04 02:19:54 +020014975 ctx.arglen = -1;
14976 ctx.argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014977 }
Victor Stinnera47082312012-10-04 02:19:54 +020014978 ctx.args_owned = 0;
Benjamin Peterson28a6cfa2012-08-28 17:55:35 -040014979 if (PyMapping_Check(args) && !PyTuple_Check(args) && !PyUnicode_Check(args))
Victor Stinnera47082312012-10-04 02:19:54 +020014980 ctx.dict = args;
14981 else
14982 ctx.dict = NULL;
14983 ctx.args = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014984
Victor Stinnera47082312012-10-04 02:19:54 +020014985 while (--ctx.fmtcnt >= 0) {
14986 if (PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
Victor Stinnercfc4c132013-04-03 01:48:39 +020014987 Py_ssize_t nonfmtpos;
Victor Stinnera47082312012-10-04 02:19:54 +020014988
14989 nonfmtpos = ctx.fmtpos++;
14990 while (ctx.fmtcnt >= 0 &&
14991 PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
14992 ctx.fmtpos++;
14993 ctx.fmtcnt--;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014994 }
Victor Stinnera47082312012-10-04 02:19:54 +020014995 if (ctx.fmtcnt < 0) {
14996 ctx.fmtpos--;
14997 ctx.writer.overallocate = 0;
Victor Stinnera0494432012-10-03 23:03:46 +020014998 }
Victor Stinneree4544c2012-05-09 22:24:08 +020014999
Victor Stinnercfc4c132013-04-03 01:48:39 +020015000 if (_PyUnicodeWriter_WriteSubstring(&ctx.writer, ctx.fmtstr,
15001 nonfmtpos, ctx.fmtpos) < 0)
15002 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015003 }
15004 else {
Victor Stinnera47082312012-10-04 02:19:54 +020015005 ctx.fmtpos++;
15006 if (unicode_format_arg(&ctx) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000015007 goto onError;
Victor Stinnera47082312012-10-04 02:19:54 +020015008 }
15009 }
Victor Stinneraff3cc62012-04-30 05:19:21 +020015010
Victor Stinnera47082312012-10-04 02:19:54 +020015011 if (ctx.argidx < ctx.arglen && !ctx.dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000015012 PyErr_SetString(PyExc_TypeError,
15013 "not all arguments converted during string formatting");
15014 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015015 }
15016
Victor Stinnera47082312012-10-04 02:19:54 +020015017 if (ctx.args_owned) {
15018 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000015019 }
Victor Stinnera47082312012-10-04 02:19:54 +020015020 return _PyUnicodeWriter_Finish(&ctx.writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +000015021
Benjamin Peterson29060642009-01-31 22:14:21 +000015022 onError:
Victor Stinnera47082312012-10-04 02:19:54 +020015023 _PyUnicodeWriter_Dealloc(&ctx.writer);
15024 if (ctx.args_owned) {
15025 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000015026 }
15027 return NULL;
15028}
15029
Jeremy Hylton938ace62002-07-17 16:30:39 +000015030static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000015031unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
15032
Tim Peters6d6c1a32001-08-02 04:15:00 +000015033static PyObject *
15034unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
15035{
Benjamin Peterson29060642009-01-31 22:14:21 +000015036 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015037 static char *kwlist[] = {"object", "encoding", "errors", 0};
15038 char *encoding = NULL;
15039 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000015040
Benjamin Peterson14339b62009-01-31 16:36:08 +000015041 if (type != &PyUnicode_Type)
15042 return unicode_subtype_new(type, args, kwds);
15043 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000015044 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000015045 return NULL;
15046 if (x == NULL)
Serhiy Storchaka678db842013-01-26 12:16:36 +020015047 _Py_RETURN_UNICODE_EMPTY();
Benjamin Peterson14339b62009-01-31 16:36:08 +000015048 if (encoding == NULL && errors == NULL)
15049 return PyObject_Str(x);
15050 else
Benjamin Peterson29060642009-01-31 22:14:21 +000015051 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000015052}
15053
Guido van Rossume023fe02001-08-30 03:12:59 +000015054static PyObject *
15055unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
15056{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015057 PyObject *unicode, *self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015058 Py_ssize_t length, char_size;
15059 int share_wstr, share_utf8;
15060 unsigned int kind;
15061 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000015062
Benjamin Peterson14339b62009-01-31 16:36:08 +000015063 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015064
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015065 unicode = unicode_new(&PyUnicode_Type, args, kwds);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015066 if (unicode == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000015067 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020015068 assert(_PyUnicode_CHECK(unicode));
Benjamin Petersonbac79492012-01-14 13:34:47 -050015069 if (PyUnicode_READY(unicode) == -1) {
Benjamin Peterson22a29702012-01-02 09:00:30 -060015070 Py_DECREF(unicode);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015071 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060015072 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015073
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015074 self = type->tp_alloc(type, 0);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015075 if (self == NULL) {
15076 Py_DECREF(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015077 return NULL;
15078 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015079 kind = PyUnicode_KIND(unicode);
15080 length = PyUnicode_GET_LENGTH(unicode);
15081
15082 _PyUnicode_LENGTH(self) = length;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020015083#ifdef Py_DEBUG
15084 _PyUnicode_HASH(self) = -1;
15085#else
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015086 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020015087#endif
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015088 _PyUnicode_STATE(self).interned = 0;
15089 _PyUnicode_STATE(self).kind = kind;
15090 _PyUnicode_STATE(self).compact = 0;
Victor Stinner3cf46372011-10-03 14:42:15 +020015091 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015092 _PyUnicode_STATE(self).ready = 1;
15093 _PyUnicode_WSTR(self) = NULL;
15094 _PyUnicode_UTF8_LENGTH(self) = 0;
15095 _PyUnicode_UTF8(self) = NULL;
15096 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020015097 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015098
15099 share_utf8 = 0;
15100 share_wstr = 0;
15101 if (kind == PyUnicode_1BYTE_KIND) {
15102 char_size = 1;
15103 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
15104 share_utf8 = 1;
15105 }
15106 else if (kind == PyUnicode_2BYTE_KIND) {
15107 char_size = 2;
15108 if (sizeof(wchar_t) == 2)
15109 share_wstr = 1;
15110 }
15111 else {
15112 assert(kind == PyUnicode_4BYTE_KIND);
15113 char_size = 4;
15114 if (sizeof(wchar_t) == 4)
15115 share_wstr = 1;
15116 }
15117
15118 /* Ensure we won't overflow the length. */
15119 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
15120 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015121 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015122 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015123 data = PyObject_MALLOC((length + 1) * char_size);
15124 if (data == NULL) {
15125 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015126 goto onError;
15127 }
15128
Victor Stinnerc3c74152011-10-02 20:39:55 +020015129 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015130 if (share_utf8) {
15131 _PyUnicode_UTF8_LENGTH(self) = length;
15132 _PyUnicode_UTF8(self) = data;
15133 }
15134 if (share_wstr) {
15135 _PyUnicode_WSTR_LENGTH(self) = length;
15136 _PyUnicode_WSTR(self) = (wchar_t *)data;
15137 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015138
Christian Heimesf051e432016-09-13 20:22:02 +020015139 memcpy(data, PyUnicode_DATA(unicode),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020015140 kind * (length + 1));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020015141 assert(_PyUnicode_CheckConsistency(self, 1));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020015142#ifdef Py_DEBUG
15143 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
15144#endif
Victor Stinnerdd18d3a2011-10-22 11:08:10 +020015145 Py_DECREF(unicode);
Victor Stinner7931d9a2011-11-04 00:22:48 +010015146 return self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015147
15148onError:
15149 Py_DECREF(unicode);
15150 Py_DECREF(self);
15151 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000015152}
15153
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000015154PyDoc_STRVAR(unicode_doc,
Chris Jerdonek83fe2e12012-10-07 14:48:36 -070015155"str(object='') -> str\n\
15156str(bytes_or_buffer[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +000015157\n\
Nick Coghlan573b1fd2012-08-16 14:13:07 +100015158Create a new string object from the given object. If encoding or\n\
15159errors is specified, then the object must expose a data buffer\n\
15160that will be decoded using the given encoding and error handler.\n\
15161Otherwise, returns the result of object.__str__() (if defined)\n\
15162or repr(object).\n\
15163encoding defaults to sys.getdefaultencoding().\n\
15164errors defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000015165
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015166static PyObject *unicode_iter(PyObject *seq);
15167
Guido van Rossumd57fd912000-03-10 22:53:23 +000015168PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +000015169 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Bupfc93bd42018-06-19 03:59:55 -050015170 "str", /* tp_name */
15171 sizeof(PyUnicodeObject), /* tp_basicsize */
15172 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +000015173 /* Slots */
Bupfc93bd42018-06-19 03:59:55 -050015174 (destructor)unicode_dealloc, /* tp_dealloc */
Jeroen Demeyer530f5062019-05-31 04:13:39 +020015175 0, /* tp_vectorcall_offset */
Bupfc93bd42018-06-19 03:59:55 -050015176 0, /* tp_getattr */
15177 0, /* tp_setattr */
Jeroen Demeyer530f5062019-05-31 04:13:39 +020015178 0, /* tp_as_async */
Bupfc93bd42018-06-19 03:59:55 -050015179 unicode_repr, /* tp_repr */
15180 &unicode_as_number, /* tp_as_number */
15181 &unicode_as_sequence, /* tp_as_sequence */
15182 &unicode_as_mapping, /* tp_as_mapping */
15183 (hashfunc) unicode_hash, /* tp_hash*/
15184 0, /* tp_call*/
15185 (reprfunc) unicode_str, /* tp_str */
15186 PyObject_GenericGetAttr, /* tp_getattro */
15187 0, /* tp_setattro */
15188 0, /* tp_as_buffer */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015189 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Bupfc93bd42018-06-19 03:59:55 -050015190 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
15191 unicode_doc, /* tp_doc */
15192 0, /* tp_traverse */
15193 0, /* tp_clear */
15194 PyUnicode_RichCompare, /* tp_richcompare */
15195 0, /* tp_weaklistoffset */
15196 unicode_iter, /* tp_iter */
15197 0, /* tp_iternext */
15198 unicode_methods, /* tp_methods */
15199 0, /* tp_members */
15200 0, /* tp_getset */
15201 &PyBaseObject_Type, /* tp_base */
15202 0, /* tp_dict */
15203 0, /* tp_descr_get */
15204 0, /* tp_descr_set */
15205 0, /* tp_dictoffset */
15206 0, /* tp_init */
15207 0, /* tp_alloc */
15208 unicode_new, /* tp_new */
15209 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +000015210};
15211
15212/* Initialize the Unicode implementation */
15213
Victor Stinner331a6a52019-05-27 16:39:22 +020015214PyStatus
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015215_PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000015216{
Thomas Wouters477c8d52006-05-27 19:21:47 +000015217 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015218 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000015219 0x000A, /* LINE FEED */
15220 0x000D, /* CARRIAGE RETURN */
15221 0x001C, /* FILE SEPARATOR */
15222 0x001D, /* GROUP SEPARATOR */
15223 0x001E, /* RECORD SEPARATOR */
15224 0x0085, /* NEXT LINE */
15225 0x2028, /* LINE SEPARATOR */
15226 0x2029, /* PARAGRAPH SEPARATOR */
15227 };
15228
Fred Drakee4315f52000-05-09 19:53:39 +000015229 /* Init the implementation */
Serhiy Storchaka678db842013-01-26 12:16:36 +020015230 _Py_INCREF_UNICODE_EMPTY();
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015231 if (!unicode_empty) {
Victor Stinner331a6a52019-05-27 16:39:22 +020015232 return _PyStatus_ERR("Can't create empty string");
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015233 }
Serhiy Storchaka678db842013-01-26 12:16:36 +020015234 Py_DECREF(unicode_empty);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000015235
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015236 if (PyType_Ready(&PyUnicode_Type) < 0) {
Victor Stinner331a6a52019-05-27 16:39:22 +020015237 return _PyStatus_ERR("Can't initialize unicode type");
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015238 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000015239
15240 /* initialize the linebreak bloom filter */
15241 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015242 PyUnicode_2BYTE_KIND, linebreak,
Victor Stinner63941882011-09-29 00:42:28 +020015243 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000015244
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015245 if (PyType_Ready(&EncodingMapType) < 0) {
Victor Stinner331a6a52019-05-27 16:39:22 +020015246 return _PyStatus_ERR("Can't initialize encoding map type");
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015247 }
15248 if (PyType_Ready(&PyFieldNameIter_Type) < 0) {
Victor Stinner331a6a52019-05-27 16:39:22 +020015249 return _PyStatus_ERR("Can't initialize field name iterator type");
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015250 }
15251 if (PyType_Ready(&PyFormatterIter_Type) < 0) {
Victor Stinner331a6a52019-05-27 16:39:22 +020015252 return _PyStatus_ERR("Can't initialize formatter iter type");
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015253 }
Victor Stinner331a6a52019-05-27 16:39:22 +020015254 return _PyStatus_OK();
Guido van Rossumd57fd912000-03-10 22:53:23 +000015255}
15256
15257/* Finalize the Unicode implementation */
15258
/* Performs no work in this implementation and always returns 0. */
int
PyUnicode_ClearFreeList(void)
{
    return 0;
}
15264
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000015265
Walter Dörwald16807132007-05-25 13:52:07 +000015266void
15267PyUnicode_InternInPlace(PyObject **p)
15268{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020015269 PyObject *s = *p;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015270 PyObject *t;
Victor Stinner4fae54c2011-10-03 02:01:52 +020015271#ifdef Py_DEBUG
15272 assert(s != NULL);
15273 assert(_PyUnicode_CHECK(s));
15274#else
Benjamin Peterson14339b62009-01-31 16:36:08 +000015275 if (s == NULL || !PyUnicode_Check(s))
Victor Stinner4fae54c2011-10-03 02:01:52 +020015276 return;
15277#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +000015278 /* If it's a subclass, we don't really know what putting
15279 it in the interned dict might do. */
15280 if (!PyUnicode_CheckExact(s))
15281 return;
15282 if (PyUnicode_CHECK_INTERNED(s))
15283 return;
15284 if (interned == NULL) {
15285 interned = PyDict_New();
15286 if (interned == NULL) {
15287 PyErr_Clear(); /* Don't leave an exception */
15288 return;
15289 }
15290 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000015291 Py_ALLOW_RECURSION
Berker Peksagced8d4c2016-07-25 04:40:39 +030015292 t = PyDict_SetDefault(interned, s, s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015293 Py_END_ALLOW_RECURSION
Berker Peksagced8d4c2016-07-25 04:40:39 +030015294 if (t == NULL) {
15295 PyErr_Clear();
15296 return;
15297 }
15298 if (t != s) {
Victor Stinnerf0335102013-04-14 19:13:03 +020015299 Py_INCREF(t);
Serhiy Storchaka57a01d32016-04-10 18:05:40 +030015300 Py_SETREF(*p, t);
Victor Stinnerf0335102013-04-14 19:13:03 +020015301 return;
15302 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000015303 /* The two references in interned are not counted by refcnt.
15304 The deallocator will take care of this */
15305 Py_REFCNT(s) -= 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015306 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000015307}
15308
15309void
15310PyUnicode_InternImmortal(PyObject **p)
15311{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015312 PyUnicode_InternInPlace(p);
15313 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Victor Stinneraf9e4b82011-10-23 20:07:00 +020015314 _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015315 Py_INCREF(*p);
15316 }
Walter Dörwald16807132007-05-25 13:52:07 +000015317}
15318
15319PyObject *
15320PyUnicode_InternFromString(const char *cp)
15321{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015322 PyObject *s = PyUnicode_FromString(cp);
15323 if (s == NULL)
15324 return NULL;
15325 PyUnicode_InternInPlace(&s);
15326 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000015327}
15328
Victor Stinnerfecc4f22019-03-19 14:20:29 +010015329
#if defined(WITH_VALGRIND) || defined(__INSURE__)
/* Undo the refcount adjustments made when strings were interned, mark
   every key of the `interned` dict as not interned, then clear and
   release the dict.  Does nothing when there is no interned dict. */
static void
unicode_release_interned(void)
{
    if (interned == NULL || !PyDict_Check(interned)) {
        return;
    }

    PyObject *keys = PyDict_Keys(interned);
    if (keys == NULL || !PyList_Check(keys)) {
        PyErr_Clear();
        return;
    }

    /* Since unicode_release_interned() is intended to help a leak
       detector, interned unicode strings are not forcibly deallocated;
       rather, we give them their stolen references back, and then clear
       and DECREF the interned dict. */

    Py_ssize_t count = PyList_GET_SIZE(keys);
    Py_ssize_t immortal_size = 0, mortal_size = 0;
#ifdef INTERNED_STATS
    fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
            count);
#endif
    for (Py_ssize_t idx = 0; idx < count; idx++) {
        PyObject *str = PyList_GET_ITEM(keys, idx);
        if (PyUnicode_READY(str) == -1) {
            Py_UNREACHABLE();
        }
        switch (PyUnicode_CHECK_INTERNED(str)) {
        case SSTATE_NOT_INTERNED:
            /* XXX Shouldn't happen */
            break;
        case SSTATE_INTERNED_IMMORTAL:
            Py_REFCNT(str) += 1;
            immortal_size += PyUnicode_GET_LENGTH(str);
            break;
        case SSTATE_INTERNED_MORTAL:
            Py_REFCNT(str) += 2;
            mortal_size += PyUnicode_GET_LENGTH(str);
            break;
        default:
            Py_FatalError("Inconsistent interned string state.");
        }
        _PyUnicode_STATE(str).interned = SSTATE_NOT_INTERNED;
    }
#ifdef INTERNED_STATS
    fprintf(stderr, "total size of all interned strings: "
            "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
            "mortal/immortal\n", mortal_size, immortal_size);
#endif
    Py_DECREF(keys);
    PyDict_Clear(interned);
    Py_CLEAR(interned);
}
#endif
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015389
15390
15391/********************* Unicode Iterator **************************/
15392
15393typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015394 PyObject_HEAD
15395 Py_ssize_t it_index;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015396 PyObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015397} unicodeiterobject;
15398
15399static void
15400unicodeiter_dealloc(unicodeiterobject *it)
15401{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015402 _PyObject_GC_UNTRACK(it);
15403 Py_XDECREF(it->it_seq);
15404 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015405}
15406
15407static int
15408unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
15409{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015410 Py_VISIT(it->it_seq);
15411 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015412}
15413
15414static PyObject *
15415unicodeiter_next(unicodeiterobject *it)
15416{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015417 PyObject *seq, *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015418
Benjamin Peterson14339b62009-01-31 16:36:08 +000015419 assert(it != NULL);
15420 seq = it->it_seq;
15421 if (seq == NULL)
15422 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020015423 assert(_PyUnicode_CHECK(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015424
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015425 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
15426 int kind = PyUnicode_KIND(seq);
15427 void *data = PyUnicode_DATA(seq);
15428 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
15429 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015430 if (item != NULL)
15431 ++it->it_index;
15432 return item;
15433 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015434
Benjamin Peterson14339b62009-01-31 16:36:08 +000015435 it->it_seq = NULL;
Serhiy Storchakafbb1c5e2016-03-30 20:40:02 +030015436 Py_DECREF(seq);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015437 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015438}
15439
15440static PyObject *
Siddhesh Poyarekar55edd0c2018-04-30 00:29:33 +053015441unicodeiter_len(unicodeiterobject *it, PyObject *Py_UNUSED(ignored))
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015442{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015443 Py_ssize_t len = 0;
15444 if (it->it_seq)
Victor Stinnerc4f281e2011-10-11 22:11:42 +020015445 len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015446 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015447}
15448
15449PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
15450
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015451static PyObject *
Siddhesh Poyarekar55edd0c2018-04-30 00:29:33 +053015452unicodeiter_reduce(unicodeiterobject *it, PyObject *Py_UNUSED(ignored))
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015453{
Serhiy Storchakabb86bf42018-12-11 08:28:18 +020015454 _Py_IDENTIFIER(iter);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015455 if (it->it_seq != NULL) {
Serhiy Storchakabb86bf42018-12-11 08:28:18 +020015456 return Py_BuildValue("N(O)n", _PyEval_GetBuiltinId(&PyId_iter),
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015457 it->it_seq, it->it_index);
15458 } else {
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +020015459 PyObject *u = (PyObject *)_PyUnicode_New(0);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015460 if (u == NULL)
15461 return NULL;
Serhiy Storchakabb86bf42018-12-11 08:28:18 +020015462 return Py_BuildValue("N(N)", _PyEval_GetBuiltinId(&PyId_iter), u);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015463 }
15464}
15465
15466PyDoc_STRVAR(reduce_doc, "Return state information for pickling.");
15467
15468static PyObject *
15469unicodeiter_setstate(unicodeiterobject *it, PyObject *state)
15470{
15471 Py_ssize_t index = PyLong_AsSsize_t(state);
15472 if (index == -1 && PyErr_Occurred())
15473 return NULL;
Kristján Valur Jónsson25dded02014-03-05 13:47:57 +000015474 if (it->it_seq != NULL) {
15475 if (index < 0)
15476 index = 0;
15477 else if (index > PyUnicode_GET_LENGTH(it->it_seq))
15478 index = PyUnicode_GET_LENGTH(it->it_seq); /* iterator truncated */
15479 it->it_index = index;
15480 }
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015481 Py_RETURN_NONE;
15482}
15483
15484PyDoc_STRVAR(setstate_doc, "Set state information for unpickling.");
15485
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015486static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015487 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000015488 length_hint_doc},
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015489 {"__reduce__", (PyCFunction)unicodeiter_reduce, METH_NOARGS,
15490 reduce_doc},
15491 {"__setstate__", (PyCFunction)unicodeiter_setstate, METH_O,
15492 setstate_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000015493 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015494};
15495
15496PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015497 PyVarObject_HEAD_INIT(&PyType_Type, 0)
15498 "str_iterator", /* tp_name */
15499 sizeof(unicodeiterobject), /* tp_basicsize */
15500 0, /* tp_itemsize */
15501 /* methods */
15502 (destructor)unicodeiter_dealloc, /* tp_dealloc */
Jeroen Demeyer530f5062019-05-31 04:13:39 +020015503 0, /* tp_vectorcall_offset */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015504 0, /* tp_getattr */
15505 0, /* tp_setattr */
Jeroen Demeyer530f5062019-05-31 04:13:39 +020015506 0, /* tp_as_async */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015507 0, /* tp_repr */
15508 0, /* tp_as_number */
15509 0, /* tp_as_sequence */
15510 0, /* tp_as_mapping */
15511 0, /* tp_hash */
15512 0, /* tp_call */
15513 0, /* tp_str */
15514 PyObject_GenericGetAttr, /* tp_getattro */
15515 0, /* tp_setattro */
15516 0, /* tp_as_buffer */
15517 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
15518 0, /* tp_doc */
15519 (traverseproc)unicodeiter_traverse, /* tp_traverse */
15520 0, /* tp_clear */
15521 0, /* tp_richcompare */
15522 0, /* tp_weaklistoffset */
15523 PyObject_SelfIter, /* tp_iter */
15524 (iternextfunc)unicodeiter_next, /* tp_iternext */
15525 unicodeiter_methods, /* tp_methods */
15526 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015527};
15528
15529static PyObject *
15530unicode_iter(PyObject *seq)
15531{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015532 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015533
Benjamin Peterson14339b62009-01-31 16:36:08 +000015534 if (!PyUnicode_Check(seq)) {
15535 PyErr_BadInternalCall();
15536 return NULL;
15537 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015538 if (PyUnicode_READY(seq) == -1)
15539 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015540 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
15541 if (it == NULL)
15542 return NULL;
15543 it->it_index = 0;
15544 Py_INCREF(seq);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015545 it->it_seq = seq;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015546 _PyObject_GC_TRACK(it);
15547 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015548}
15549
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015550
15551size_t
15552Py_UNICODE_strlen(const Py_UNICODE *u)
15553{
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +020015554 return wcslen(u);
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015555}
15556
15557Py_UNICODE*
15558Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
15559{
15560 Py_UNICODE *u = s1;
15561 while ((*u++ = *s2++));
15562 return s1;
15563}
15564
15565Py_UNICODE*
15566Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
15567{
15568 Py_UNICODE *u = s1;
15569 while ((*u++ = *s2++))
15570 if (n-- == 0)
15571 break;
15572 return s1;
15573}
15574
15575Py_UNICODE*
15576Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
15577{
15578 Py_UNICODE *u1 = s1;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +020015579 u1 += wcslen(u1);
15580 while ((*u1++ = *s2++));
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015581 return s1;
15582}
15583
15584int
15585Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
15586{
15587 while (*s1 && *s2 && *s1 == *s2)
15588 s1++, s2++;
15589 if (*s1 && *s2)
15590 return (*s1 < *s2) ? -1 : +1;
15591 if (*s1)
15592 return 1;
15593 if (*s2)
15594 return -1;
15595 return 0;
15596}
15597
15598int
15599Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
15600{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020015601 Py_UNICODE u1, u2;
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015602 for (; n != 0; n--) {
15603 u1 = *s1;
15604 u2 = *s2;
15605 if (u1 != u2)
15606 return (u1 < u2) ? -1 : +1;
15607 if (u1 == '\0')
15608 return 0;
15609 s1++;
15610 s2++;
15611 }
15612 return 0;
15613}
15614
15615Py_UNICODE*
15616Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
15617{
15618 const Py_UNICODE *p;
15619 for (p = s; *p; p++)
15620 if (*p == c)
15621 return (Py_UNICODE*)p;
15622 return NULL;
15623}
15624
15625Py_UNICODE*
15626Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
15627{
15628 const Py_UNICODE *p;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +020015629 p = s + wcslen(s);
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015630 while (p != s) {
15631 p--;
15632 if (*p == c)
15633 return (Py_UNICODE*)p;
15634 }
15635 return NULL;
15636}
Victor Stinner331ea922010-08-10 16:37:20 +000015637
Victor Stinner71133ff2010-09-01 23:43:53 +000015638Py_UNICODE*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015639PyUnicode_AsUnicodeCopy(PyObject *unicode)
Victor Stinner71133ff2010-09-01 23:43:53 +000015640{
Victor Stinner577db2c2011-10-11 22:12:48 +020015641 Py_UNICODE *u, *copy;
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015642 Py_ssize_t len, size;
Victor Stinner71133ff2010-09-01 23:43:53 +000015643
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015644 if (!PyUnicode_Check(unicode)) {
15645 PyErr_BadArgument();
15646 return NULL;
15647 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015648 u = PyUnicode_AsUnicodeAndSize(unicode, &len);
Victor Stinner577db2c2011-10-11 22:12:48 +020015649 if (u == NULL)
15650 return NULL;
Victor Stinner71133ff2010-09-01 23:43:53 +000015651 /* Ensure we won't overflow the size. */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -070015652 if (len > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
Victor Stinner71133ff2010-09-01 23:43:53 +000015653 PyErr_NoMemory();
15654 return NULL;
15655 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015656 size = len + 1; /* copy the null character */
Victor Stinner71133ff2010-09-01 23:43:53 +000015657 size *= sizeof(Py_UNICODE);
15658 copy = PyMem_Malloc(size);
15659 if (copy == NULL) {
15660 PyErr_NoMemory();
15661 return NULL;
15662 }
Victor Stinner577db2c2011-10-11 22:12:48 +020015663 memcpy(copy, u, size);
Victor Stinner71133ff2010-09-01 23:43:53 +000015664 return copy;
15665}
Martin v. Löwis5b222132007-06-10 09:51:05 +000015666
Victor Stinnerfecc4f22019-03-19 14:20:29 +010015667
Victor Stinner709d23d2019-05-02 14:56:30 -040015668static int
15669encode_wstr_utf8(wchar_t *wstr, char **str, const char *name)
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015670{
Victor Stinner709d23d2019-05-02 14:56:30 -040015671 int res;
15672 res = _Py_EncodeUTF8Ex(wstr, str, NULL, NULL, 1, _Py_ERROR_STRICT);
15673 if (res == -2) {
15674 PyErr_Format(PyExc_RuntimeWarning, "cannot decode %s", name);
15675 return -1;
15676 }
15677 if (res < 0) {
15678 PyErr_NoMemory();
15679 return -1;
15680 }
15681 return 0;
15682}
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015683
Victor Stinner709d23d2019-05-02 14:56:30 -040015684
15685static int
15686config_get_codec_name(wchar_t **config_encoding)
15687{
15688 char *encoding;
15689 if (encode_wstr_utf8(*config_encoding, &encoding, "stdio_encoding") < 0) {
15690 return -1;
15691 }
15692
15693 PyObject *name_obj = NULL;
15694 PyObject *codec = _PyCodec_Lookup(encoding);
15695 PyMem_RawFree(encoding);
15696
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015697 if (!codec)
15698 goto error;
15699
15700 name_obj = PyObject_GetAttrString(codec, "name");
15701 Py_CLEAR(codec);
15702 if (!name_obj) {
15703 goto error;
15704 }
15705
Victor Stinner709d23d2019-05-02 14:56:30 -040015706 wchar_t *wname = PyUnicode_AsWideCharString(name_obj, NULL);
15707 Py_DECREF(name_obj);
15708 if (wname == NULL) {
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015709 goto error;
15710 }
15711
Victor Stinner709d23d2019-05-02 14:56:30 -040015712 wchar_t *raw_wname = _PyMem_RawWcsdup(wname);
15713 if (raw_wname == NULL) {
15714 PyMem_Free(wname);
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015715 PyErr_NoMemory();
Victor Stinner709d23d2019-05-02 14:56:30 -040015716 goto error;
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015717 }
Victor Stinner709d23d2019-05-02 14:56:30 -040015718
15719 PyMem_RawFree(*config_encoding);
15720 *config_encoding = raw_wname;
15721
15722 PyMem_Free(wname);
15723 return 0;
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015724
15725error:
15726 Py_XDECREF(codec);
15727 Py_XDECREF(name_obj);
Victor Stinner709d23d2019-05-02 14:56:30 -040015728 return -1;
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015729}
15730
15731
Victor Stinner331a6a52019-05-27 16:39:22 +020015732static PyStatus
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015733init_stdio_encoding(PyInterpreterState *interp)
15734{
Victor Stinner709d23d2019-05-02 14:56:30 -040015735 /* Update the stdio encoding to the normalized Python codec name. */
Victor Stinner331a6a52019-05-27 16:39:22 +020015736 PyConfig *config = &interp->config;
Victor Stinner709d23d2019-05-02 14:56:30 -040015737 if (config_get_codec_name(&config->stdio_encoding) < 0) {
Victor Stinner331a6a52019-05-27 16:39:22 +020015738 return _PyStatus_ERR("failed to get the Python codec name "
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015739 "of the stdio encoding");
15740 }
Victor Stinner331a6a52019-05-27 16:39:22 +020015741 return _PyStatus_OK();
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015742}
15743
15744
Victor Stinner709d23d2019-05-02 14:56:30 -040015745static int
15746init_fs_codec(PyInterpreterState *interp)
15747{
Victor Stinner331a6a52019-05-27 16:39:22 +020015748 PyConfig *config = &interp->config;
Victor Stinner709d23d2019-05-02 14:56:30 -040015749
15750 _Py_error_handler error_handler;
15751 error_handler = get_error_handler_wide(config->filesystem_errors);
15752 if (error_handler == _Py_ERROR_UNKNOWN) {
15753 PyErr_SetString(PyExc_RuntimeError, "unknow filesystem error handler");
15754 return -1;
15755 }
15756
15757 char *encoding, *errors;
15758 if (encode_wstr_utf8(config->filesystem_encoding,
15759 &encoding,
15760 "filesystem_encoding") < 0) {
15761 return -1;
15762 }
15763
15764 if (encode_wstr_utf8(config->filesystem_errors,
15765 &errors,
15766 "filesystem_errors") < 0) {
15767 PyMem_RawFree(encoding);
15768 return -1;
15769 }
15770
15771 PyMem_RawFree(interp->fs_codec.encoding);
15772 interp->fs_codec.encoding = encoding;
15773 PyMem_RawFree(interp->fs_codec.errors);
15774 interp->fs_codec.errors = errors;
15775 interp->fs_codec.error_handler = error_handler;
15776
15777 /* At this point, PyUnicode_EncodeFSDefault() and
15778 PyUnicode_DecodeFSDefault() can now use the Python codec rather than
15779 the C implementation of the filesystem encoding. */
15780
15781 /* Set Py_FileSystemDefaultEncoding and Py_FileSystemDefaultEncodeErrors
15782 global configuration variables. */
15783 if (_Py_SetFileSystemEncoding(interp->fs_codec.encoding,
15784 interp->fs_codec.errors) < 0) {
15785 PyErr_NoMemory();
15786 return -1;
15787 }
15788 return 0;
15789}
15790
15791
Victor Stinner331a6a52019-05-27 16:39:22 +020015792static PyStatus
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015793init_fs_encoding(PyInterpreterState *interp)
15794{
Victor Stinner709d23d2019-05-02 14:56:30 -040015795 /* Update the filesystem encoding to the normalized Python codec name.
15796 For example, replace "ANSI_X3.4-1968" (locale encoding) with "ascii"
15797 (Python codec name). */
Victor Stinner331a6a52019-05-27 16:39:22 +020015798 PyConfig *config = &interp->config;
Victor Stinner709d23d2019-05-02 14:56:30 -040015799 if (config_get_codec_name(&config->filesystem_encoding) < 0) {
Victor Stinner331a6a52019-05-27 16:39:22 +020015800 return _PyStatus_ERR("failed to get the Python codec "
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015801 "of the filesystem encoding");
15802 }
15803
Victor Stinner709d23d2019-05-02 14:56:30 -040015804 if (init_fs_codec(interp) < 0) {
Victor Stinner331a6a52019-05-27 16:39:22 +020015805 return _PyStatus_ERR("cannot initialize filesystem codec");
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015806 }
Victor Stinner331a6a52019-05-27 16:39:22 +020015807 return _PyStatus_OK();
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015808}
15809
15810
Victor Stinner331a6a52019-05-27 16:39:22 +020015811PyStatus
Victor Stinnerb45d2592019-06-20 00:05:23 +020015812_PyUnicode_InitEncodings(PyThreadState *tstate)
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015813{
Victor Stinnerb45d2592019-06-20 00:05:23 +020015814 PyInterpreterState *interp = tstate->interp;
15815
Victor Stinner331a6a52019-05-27 16:39:22 +020015816 PyStatus status = init_fs_encoding(interp);
15817 if (_PyStatus_EXCEPTION(status)) {
15818 return status;
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015819 }
15820
15821 return init_stdio_encoding(interp);
15822}
15823
15824
#ifdef MS_WINDOWS
/* Switch the filesystem encoding of the current interpreter to
   mbcs/replace and re-run init_fs_codec().  Returns the result of
   init_fs_codec(), or -1 with MemoryError set when the new setting
   strings cannot be allocated (the configuration is then unchanged). */
int
_PyUnicode_EnableLegacyWindowsFSEncoding(void)
{
    PyInterpreterState *interp = _PyInterpreterState_GET_UNSAFE();
    PyConfig *config = &interp->config;

    /* Set the filesystem encoding to mbcs/replace (PEP 529) */
    wchar_t *new_encoding = _PyMem_RawWcsdup(L"mbcs");
    wchar_t *new_errors = _PyMem_RawWcsdup(L"replace");
    if (new_encoding == NULL || new_errors == NULL) {
        /* Release whichever of the two was allocated. */
        PyMem_RawFree(new_encoding);
        PyMem_RawFree(new_errors);
        PyErr_NoMemory();
        return -1;
    }

    PyMem_RawFree(config->filesystem_encoding);
    config->filesystem_encoding = new_encoding;
    PyMem_RawFree(config->filesystem_errors);
    config->filesystem_errors = new_errors;

    return init_fs_codec(interp);
}
#endif
15850
15851
Victor Stinnerfecc4f22019-03-19 14:20:29 +010015852void
15853_PyUnicode_Fini(void)
15854{
15855#if defined(WITH_VALGRIND) || defined(__INSURE__)
15856 /* Insure++ is a memory analysis tool that aids in discovering
15857 * memory leaks and other memory problems. On Python exit, the
15858 * interned string dictionaries are flagged as being in use at exit
15859 * (which it is). Under normal circumstances, this is fine because
15860 * the memory will be automatically reclaimed by the system. Under
15861 * memory debugging, it's a huge source of useless noise, so we
15862 * trade off slower shutdown for less distraction in the memory
15863 * reports. -baw
15864 */
15865 unicode_release_interned();
15866#endif /* __INSURE__ */
15867
15868 Py_CLEAR(unicode_empty);
15869
15870 for (Py_ssize_t i = 0; i < 256; i++) {
15871 Py_CLEAR(unicode_latin1[i]);
15872 }
15873 _PyUnicode_ClearStaticStrings();
15874 (void)PyUnicode_ClearFreeList();
Victor Stinner709d23d2019-05-02 14:56:30 -040015875
15876 PyInterpreterState *interp = _PyInterpreterState_GET_UNSAFE();
15877 PyMem_RawFree(interp->fs_codec.encoding);
15878 interp->fs_codec.encoding = NULL;
15879 PyMem_RawFree(interp->fs_codec.errors);
15880 interp->fs_codec.errors = NULL;
Victor Stinnerfecc4f22019-03-19 14:20:29 +010015881}
15882
15883
Georg Brandl66c221e2010-10-14 07:04:07 +000015884/* A _string module, to export formatter_parser and formatter_field_name_split
15885 to the string.Formatter class implemented in Python. */
15886
15887static PyMethodDef _string_methods[] = {
15888 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
15889 METH_O, PyDoc_STR("split the argument as a field name")},
15890 {"formatter_parser", (PyCFunction) formatter_parser,
15891 METH_O, PyDoc_STR("parse the argument as a format string")},
15892 {NULL, NULL}
15893};
15894
15895static struct PyModuleDef _string_module = {
15896 PyModuleDef_HEAD_INIT,
15897 "_string",
15898 PyDoc_STR("string helper module"),
15899 0,
15900 _string_methods,
15901 NULL,
15902 NULL,
15903 NULL,
15904 NULL
15905};
15906
15907PyMODINIT_FUNC
15908PyInit__string(void)
15909{
15910 return PyModule_Create(&_string_module);
15911}
15912
15913
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000015914#ifdef __cplusplus
15915}
15916#endif