blob: 5545eae79505a380426f8947f886d3a8f1e266bc [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Benjamin Peterson31616ea2011-10-01 00:11:09 -04004modified by Marc-Andre Lemburg <mal@lemburg.com>.
Guido van Rossumd57fd912000-03-10 22:53:23 +00005
Thomas Wouters477c8d52006-05-27 19:21:47 +00006Major speed upgrades to the method implementations at the Reykjavik
7NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8
Guido van Rossum16b1ad92000-08-03 16:24:25 +00009Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000010
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000011--------------------------------------------------------------------
12The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000013
Benjamin Peterson29060642009-01-31 22:14:21 +000014 Copyright (c) 1999 by Secret Labs AB
15 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000016
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000017By obtaining, using, and/or copying this software and/or its
18associated documentation, you agree that you have read, understood,
19and will comply with the following terms and conditions:
20
21Permission to use, copy, modify, and distribute this software and its
22associated documentation for any purpose and without fee is hereby
23granted, provided that the above copyright notice appears in all
24copies, and that both that copyright notice and this permission notice
25appear in supporting documentation, and that the name of Secret Labs
26AB or the author not be used in advertising or publicity pertaining to
27distribution of the software without specific, written prior
28permission.
29
30SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37--------------------------------------------------------------------
38
39*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000040
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Victor Stinner331a6a52019-05-27 16:39:22 +020043#include "pycore_initconfig.h"
Victor Stinner9fc57a32018-11-07 00:44:03 +010044#include "pycore_fileutils.h"
Victor Stinnerbcda8f12018-11-21 22:27:47 +010045#include "pycore_object.h"
Victor Stinner43fc3bb2019-05-02 11:54:20 -040046#include "pycore_pylifecycle.h"
Victor Stinner621cebe2018-11-12 16:53:38 +010047#include "pycore_pystate.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000048#include "ucnhash.h"
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050049#include "bytes_methods.h"
Raymond Hettingerac2ef652015-07-04 16:04:44 -070050#include "stringlib/eq.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000051
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000052#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000053#include <windows.h>
54#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000055
/* Uncomment to display statistics on interned strings at exit when
   using Valgrind or Insure++. */
58/* #define INTERNED_STATS 1 */
59
60
Larry Hastings61272b72014-01-07 12:41:53 -080061/*[clinic input]
INADA Naoki15f94592017-01-16 21:49:13 +090062class str "PyObject *" "&PyUnicode_Type"
Larry Hastings61272b72014-01-07 12:41:53 -080063[clinic start generated code]*/
INADA Naoki3ae20562017-01-16 20:41:20 +090064/*[clinic end generated code: output=da39a3ee5e6b4b0d input=4884c934de622cf6]*/
65
66/*[python input]
67class Py_UCS4_converter(CConverter):
68 type = 'Py_UCS4'
69 converter = 'convert_uc'
70
71 def converter_init(self):
72 if self.default is not unspecified:
73 self.c_default = ascii(self.default)
74 if len(self.c_default) > 4 or self.c_default[0] != "'":
75 self.c_default = hex(ord(self.default))
76
77[python start generated code]*/
78/*[python end generated code: output=da39a3ee5e6b4b0d input=88f5dd06cd8e7a61]*/
Larry Hastings44e2eaa2013-11-23 15:37:55 -080079
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000080/* --- Globals ------------------------------------------------------------
81
Serhiy Storchaka05997252013-01-26 12:14:02 +020082NOTE: In the interpreter's initialization phase, some globals are currently
83 initialized dynamically as needed. In the process Unicode objects may
84 be created before the Unicode type is ready.
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000085
86*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000087
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000088
89#ifdef __cplusplus
90extern "C" {
91#endif
92
/* Highest valid Unicode code point: U+10FFFF (1,114,111 decimal). */
#define MAX_UNICODE 0x10ffff

/* Type check used by the internal accessor macros.  Debug builds run the
   full structural consistency check (without scanning the content);
   release builds only verify that the object is a str. */
#if defined(Py_DEBUG)
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
#else
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#endif
Victor Stinnerfb5f5f22011-09-28 21:39:49 +0200101
/* --- Raw field accessors -------------------------------------------------
   The underscore-prefixed macros below only cast and dereference; unless
   noted otherwise they perform no checking of the object. */

/* Cached UTF-8 buffer pointer stored in the PyCompactUnicodeObject part. */
#define _PyUnicode_UTF8(op) (((PyCompactUnicodeObject*)(op))->utf8)

/* UTF-8 buffer of a ready string.  For compact ASCII strings this is the
   memory located right after the PyASCIIObject header; otherwise it is the
   stored utf8 pointer. */
#define PyUnicode_UTF8(op)                              \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((char*)((PyASCIIObject*)(op) + 1)) :          \
         _PyUnicode_UTF8(op))

/* Stored length of the cached UTF-8 buffer. */
#define _PyUnicode_UTF8_LENGTH(op) (((PyCompactUnicodeObject*)(op))->utf8_length)

/* UTF-8 length of a ready string.  For compact ASCII strings this is the
   length field of the header; otherwise it is the stored utf8_length. */
#define PyUnicode_UTF8_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((PyASCIIObject*)(op))->length :               \
         _PyUnicode_UTF8_LENGTH(op))

/* wchar_t representation pointer and its length. */
#define _PyUnicode_WSTR(op) (((PyASCIIObject*)(op))->wstr)
#define _PyUnicode_WSTR_LENGTH(op) (((PyCompactUnicodeObject*)(op))->wstr_length)

/* Header fields shared by every string layout. */
#define _PyUnicode_LENGTH(op) (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op) (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op) (((PyASCIIObject *)(op))->hash)

/* Checked variants: assert the object type before reading the field. */
#define _PyUnicode_KIND(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->state.kind)
#define _PyUnicode_GET_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->length)

/* Data pointer stored in the PyUnicodeObject part of the structure. */
#define _PyUnicode_DATA_ANY(op) (((PyUnicodeObject*)(op))->data.any)

/* File-local replacement for PyUnicode_READY(): evaluates to 0 when the
   string is already ready, otherwise to the result of _PyUnicode_Ready(). */
#undef PyUnicode_READY
#define PyUnicode_READY(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     (PyUnicode_IS_READY(op) ? 0 : _PyUnicode_Ready(op)))
Victor Stinner910337b2011-10-03 03:20:16 +0200143
/* True if the cached UTF-8 pointer of `op` is the very same pointer as its
   character data, i.e. no separate UTF-8 buffer exists.  Must not be used
   on compact ASCII strings (asserted). */
#define _PyUnicode_SHARE_UTF8(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)),           \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))

/* True if the wchar_t pointer of `op` is the very same pointer as its
   character data.  The macro argument is used consistently so that the
   macro works no matter how the caller named its variable (it used to
   reference a caller-side variable called `unicode`). */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
151
/* Non-zero when the string has a UTF-8 buffer of its own: the string is not
   compact ASCII, the utf8 pointer is set, and that pointer is different
   from the character data pointer (i.e. not shared with it). */
#define _PyUnicode_HAS_UTF8_MEMORY(op)                  \
    ((!PyUnicode_IS_COMPACT_ASCII(op)                   \
      && _PyUnicode_UTF8(op)                            \
      && _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))

/* Non-zero when the string has a wchar_t buffer of its own: the wstr
   pointer is set and either the string is not ready yet or the pointer is
   different from the character data pointer (i.e. not shared with it). */
#define _PyUnicode_HAS_WSTR_MEMORY(op)                  \
    ((_PyUnicode_WSTR(op) &&                            \
      (!PyUnicode_IS_READY(op) ||                       \
       _PyUnicode_WSTR(op) != PyUnicode_DATA(op))))
165
/* Generic helper macro to convert characters of different types.

   from_type and to_type have to be valid type names.  begin and end are
   pointers to the source characters (first element and one past the last
   element) and should be of type "from_type *" or "const from_type *".
   to is a pointer of type "to_type *" and points to the buffer where the
   result characters are written; it must have room for (end - begin)
   elements.

   The source pointers are cast to pointer-to-const so that constness of the
   input is never discarded, and every local of the macro carries a leading
   underscore so that it cannot shadow a caller variable such as `n`.

   The bulk of the input is copied four elements per iteration; the
   remaining 0..3 elements are copied one by one. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                \
        to_type *_to = (to_type *)(to);                 \
        const from_type *_iter = (const from_type *)(begin); \
        const from_type *_end = (const from_type *)(end);    \
        Py_ssize_t _n = (_end) - (_iter);               \
        const from_type *_unrolled_end =                \
            _iter + _Py_SIZE_ROUND_DOWN(_n, 4);         \
        while (_iter < (_unrolled_end)) {               \
            _to[0] = (to_type) _iter[0];                \
            _to[1] = (to_type) _iter[1];                \
            _to[2] = (to_type) _iter[2];                \
            _to[3] = (to_type) _iter[3];                \
            _iter += 4; _to += 4;                       \
        }                                               \
        while (_iter < (_end))                          \
            *_to++ = (to_type) *_iter++;                \
    } while (0)
Victor Stinner829c0ad2011-10-03 01:08:02 +0200189
/* Platform-specific overallocation factor. */
#if defined(MS_WINDOWS)
   /* Windows: overallocating by 50% gives the best results. */
#  define OVERALLOCATE_FACTOR 2
#else
   /* Linux: overallocating by 25% gives the best results. */
#  define OVERALLOCATE_FACTOR 4
#endif
197
Walter Dörwald16807132007-05-25 13:52:07 +0000198/* This dictionary holds all interned unicode strings. Note that references
199 to strings in this dictionary are *not* counted in the string's ob_refcnt.
200 When the interned string reaches a refcnt of 0 the string deallocation
201 function will delete the reference from this dictionary.
202
203 Another way to look at this is that to say that the actual reference
Guido van Rossum98297ee2007-11-06 21:34:58 +0000204 count of a string is: s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000205*/
Serhiy Storchaka05997252013-01-26 12:14:02 +0200206static PyObject *interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +0000207
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000208/* The empty Unicode object is shared to improve performance. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200209static PyObject *unicode_empty = NULL;
Serhiy Storchaka05997252013-01-26 12:14:02 +0200210
Serhiy Storchaka678db842013-01-26 12:16:36 +0200211#define _Py_INCREF_UNICODE_EMPTY() \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200212 do { \
213 if (unicode_empty != NULL) \
214 Py_INCREF(unicode_empty); \
215 else { \
Serhiy Storchaka678db842013-01-26 12:16:36 +0200216 unicode_empty = PyUnicode_New(0, 0); \
217 if (unicode_empty != NULL) { \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200218 Py_INCREF(unicode_empty); \
Serhiy Storchaka678db842013-01-26 12:16:36 +0200219 assert(_PyUnicode_CheckConsistency(unicode_empty, 1)); \
220 } \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200221 } \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200222 } while (0)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000223
Serhiy Storchaka678db842013-01-26 12:16:36 +0200224#define _Py_RETURN_UNICODE_EMPTY() \
225 do { \
226 _Py_INCREF_UNICODE_EMPTY(); \
227 return unicode_empty; \
228 } while (0)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000229
Victor Stinner59423e32018-11-26 13:40:01 +0100230static inline void
231unicode_fill(enum PyUnicode_Kind kind, void *data, Py_UCS4 value,
232 Py_ssize_t start, Py_ssize_t length)
233{
234 assert(0 <= start);
235 assert(kind != PyUnicode_WCHAR_KIND);
236 switch (kind) {
237 case PyUnicode_1BYTE_KIND: {
Victor Stinner163403a2018-11-27 12:41:17 +0100238 assert(value <= 0xff);
Victor Stinner59423e32018-11-26 13:40:01 +0100239 Py_UCS1 ch = (unsigned char)value;
240 Py_UCS1 *to = (Py_UCS1 *)data + start;
241 memset(to, ch, length);
242 break;
243 }
244 case PyUnicode_2BYTE_KIND: {
Victor Stinner163403a2018-11-27 12:41:17 +0100245 assert(value <= 0xffff);
Victor Stinner59423e32018-11-26 13:40:01 +0100246 Py_UCS2 ch = (Py_UCS2)value;
247 Py_UCS2 *to = (Py_UCS2 *)data + start;
248 const Py_UCS2 *end = to + length;
249 for (; to < end; ++to) *to = ch;
250 break;
251 }
252 case PyUnicode_4BYTE_KIND: {
Victor Stinner163403a2018-11-27 12:41:17 +0100253 assert(value <= MAX_UNICODE);
Victor Stinner59423e32018-11-26 13:40:01 +0100254 Py_UCS4 ch = value;
255 Py_UCS4 * to = (Py_UCS4 *)data + start;
256 const Py_UCS4 *end = to + length;
257 for (; to < end; ++to) *to = ch;
258 break;
259 }
260 default: Py_UNREACHABLE();
261 }
262}
263
264
Victor Stinner8a1a6cf2013-04-14 02:35:33 +0200265/* Forward declaration */
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700266static inline int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +0200267_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch);
Inada Naoki770847a2019-06-24 12:30:24 +0900268static inline void
269_PyUnicodeWriter_InitWithBuffer(_PyUnicodeWriter *writer, PyObject *buffer);
Victor Stinner709d23d2019-05-02 14:56:30 -0400270static PyObject *
271unicode_encode_utf8(PyObject *unicode, _Py_error_handler error_handler,
272 const char *errors);
273static PyObject *
274unicode_decode_utf8(const char *s, Py_ssize_t size,
275 _Py_error_handler error_handler, const char *errors,
276 Py_ssize_t *consumed);
Victor Stinner8a1a6cf2013-04-14 02:35:33 +0200277
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200278/* List of static strings. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200279static _Py_Identifier *static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200280
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000281/* Single character Unicode strings in the Latin-1 range are being
282 shared as well. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200283static PyObject *unicode_latin1[256] = {NULL};
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000284
/* Fast detection of the most frequent whitespace characters.

   The table has one entry per ASCII code point (0..127); an entry is 1 for
   the characters listed below and 0 for every other code point. */
const unsigned char _Py_ascii_whitespace[128] = {
    [0x09] = 1,     /* CHARACTER TABULATION */
    [0x0A] = 1,     /* LINE FEED */
    [0x0B] = 1,     /* LINE TABULATION */
    [0x0C] = 1,     /* FORM FEED */
    [0x0D] = 1,     /* CARRIAGE RETURN */
    [0x1C] = 1,     /* FILE SEPARATOR */
    [0x1D] = 1,     /* GROUP SEPARATOR */
    [0x1E] = 1,     /* RECORD SEPARATOR */
    [0x1F] = 1,     /* UNIT SEPARATOR */
    [0x20] = 1      /* SPACE */
};
315
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200316/* forward */
Victor Stinnerfe226c02011-10-03 03:52:20 +0200317static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200318static PyObject* get_latin1_char(unsigned char ch);
Victor Stinner488fa492011-12-12 00:01:39 +0100319static int unicode_modifiable(PyObject *unicode);
320
Victor Stinnerfe226c02011-10-03 03:52:20 +0200321
Alexander Belopolsky40018472011-02-26 01:02:56 +0000322static PyObject *
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100323_PyUnicode_FromUCS1(const Py_UCS1 *s, Py_ssize_t size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200324static PyObject *
325_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
326static PyObject *
327_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
328
329static PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +0000330unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000331 PyObject **errorHandler,const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +0100332 PyObject *unicode, PyObject **exceptionObject,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000333 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
334
Alexander Belopolsky40018472011-02-26 01:02:56 +0000335static void
336raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300337 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +0100338 PyObject *unicode,
339 Py_ssize_t startpos, Py_ssize_t endpos,
340 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000341
/* Fast detection of line break characters, in the same way as the
   whitespace table.

   The table has one entry per ASCII code point (0..127); an entry is 1 for
   the characters listed below and 0 for every other code point. */
static const unsigned char ascii_linebreak[128] = {
    [0x0A] = 1,     /* LINE FEED */
    [0x0B] = 1,     /* LINE TABULATION */
    [0x0C] = 1,     /* FORM FEED */
    [0x0D] = 1,     /* CARRIAGE RETURN */
    [0x1C] = 1,     /* FILE SEPARATOR */
    [0x1D] = 1,     /* GROUP SEPARATOR */
    [0x1E] = 1      /* RECORD SEPARATOR */
};
369
INADA Naoki3ae20562017-01-16 20:41:20 +0900370static int convert_uc(PyObject *obj, void *addr);
371
Serhiy Storchaka1009bf12015-04-03 23:53:51 +0300372#include "clinic/unicodeobject.c.h"
373
Victor Stinner3d4226a2018-08-29 22:21:32 +0200374_Py_error_handler
375_Py_GetErrorHandler(const char *errors)
Victor Stinner50149202015-09-22 00:26:54 +0200376{
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200377 if (errors == NULL || strcmp(errors, "strict") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200378 return _Py_ERROR_STRICT;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200379 }
380 if (strcmp(errors, "surrogateescape") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200381 return _Py_ERROR_SURROGATEESCAPE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200382 }
383 if (strcmp(errors, "replace") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200384 return _Py_ERROR_REPLACE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200385 }
386 if (strcmp(errors, "ignore") == 0) {
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200387 return _Py_ERROR_IGNORE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200388 }
389 if (strcmp(errors, "backslashreplace") == 0) {
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200390 return _Py_ERROR_BACKSLASHREPLACE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200391 }
392 if (strcmp(errors, "surrogatepass") == 0) {
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200393 return _Py_ERROR_SURROGATEPASS;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200394 }
395 if (strcmp(errors, "xmlcharrefreplace") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200396 return _Py_ERROR_XMLCHARREFREPLACE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200397 }
Victor Stinner50149202015-09-22 00:26:54 +0200398 return _Py_ERROR_OTHER;
399}
400
Victor Stinner709d23d2019-05-02 14:56:30 -0400401
402static _Py_error_handler
403get_error_handler_wide(const wchar_t *errors)
404{
405 if (errors == NULL || wcscmp(errors, L"strict") == 0) {
406 return _Py_ERROR_STRICT;
407 }
408 if (wcscmp(errors, L"surrogateescape") == 0) {
409 return _Py_ERROR_SURROGATEESCAPE;
410 }
411 if (wcscmp(errors, L"replace") == 0) {
412 return _Py_ERROR_REPLACE;
413 }
414 if (wcscmp(errors, L"ignore") == 0) {
415 return _Py_ERROR_IGNORE;
416 }
417 if (wcscmp(errors, L"backslashreplace") == 0) {
418 return _Py_ERROR_BACKSLASHREPLACE;
419 }
420 if (wcscmp(errors, L"surrogatepass") == 0) {
421 return _Py_ERROR_SURROGATEPASS;
422 }
423 if (wcscmp(errors, L"xmlcharrefreplace") == 0) {
424 return _Py_ERROR_XMLCHARREFREPLACE;
425 }
426 return _Py_ERROR_OTHER;
427}
428
429
Victor Stinner22eb6892019-06-26 00:51:05 +0200430static inline int
431unicode_check_encoding_errors(const char *encoding, const char *errors)
432{
433 if (encoding == NULL && errors == NULL) {
434 return 0;
435 }
436
437 PyInterpreterState *interp = _PyInterpreterState_GET_UNSAFE();
438#ifndef Py_DEBUG
439 /* In release mode, only check in development mode (-X dev) */
440 if (!interp->config.dev_mode) {
441 return 0;
442 }
443#else
444 /* Always check in debug mode */
445#endif
446
447 /* Avoid calling _PyCodec_Lookup() and PyCodec_LookupError() before the
448 codec registry is ready: before_PyUnicode_InitEncodings() is called. */
449 if (!interp->fs_codec.encoding) {
450 return 0;
451 }
452
453 if (encoding != NULL) {
454 PyObject *handler = _PyCodec_Lookup(encoding);
455 if (handler == NULL) {
456 return -1;
457 }
458 Py_DECREF(handler);
459 }
460
461 if (errors != NULL) {
462 PyObject *handler = PyCodec_LookupError(errors);
463 if (handler == NULL) {
464 return -1;
465 }
466 Py_DECREF(handler);
467 }
468 return 0;
469}
470
471
Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300472/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
473 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000474Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000475PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000476{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000477#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000478 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000479#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000480 /* This is actually an illegal character, so it should
481 not be passed to unichr. */
482 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000483#endif
484}
485
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +0200486int
Victor Stinner7931d9a2011-11-04 00:22:48 +0100487_PyUnicode_CheckConsistency(PyObject *op, int check_content)
Victor Stinner910337b2011-10-03 03:20:16 +0200488{
489 PyASCIIObject *ascii;
490 unsigned int kind;
491
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200492 _PyObject_ASSERT(op, PyUnicode_Check(op));
Victor Stinner910337b2011-10-03 03:20:16 +0200493
494 ascii = (PyASCIIObject *)op;
495 kind = ascii->state.kind;
496
Victor Stinnera3b334d2011-10-03 13:53:37 +0200497 if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200498 _PyObject_ASSERT(op, kind == PyUnicode_1BYTE_KIND);
499 _PyObject_ASSERT(op, ascii->state.ready == 1);
Victor Stinner910337b2011-10-03 03:20:16 +0200500 }
Victor Stinnera41463c2011-10-04 01:05:08 +0200501 else {
Victor Stinner85041a52011-10-03 14:42:39 +0200502 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
Victor Stinner7f11ad42011-10-04 00:00:20 +0200503 void *data;
Victor Stinner910337b2011-10-03 03:20:16 +0200504
Victor Stinnera41463c2011-10-04 01:05:08 +0200505 if (ascii->state.compact == 1) {
506 data = compact + 1;
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200507 _PyObject_ASSERT(op, kind == PyUnicode_1BYTE_KIND
508 || kind == PyUnicode_2BYTE_KIND
509 || kind == PyUnicode_4BYTE_KIND);
510 _PyObject_ASSERT(op, ascii->state.ascii == 0);
511 _PyObject_ASSERT(op, ascii->state.ready == 1);
512 _PyObject_ASSERT(op, compact->utf8 != data);
Victor Stinnere30c0a12011-11-04 20:54:05 +0100513 }
514 else {
Victor Stinnera41463c2011-10-04 01:05:08 +0200515 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
516
517 data = unicode->data.any;
518 if (kind == PyUnicode_WCHAR_KIND) {
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200519 _PyObject_ASSERT(op, ascii->length == 0);
520 _PyObject_ASSERT(op, ascii->hash == -1);
521 _PyObject_ASSERT(op, ascii->state.compact == 0);
522 _PyObject_ASSERT(op, ascii->state.ascii == 0);
523 _PyObject_ASSERT(op, ascii->state.ready == 0);
524 _PyObject_ASSERT(op, ascii->state.interned == SSTATE_NOT_INTERNED);
525 _PyObject_ASSERT(op, ascii->wstr != NULL);
526 _PyObject_ASSERT(op, data == NULL);
527 _PyObject_ASSERT(op, compact->utf8 == NULL);
Victor Stinnera41463c2011-10-04 01:05:08 +0200528 }
529 else {
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200530 _PyObject_ASSERT(op, kind == PyUnicode_1BYTE_KIND
531 || kind == PyUnicode_2BYTE_KIND
532 || kind == PyUnicode_4BYTE_KIND);
533 _PyObject_ASSERT(op, ascii->state.compact == 0);
534 _PyObject_ASSERT(op, ascii->state.ready == 1);
535 _PyObject_ASSERT(op, data != NULL);
Victor Stinnera41463c2011-10-04 01:05:08 +0200536 if (ascii->state.ascii) {
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200537 _PyObject_ASSERT(op, compact->utf8 == data);
538 _PyObject_ASSERT(op, compact->utf8_length == ascii->length);
Victor Stinnera41463c2011-10-04 01:05:08 +0200539 }
540 else
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200541 _PyObject_ASSERT(op, compact->utf8 != data);
Victor Stinnera41463c2011-10-04 01:05:08 +0200542 }
543 }
544 if (kind != PyUnicode_WCHAR_KIND) {
Victor Stinner7f11ad42011-10-04 00:00:20 +0200545 if (
546#if SIZEOF_WCHAR_T == 2
547 kind == PyUnicode_2BYTE_KIND
548#else
549 kind == PyUnicode_4BYTE_KIND
550#endif
551 )
Victor Stinnera41463c2011-10-04 01:05:08 +0200552 {
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200553 _PyObject_ASSERT(op, ascii->wstr == data);
554 _PyObject_ASSERT(op, compact->wstr_length == ascii->length);
Victor Stinnera41463c2011-10-04 01:05:08 +0200555 } else
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200556 _PyObject_ASSERT(op, ascii->wstr != data);
Victor Stinner910337b2011-10-03 03:20:16 +0200557 }
Victor Stinnera41463c2011-10-04 01:05:08 +0200558
559 if (compact->utf8 == NULL)
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200560 _PyObject_ASSERT(op, compact->utf8_length == 0);
Victor Stinnera41463c2011-10-04 01:05:08 +0200561 if (ascii->wstr == NULL)
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200562 _PyObject_ASSERT(op, compact->wstr_length == 0);
Victor Stinner910337b2011-10-03 03:20:16 +0200563 }
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200564
565 /* check that the best kind is used: O(n) operation */
566 if (check_content && kind != PyUnicode_WCHAR_KIND) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200567 Py_ssize_t i;
568 Py_UCS4 maxchar = 0;
Victor Stinner718fbf02012-04-26 00:39:37 +0200569 void *data;
570 Py_UCS4 ch;
571
572 data = PyUnicode_DATA(ascii);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200573 for (i=0; i < ascii->length; i++)
574 {
Victor Stinner718fbf02012-04-26 00:39:37 +0200575 ch = PyUnicode_READ(kind, data, i);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200576 if (ch > maxchar)
577 maxchar = ch;
578 }
579 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinner77faf692011-11-20 18:56:05 +0100580 if (ascii->state.ascii == 0) {
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200581 _PyObject_ASSERT(op, maxchar >= 128);
582 _PyObject_ASSERT(op, maxchar <= 255);
Victor Stinner77faf692011-11-20 18:56:05 +0100583 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200584 else
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200585 _PyObject_ASSERT(op, maxchar < 128);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200586 }
Victor Stinner77faf692011-11-20 18:56:05 +0100587 else if (kind == PyUnicode_2BYTE_KIND) {
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200588 _PyObject_ASSERT(op, maxchar >= 0x100);
589 _PyObject_ASSERT(op, maxchar <= 0xFFFF);
Victor Stinner77faf692011-11-20 18:56:05 +0100590 }
591 else {
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200592 _PyObject_ASSERT(op, maxchar >= 0x10000);
593 _PyObject_ASSERT(op, maxchar <= MAX_UNICODE);
Victor Stinner77faf692011-11-20 18:56:05 +0100594 }
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200595 _PyObject_ASSERT(op, PyUnicode_READ(kind, data, ascii->length) == 0);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200596 }
Benjamin Petersonccc51c12011-10-03 19:34:12 -0400597 return 1;
598}
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200599
Victor Stinner910337b2011-10-03 03:20:16 +0200600
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100601static PyObject*
602unicode_result_wchar(PyObject *unicode)
603{
604#ifndef Py_DEBUG
605 Py_ssize_t len;
606
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100607 len = _PyUnicode_WSTR_LENGTH(unicode);
608 if (len == 0) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100609 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200610 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100611 }
612
613 if (len == 1) {
614 wchar_t ch = _PyUnicode_WSTR(unicode)[0];
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100615 if ((Py_UCS4)ch < 256) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100616 PyObject *latin1_char = get_latin1_char((unsigned char)ch);
617 Py_DECREF(unicode);
618 return latin1_char;
619 }
620 }
621
622 if (_PyUnicode_Ready(unicode) < 0) {
Victor Stinneraa771272012-10-04 02:32:58 +0200623 Py_DECREF(unicode);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100624 return NULL;
625 }
626#else
Victor Stinneraa771272012-10-04 02:32:58 +0200627 assert(Py_REFCNT(unicode) == 1);
628
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100629 /* don't make the result ready in debug mode to ensure that the caller
630 makes the string ready before using it */
631 assert(_PyUnicode_CheckConsistency(unicode, 1));
632#endif
633 return unicode;
634}
635
636static PyObject*
637unicode_result_ready(PyObject *unicode)
638{
639 Py_ssize_t length;
640
641 length = PyUnicode_GET_LENGTH(unicode);
642 if (length == 0) {
643 if (unicode != unicode_empty) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100644 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200645 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100646 }
647 return unicode_empty;
648 }
649
650 if (length == 1) {
Victor Stinner69ed0f42013-04-09 21:48:24 +0200651 void *data = PyUnicode_DATA(unicode);
652 int kind = PyUnicode_KIND(unicode);
653 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100654 if (ch < 256) {
655 PyObject *latin1_char = unicode_latin1[ch];
656 if (latin1_char != NULL) {
657 if (unicode != latin1_char) {
658 Py_INCREF(latin1_char);
659 Py_DECREF(unicode);
660 }
661 return latin1_char;
662 }
663 else {
664 assert(_PyUnicode_CheckConsistency(unicode, 1));
665 Py_INCREF(unicode);
666 unicode_latin1[ch] = unicode;
667 return unicode;
668 }
669 }
670 }
671
672 assert(_PyUnicode_CheckConsistency(unicode, 1));
673 return unicode;
674}
675
676static PyObject*
677unicode_result(PyObject *unicode)
678{
679 assert(_PyUnicode_CHECK(unicode));
680 if (PyUnicode_IS_READY(unicode))
681 return unicode_result_ready(unicode);
682 else
683 return unicode_result_wchar(unicode);
684}
685
Victor Stinnerc4b49542011-12-11 22:44:26 +0100686static PyObject*
687unicode_result_unchanged(PyObject *unicode)
688{
689 if (PyUnicode_CheckExact(unicode)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -0500690 if (PyUnicode_READY(unicode) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +0100691 return NULL;
692 Py_INCREF(unicode);
693 return unicode;
694 }
695 else
696 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +0100697 return _PyUnicode_Copy(unicode);
Victor Stinnerc4b49542011-12-11 22:44:26 +0100698}
699
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200700/* Implementation of the "backslashreplace" error handler for 8-bit encodings:
701 ASCII, Latin1, UTF-8, etc. */
702static char*
Victor Stinnerad771582015-10-09 12:38:53 +0200703backslashreplace(_PyBytesWriter *writer, char *str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200704 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
705{
Victor Stinnerad771582015-10-09 12:38:53 +0200706 Py_ssize_t size, i;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200707 Py_UCS4 ch;
708 enum PyUnicode_Kind kind;
709 void *data;
710
711 assert(PyUnicode_IS_READY(unicode));
712 kind = PyUnicode_KIND(unicode);
713 data = PyUnicode_DATA(unicode);
714
715 size = 0;
716 /* determine replacement size */
717 for (i = collstart; i < collend; ++i) {
718 Py_ssize_t incr;
719
720 ch = PyUnicode_READ(kind, data, i);
721 if (ch < 0x100)
722 incr = 2+2;
723 else if (ch < 0x10000)
724 incr = 2+4;
725 else {
726 assert(ch <= MAX_UNICODE);
Victor Stinner3fa36ff2015-10-09 03:37:11 +0200727 incr = 2+8;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200728 }
729 if (size > PY_SSIZE_T_MAX - incr) {
730 PyErr_SetString(PyExc_OverflowError,
731 "encoded result is too long for a Python string");
732 return NULL;
733 }
734 size += incr;
735 }
736
Victor Stinnerad771582015-10-09 12:38:53 +0200737 str = _PyBytesWriter_Prepare(writer, str, size);
738 if (str == NULL)
739 return NULL;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200740
741 /* generate replacement */
742 for (i = collstart; i < collend; ++i) {
743 ch = PyUnicode_READ(kind, data, i);
Victor Stinner797485e2015-10-09 03:17:30 +0200744 *str++ = '\\';
745 if (ch >= 0x00010000) {
746 *str++ = 'U';
747 *str++ = Py_hexdigits[(ch>>28)&0xf];
748 *str++ = Py_hexdigits[(ch>>24)&0xf];
749 *str++ = Py_hexdigits[(ch>>20)&0xf];
750 *str++ = Py_hexdigits[(ch>>16)&0xf];
751 *str++ = Py_hexdigits[(ch>>12)&0xf];
752 *str++ = Py_hexdigits[(ch>>8)&0xf];
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200753 }
Victor Stinner797485e2015-10-09 03:17:30 +0200754 else if (ch >= 0x100) {
755 *str++ = 'u';
756 *str++ = Py_hexdigits[(ch>>12)&0xf];
757 *str++ = Py_hexdigits[(ch>>8)&0xf];
758 }
759 else
760 *str++ = 'x';
761 *str++ = Py_hexdigits[(ch>>4)&0xf];
762 *str++ = Py_hexdigits[ch&0xf];
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200763 }
764 return str;
765}
766
767/* Implementation of the "xmlcharrefreplace" error handler for 8-bit encodings:
768 ASCII, Latin1, UTF-8, etc. */
769static char*
Victor Stinnerad771582015-10-09 12:38:53 +0200770xmlcharrefreplace(_PyBytesWriter *writer, char *str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200771 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
772{
Victor Stinnerad771582015-10-09 12:38:53 +0200773 Py_ssize_t size, i;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200774 Py_UCS4 ch;
775 enum PyUnicode_Kind kind;
776 void *data;
777
778 assert(PyUnicode_IS_READY(unicode));
779 kind = PyUnicode_KIND(unicode);
780 data = PyUnicode_DATA(unicode);
781
782 size = 0;
783 /* determine replacement size */
784 for (i = collstart; i < collend; ++i) {
785 Py_ssize_t incr;
786
787 ch = PyUnicode_READ(kind, data, i);
788 if (ch < 10)
789 incr = 2+1+1;
790 else if (ch < 100)
791 incr = 2+2+1;
792 else if (ch < 1000)
793 incr = 2+3+1;
794 else if (ch < 10000)
795 incr = 2+4+1;
796 else if (ch < 100000)
797 incr = 2+5+1;
798 else if (ch < 1000000)
799 incr = 2+6+1;
800 else {
801 assert(ch <= MAX_UNICODE);
802 incr = 2+7+1;
803 }
804 if (size > PY_SSIZE_T_MAX - incr) {
805 PyErr_SetString(PyExc_OverflowError,
806 "encoded result is too long for a Python string");
807 return NULL;
808 }
809 size += incr;
810 }
811
Victor Stinnerad771582015-10-09 12:38:53 +0200812 str = _PyBytesWriter_Prepare(writer, str, size);
813 if (str == NULL)
814 return NULL;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200815
816 /* generate replacement */
817 for (i = collstart; i < collend; ++i) {
818 str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
819 }
820 return str;
821}
822
Thomas Wouters477c8d52006-05-27 19:21:47 +0000823/* --- Bloom Filters ----------------------------------------------------- */
824
/* Helpers implementing simple "bloom filters" for Unicode characters.
   To keep things simple, we use a single bitmask, and the low
   log2(BLOOM_WIDTH) bits (5, 6 or 7) of each Unicode character are used
   as the bit index. */
828
829/* the linebreak mask is set up by Unicode_Init below */
830
Antoine Pitrouf068f942010-01-13 14:19:12 +0000831#if LONG_BIT >= 128
832#define BLOOM_WIDTH 128
833#elif LONG_BIT >= 64
834#define BLOOM_WIDTH 64
835#elif LONG_BIT >= 32
836#define BLOOM_WIDTH 32
837#else
838#error "LONG_BIT is smaller than 32"
839#endif
840
Thomas Wouters477c8d52006-05-27 19:21:47 +0000841#define BLOOM_MASK unsigned long
842
Serhiy Storchaka05997252013-01-26 12:14:02 +0200843static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000844
Antoine Pitrouf068f942010-01-13 14:19:12 +0000845#define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000846
Benjamin Peterson29060642009-01-31 22:14:21 +0000847#define BLOOM_LINEBREAK(ch) \
848 ((ch) < 128U ? ascii_linebreak[(ch)] : \
849 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000850
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700851static inline BLOOM_MASK
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200852make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000853{
Victor Stinnera85af502013-04-09 21:53:54 +0200854#define BLOOM_UPDATE(TYPE, MASK, PTR, LEN) \
855 do { \
856 TYPE *data = (TYPE *)PTR; \
857 TYPE *end = data + LEN; \
858 Py_UCS4 ch; \
859 for (; data != end; data++) { \
860 ch = *data; \
861 MASK |= (1UL << (ch & (BLOOM_WIDTH - 1))); \
862 } \
863 break; \
864 } while (0)
865
Thomas Wouters477c8d52006-05-27 19:21:47 +0000866 /* calculate simple bloom-style bitmask for a given unicode string */
867
Antoine Pitrouf068f942010-01-13 14:19:12 +0000868 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000869
870 mask = 0;
Victor Stinnera85af502013-04-09 21:53:54 +0200871 switch (kind) {
872 case PyUnicode_1BYTE_KIND:
873 BLOOM_UPDATE(Py_UCS1, mask, ptr, len);
874 break;
875 case PyUnicode_2BYTE_KIND:
876 BLOOM_UPDATE(Py_UCS2, mask, ptr, len);
877 break;
878 case PyUnicode_4BYTE_KIND:
879 BLOOM_UPDATE(Py_UCS4, mask, ptr, len);
880 break;
881 default:
Barry Warsawb2e57942017-09-14 18:13:16 -0700882 Py_UNREACHABLE();
Victor Stinnera85af502013-04-09 21:53:54 +0200883 }
Thomas Wouters477c8d52006-05-27 19:21:47 +0000884 return mask;
Victor Stinnera85af502013-04-09 21:53:54 +0200885
886#undef BLOOM_UPDATE
Thomas Wouters477c8d52006-05-27 19:21:47 +0000887}
888
Serhiy Storchaka21a663e2016-04-13 15:37:23 +0300889static int
890ensure_unicode(PyObject *obj)
891{
892 if (!PyUnicode_Check(obj)) {
893 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +0200894 "must be str, not %.100s",
895 Py_TYPE(obj)->tp_name);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +0300896 return -1;
897 }
898 return PyUnicode_READY(obj);
899}
900
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200901/* Compilation of templated routines */
902
903#include "stringlib/asciilib.h"
904#include "stringlib/fastsearch.h"
905#include "stringlib/partition.h"
906#include "stringlib/split.h"
907#include "stringlib/count.h"
908#include "stringlib/find.h"
909#include "stringlib/find_max_char.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200910#include "stringlib/undef.h"
911
912#include "stringlib/ucs1lib.h"
913#include "stringlib/fastsearch.h"
914#include "stringlib/partition.h"
915#include "stringlib/split.h"
916#include "stringlib/count.h"
917#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300918#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200919#include "stringlib/find_max_char.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200920#include "stringlib/undef.h"
921
922#include "stringlib/ucs2lib.h"
923#include "stringlib/fastsearch.h"
924#include "stringlib/partition.h"
925#include "stringlib/split.h"
926#include "stringlib/count.h"
927#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300928#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200929#include "stringlib/find_max_char.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200930#include "stringlib/undef.h"
931
932#include "stringlib/ucs4lib.h"
933#include "stringlib/fastsearch.h"
934#include "stringlib/partition.h"
935#include "stringlib/split.h"
936#include "stringlib/count.h"
937#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300938#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200939#include "stringlib/find_max_char.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200940#include "stringlib/undef.h"
941
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200942#include "stringlib/unicodedefs.h"
943#include "stringlib/fastsearch.h"
944#include "stringlib/count.h"
945#include "stringlib/find.h"
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100946#include "stringlib/undef.h"
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200947
Guido van Rossumd57fd912000-03-10 22:53:23 +0000948/* --- Unicode Object ----------------------------------------------------- */
949
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700950static inline Py_ssize_t
951findchar(const void *s, int kind,
952 Py_ssize_t size, Py_UCS4 ch,
953 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200954{
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200955 switch (kind) {
956 case PyUnicode_1BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200957 if ((Py_UCS1) ch != ch)
958 return -1;
959 if (direction > 0)
960 return ucs1lib_find_char((Py_UCS1 *) s, size, (Py_UCS1) ch);
961 else
962 return ucs1lib_rfind_char((Py_UCS1 *) s, size, (Py_UCS1) ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200963 case PyUnicode_2BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200964 if ((Py_UCS2) ch != ch)
965 return -1;
966 if (direction > 0)
967 return ucs2lib_find_char((Py_UCS2 *) s, size, (Py_UCS2) ch);
968 else
969 return ucs2lib_rfind_char((Py_UCS2 *) s, size, (Py_UCS2) ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200970 case PyUnicode_4BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200971 if (direction > 0)
972 return ucs4lib_find_char((Py_UCS4 *) s, size, ch);
973 else
974 return ucs4lib_rfind_char((Py_UCS4 *) s, size, ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200975 default:
Barry Warsawb2e57942017-09-14 18:13:16 -0700976 Py_UNREACHABLE();
Victor Stinner9e7a1bc2011-10-13 00:18:12 +0200977 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200978}
979
#ifdef Py_DEBUG
/* Debug helper: overwrite the characters in the range
   [old_length, current length) of a string with 0xff bytes, so that code
   which forgets to initialize newly added characters is detected earlier.
   Nothing is written when the string did not grow.

   _PyUnicode_CheckConsistency(str, 1) detects invalid characters, at least
   for ASCII and UCS-4 strings.  U+00FF is invalid in ASCII and U+FFFFFFFF
   is an invalid character in Unicode 6.0. */
static void
unicode_fill_invalid(PyObject *unicode, Py_ssize_t old_length)
{
    const int char_size = PyUnicode_KIND(unicode);
    Py_UCS1 *bytes = PyUnicode_1BYTE_DATA(unicode);
    const Py_ssize_t new_length = _PyUnicode_LENGTH(unicode);

    if (new_length > old_length) {
        memset(bytes + old_length * char_size, 0xff,
               (new_length - old_length) * char_size);
    }
}
#endif
998
Victor Stinnerfe226c02011-10-03 03:52:20 +0200999static PyObject*
1000resize_compact(PyObject *unicode, Py_ssize_t length)
1001{
1002 Py_ssize_t char_size;
1003 Py_ssize_t struct_size;
1004 Py_ssize_t new_size;
1005 int share_wstr;
Victor Stinner84def372011-12-11 20:04:56 +01001006 PyObject *new_unicode;
Victor Stinnerafffce42012-10-03 23:03:17 +02001007#ifdef Py_DEBUG
1008 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
1009#endif
1010
Victor Stinner79891572012-05-03 13:43:07 +02001011 assert(unicode_modifiable(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001012 assert(PyUnicode_IS_READY(unicode));
Victor Stinner488fa492011-12-12 00:01:39 +01001013 assert(PyUnicode_IS_COMPACT(unicode));
1014
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001015 char_size = PyUnicode_KIND(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +01001016 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerfe226c02011-10-03 03:52:20 +02001017 struct_size = sizeof(PyASCIIObject);
1018 else
1019 struct_size = sizeof(PyCompactUnicodeObject);
Victor Stinnerc379ead2011-10-03 12:52:27 +02001020 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001021
Victor Stinnerfe226c02011-10-03 03:52:20 +02001022 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
1023 PyErr_NoMemory();
1024 return NULL;
1025 }
1026 new_size = (struct_size + (length + 1) * char_size);
1027
Serhiy Storchaka7aa69082015-12-03 01:02:03 +02001028 if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) {
1029 PyObject_DEL(_PyUnicode_UTF8(unicode));
1030 _PyUnicode_UTF8(unicode) = NULL;
1031 _PyUnicode_UTF8_LENGTH(unicode) = 0;
1032 }
Victor Stinner84def372011-12-11 20:04:56 +01001033 _Py_DEC_REFTOTAL;
1034 _Py_ForgetReference(unicode);
1035
Serhiy Storchaka20b39b22014-09-28 11:27:24 +03001036 new_unicode = (PyObject *)PyObject_REALLOC(unicode, new_size);
Victor Stinner84def372011-12-11 20:04:56 +01001037 if (new_unicode == NULL) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001038 _Py_NewReference(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001039 PyErr_NoMemory();
1040 return NULL;
1041 }
Victor Stinner84def372011-12-11 20:04:56 +01001042 unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001043 _Py_NewReference(unicode);
Victor Stinner84def372011-12-11 20:04:56 +01001044
Victor Stinnerfe226c02011-10-03 03:52:20 +02001045 _PyUnicode_LENGTH(unicode) = length;
Victor Stinnerc379ead2011-10-03 12:52:27 +02001046 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001047 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +01001048 if (!PyUnicode_IS_ASCII(unicode))
Victor Stinnerc379ead2011-10-03 12:52:27 +02001049 _PyUnicode_WSTR_LENGTH(unicode) = length;
1050 }
Victor Stinnerbbbac2e2013-02-07 23:12:46 +01001051 else if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) {
1052 PyObject_DEL(_PyUnicode_WSTR(unicode));
1053 _PyUnicode_WSTR(unicode) = NULL;
Victor Stinner5bc03a62016-01-27 16:56:53 +01001054 if (!PyUnicode_IS_ASCII(unicode))
1055 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Victor Stinnerbbbac2e2013-02-07 23:12:46 +01001056 }
Victor Stinnerafffce42012-10-03 23:03:17 +02001057#ifdef Py_DEBUG
1058 unicode_fill_invalid(unicode, old_length);
1059#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001060 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
1061 length, 0);
Victor Stinner79891572012-05-03 13:43:07 +02001062 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001063 return unicode;
1064}
1065
Alexander Belopolsky40018472011-02-26 01:02:56 +00001066static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001067resize_inplace(PyObject *unicode, Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001068{
Victor Stinner95663112011-10-04 01:03:50 +02001069 wchar_t *wstr;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001070 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001071 assert(!PyUnicode_IS_COMPACT(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001072 assert(Py_REFCNT(unicode) == 1);
Tim Petersced69f82003-09-16 20:30:58 +00001073
Victor Stinnerfe226c02011-10-03 03:52:20 +02001074 if (PyUnicode_IS_READY(unicode)) {
1075 Py_ssize_t char_size;
Victor Stinner1c8d0c72011-10-03 12:11:00 +02001076 int share_wstr, share_utf8;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001077 void *data;
Victor Stinnerafffce42012-10-03 23:03:17 +02001078#ifdef Py_DEBUG
1079 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
1080#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001081
1082 data = _PyUnicode_DATA_ANY(unicode);
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001083 char_size = PyUnicode_KIND(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +02001084 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
1085 share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001086
1087 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
1088 PyErr_NoMemory();
1089 return -1;
1090 }
1091 new_size = (length + 1) * char_size;
1092
Victor Stinner7a9105a2011-12-12 00:13:42 +01001093 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
1094 {
1095 PyObject_DEL(_PyUnicode_UTF8(unicode));
1096 _PyUnicode_UTF8(unicode) = NULL;
1097 _PyUnicode_UTF8_LENGTH(unicode) = 0;
1098 }
1099
Victor Stinnerfe226c02011-10-03 03:52:20 +02001100 data = (PyObject *)PyObject_REALLOC(data, new_size);
1101 if (data == NULL) {
1102 PyErr_NoMemory();
1103 return -1;
1104 }
1105 _PyUnicode_DATA_ANY(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +02001106 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001107 _PyUnicode_WSTR(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +02001108 _PyUnicode_WSTR_LENGTH(unicode) = length;
1109 }
1110 if (share_utf8) {
Victor Stinner1c8d0c72011-10-03 12:11:00 +02001111 _PyUnicode_UTF8(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +02001112 _PyUnicode_UTF8_LENGTH(unicode) = length;
1113 }
Victor Stinnerfe226c02011-10-03 03:52:20 +02001114 _PyUnicode_LENGTH(unicode) = length;
1115 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
Victor Stinnerafffce42012-10-03 23:03:17 +02001116#ifdef Py_DEBUG
1117 unicode_fill_invalid(unicode, old_length);
1118#endif
Victor Stinner95663112011-10-04 01:03:50 +02001119 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001120 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001121 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001122 }
Victor Stinnerfe226c02011-10-03 03:52:20 +02001123 }
Victor Stinner95663112011-10-04 01:03:50 +02001124 assert(_PyUnicode_WSTR(unicode) != NULL);
1125
1126 /* check for integer overflow */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -07001127 if (length > PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) - 1) {
Victor Stinner95663112011-10-04 01:03:50 +02001128 PyErr_NoMemory();
1129 return -1;
1130 }
Victor Stinner7a9105a2011-12-12 00:13:42 +01001131 new_size = sizeof(wchar_t) * (length + 1);
Victor Stinner95663112011-10-04 01:03:50 +02001132 wstr = _PyUnicode_WSTR(unicode);
Victor Stinner7a9105a2011-12-12 00:13:42 +01001133 wstr = PyObject_REALLOC(wstr, new_size);
Victor Stinner95663112011-10-04 01:03:50 +02001134 if (!wstr) {
1135 PyErr_NoMemory();
1136 return -1;
1137 }
1138 _PyUnicode_WSTR(unicode) = wstr;
1139 _PyUnicode_WSTR(unicode)[length] = 0;
1140 _PyUnicode_WSTR_LENGTH(unicode) = length;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001141 assert(_PyUnicode_CheckConsistency(unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001142 return 0;
1143}
1144
Victor Stinnerfe226c02011-10-03 03:52:20 +02001145static PyObject*
1146resize_copy(PyObject *unicode, Py_ssize_t length)
1147{
1148 Py_ssize_t copy_length;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001149 if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001150 PyObject *copy;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001151
Serhiy Storchaka2e58f1a2016-10-09 23:44:48 +03001152 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001153
1154 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
1155 if (copy == NULL)
1156 return NULL;
1157
1158 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
Victor Stinnerd3f08822012-05-29 12:57:52 +02001159 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, copy_length);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001160 return copy;
Victor Stinner8cfcbed2011-10-03 23:19:21 +02001161 }
1162 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001163 PyObject *w;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001164
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001165 w = (PyObject*)_PyUnicode_New(length);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001166 if (w == NULL)
1167 return NULL;
1168 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
1169 copy_length = Py_MIN(copy_length, length);
Christian Heimesf051e432016-09-13 20:22:02 +02001170 memcpy(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
Victor Stinnerc6cf1ba2012-10-23 02:54:47 +02001171 copy_length * sizeof(wchar_t));
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001172 return w;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001173 }
1174}
1175
Guido van Rossumd57fd912000-03-10 22:53:23 +00001176/* We allocate one more byte to make sure the string is
Martin v. Löwis47383402007-08-15 07:32:56 +00001177 Ux0000 terminated; some code (e.g. new_identifier)
1178 relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +00001179
1180 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +00001181 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +00001182
1183*/
1184
Alexander Belopolsky40018472011-02-26 01:02:56 +00001185static PyUnicodeObject *
1186_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001187{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001188 PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001189 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001190
Thomas Wouters477c8d52006-05-27 19:21:47 +00001191 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001192 if (length == 0 && unicode_empty != NULL) {
1193 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001194 return (PyUnicodeObject*)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001195 }
1196
Neal Norwitz3ce5d922008-08-24 07:08:55 +00001197 /* Ensure we won't overflow the size. */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -07001198 if (length > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00001199 return (PyUnicodeObject *)PyErr_NoMemory();
1200 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001201 if (length < 0) {
1202 PyErr_SetString(PyExc_SystemError,
1203 "Negative size passed to _PyUnicode_New");
1204 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001205 }
1206
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001207 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
1208 if (unicode == NULL)
1209 return NULL;
1210 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
Victor Stinner68b674c2013-10-29 19:31:43 +01001211
1212 _PyUnicode_WSTR_LENGTH(unicode) = length;
1213 _PyUnicode_HASH(unicode) = -1;
1214 _PyUnicode_STATE(unicode).interned = 0;
1215 _PyUnicode_STATE(unicode).kind = 0;
1216 _PyUnicode_STATE(unicode).compact = 0;
1217 _PyUnicode_STATE(unicode).ready = 0;
1218 _PyUnicode_STATE(unicode).ascii = 0;
1219 _PyUnicode_DATA_ANY(unicode) = NULL;
1220 _PyUnicode_LENGTH(unicode) = 0;
1221 _PyUnicode_UTF8(unicode) = NULL;
1222 _PyUnicode_UTF8_LENGTH(unicode) = 0;
1223
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001224 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
1225 if (!_PyUnicode_WSTR(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001226 Py_DECREF(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00001227 PyErr_NoMemory();
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001228 return NULL;
Guido van Rossum3c1bb802000-04-27 20:13:50 +00001229 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001230
Jeremy Hyltond8082792003-09-16 19:41:39 +00001231 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +00001232 * the caller fails before initializing str -- unicode_resize()
1233 * reads str[0], and the Keep-Alive optimization can keep memory
1234 * allocated for str alive across a call to unicode_dealloc(unicode).
1235 * We don't want unicode_resize to read uninitialized memory in
1236 * that case.
1237 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001238 _PyUnicode_WSTR(unicode)[0] = 0;
1239 _PyUnicode_WSTR(unicode)[length] = 0;
Victor Stinner68b674c2013-10-29 19:31:43 +01001240
Victor Stinner7931d9a2011-11-04 00:22:48 +01001241 assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001242 return unicode;
1243}
1244
Victor Stinnerf42dc442011-10-02 23:33:16 +02001245static const char*
1246unicode_kind_name(PyObject *unicode)
1247{
Victor Stinner42dfd712011-10-03 14:41:45 +02001248 /* don't check consistency: unicode_kind_name() is called from
1249 _PyUnicode_Dump() */
Victor Stinnerf42dc442011-10-02 23:33:16 +02001250 if (!PyUnicode_IS_COMPACT(unicode))
1251 {
1252 if (!PyUnicode_IS_READY(unicode))
1253 return "wstr";
Benjamin Petersonead6b532011-12-20 17:23:42 -06001254 switch (PyUnicode_KIND(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001255 {
1256 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001257 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001258 return "legacy ascii";
1259 else
1260 return "legacy latin1";
1261 case PyUnicode_2BYTE_KIND:
1262 return "legacy UCS2";
1263 case PyUnicode_4BYTE_KIND:
1264 return "legacy UCS4";
1265 default:
1266 return "<legacy invalid kind>";
1267 }
1268 }
1269 assert(PyUnicode_IS_READY(unicode));
Benjamin Petersonead6b532011-12-20 17:23:42 -06001270 switch (PyUnicode_KIND(unicode)) {
Victor Stinnerf42dc442011-10-02 23:33:16 +02001271 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001272 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001273 return "ascii";
1274 else
Victor Stinnera3b334d2011-10-03 13:53:37 +02001275 return "latin1";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001276 case PyUnicode_2BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001277 return "UCS2";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001278 case PyUnicode_4BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001279 return "UCS4";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001280 default:
1281 return "<invalid compact kind>";
1282 }
1283}
1284
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001285#ifdef Py_DEBUG
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001286/* Functions wrapping macros for use in debugger */
Victor Stinnera42de742018-11-22 10:25:22 +01001287char *_PyUnicode_utf8(void *unicode_raw){
1288 PyObject *unicode = _PyObject_CAST(unicode_raw);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001289 return PyUnicode_UTF8(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001290}
1291
Victor Stinnera42de742018-11-22 10:25:22 +01001292void *_PyUnicode_compact_data(void *unicode_raw) {
1293 PyObject *unicode = _PyObject_CAST(unicode_raw);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001294 return _PyUnicode_COMPACT_DATA(unicode);
1295}
Victor Stinnera42de742018-11-22 10:25:22 +01001296void *_PyUnicode_data(void *unicode_raw) {
1297 PyObject *unicode = _PyObject_CAST(unicode_raw);
Zackery Spytz1a2252e2019-05-06 10:56:51 -06001298 printf("obj %p\n", (void*)unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001299 printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
1300 printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
1301 printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
1302 printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
1303 printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
1304 return PyUnicode_DATA(unicode);
1305}
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001306
1307void
1308_PyUnicode_Dump(PyObject *op)
1309{
1310 PyASCIIObject *ascii = (PyASCIIObject *)op;
Victor Stinnera849a4b2011-10-03 12:12:11 +02001311 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
1312 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
1313 void *data;
Victor Stinner0d60e872011-10-23 19:47:19 +02001314
Victor Stinnera849a4b2011-10-03 12:12:11 +02001315 if (ascii->state.compact)
Victor Stinner0d60e872011-10-23 19:47:19 +02001316 {
1317 if (ascii->state.ascii)
1318 data = (ascii + 1);
1319 else
1320 data = (compact + 1);
1321 }
Victor Stinnera849a4b2011-10-03 12:12:11 +02001322 else
1323 data = unicode->data.any;
Victor Stinner293f3f52014-07-01 08:57:10 +02001324 printf("%s: len=%" PY_FORMAT_SIZE_T "u, ",
1325 unicode_kind_name(op), ascii->length);
Victor Stinner0d60e872011-10-23 19:47:19 +02001326
Victor Stinnera849a4b2011-10-03 12:12:11 +02001327 if (ascii->wstr == data)
1328 printf("shared ");
Zackery Spytz1a2252e2019-05-06 10:56:51 -06001329 printf("wstr=%p", (void *)ascii->wstr);
Victor Stinner0d60e872011-10-23 19:47:19 +02001330
Victor Stinnera3b334d2011-10-03 13:53:37 +02001331 if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
Victor Stinner293f3f52014-07-01 08:57:10 +02001332 printf(" (%" PY_FORMAT_SIZE_T "u), ", compact->wstr_length);
Victor Stinnera849a4b2011-10-03 12:12:11 +02001333 if (!ascii->state.compact && compact->utf8 == unicode->data.any)
1334 printf("shared ");
Victor Stinner293f3f52014-07-01 08:57:10 +02001335 printf("utf8=%p (%" PY_FORMAT_SIZE_T "u)",
Zackery Spytz1a2252e2019-05-06 10:56:51 -06001336 (void *)compact->utf8, compact->utf8_length);
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001337 }
Victor Stinnera849a4b2011-10-03 12:12:11 +02001338 printf(", data=%p\n", data);
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001339}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001340#endif
1341
1342PyObject *
1343PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
1344{
1345 PyObject *obj;
1346 PyCompactUnicodeObject *unicode;
1347 void *data;
Victor Stinner8f825062012-04-27 13:55:39 +02001348 enum PyUnicode_Kind kind;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001349 int is_sharing, is_ascii;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001350 Py_ssize_t char_size;
1351 Py_ssize_t struct_size;
1352
1353 /* Optimization for empty strings */
1354 if (size == 0 && unicode_empty != NULL) {
1355 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001356 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001357 }
1358
Victor Stinner9e9d6892011-10-04 01:02:02 +02001359 is_ascii = 0;
1360 is_sharing = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001361 struct_size = sizeof(PyCompactUnicodeObject);
1362 if (maxchar < 128) {
Victor Stinner8f825062012-04-27 13:55:39 +02001363 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001364 char_size = 1;
1365 is_ascii = 1;
1366 struct_size = sizeof(PyASCIIObject);
1367 }
1368 else if (maxchar < 256) {
Victor Stinner8f825062012-04-27 13:55:39 +02001369 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001370 char_size = 1;
1371 }
1372 else if (maxchar < 65536) {
Victor Stinner8f825062012-04-27 13:55:39 +02001373 kind = PyUnicode_2BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001374 char_size = 2;
1375 if (sizeof(wchar_t) == 2)
1376 is_sharing = 1;
1377 }
1378 else {
Victor Stinnerc9590ad2012-03-04 01:34:37 +01001379 if (maxchar > MAX_UNICODE) {
1380 PyErr_SetString(PyExc_SystemError,
1381 "invalid maximum character passed to PyUnicode_New");
1382 return NULL;
1383 }
Victor Stinner8f825062012-04-27 13:55:39 +02001384 kind = PyUnicode_4BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001385 char_size = 4;
1386 if (sizeof(wchar_t) == 4)
1387 is_sharing = 1;
1388 }
1389
1390 /* Ensure we won't overflow the size. */
1391 if (size < 0) {
1392 PyErr_SetString(PyExc_SystemError,
1393 "Negative size passed to PyUnicode_New");
1394 return NULL;
1395 }
1396 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1397 return PyErr_NoMemory();
1398
1399 /* Duplicated allocation code from _PyObject_New() instead of a call to
1400 * PyObject_New() so we are able to allocate space for the object and
1401 * it's data buffer.
1402 */
1403 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
1404 if (obj == NULL)
1405 return PyErr_NoMemory();
1406 obj = PyObject_INIT(obj, &PyUnicode_Type);
1407 if (obj == NULL)
1408 return NULL;
1409
1410 unicode = (PyCompactUnicodeObject *)obj;
1411 if (is_ascii)
1412 data = ((PyASCIIObject*)obj) + 1;
1413 else
1414 data = unicode + 1;
1415 _PyUnicode_LENGTH(unicode) = size;
1416 _PyUnicode_HASH(unicode) = -1;
1417 _PyUnicode_STATE(unicode).interned = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001418 _PyUnicode_STATE(unicode).kind = kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001419 _PyUnicode_STATE(unicode).compact = 1;
1420 _PyUnicode_STATE(unicode).ready = 1;
1421 _PyUnicode_STATE(unicode).ascii = is_ascii;
1422 if (is_ascii) {
1423 ((char*)data)[size] = 0;
1424 _PyUnicode_WSTR(unicode) = NULL;
1425 }
Victor Stinner8f825062012-04-27 13:55:39 +02001426 else if (kind == PyUnicode_1BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001427 ((char*)data)[size] = 0;
1428 _PyUnicode_WSTR(unicode) = NULL;
1429 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001430 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001431 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001432 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001433 else {
1434 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001435 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001436 if (kind == PyUnicode_2BYTE_KIND)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001437 ((Py_UCS2*)data)[size] = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001438 else /* kind == PyUnicode_4BYTE_KIND */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001439 ((Py_UCS4*)data)[size] = 0;
1440 if (is_sharing) {
1441 _PyUnicode_WSTR_LENGTH(unicode) = size;
1442 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
1443 }
1444 else {
1445 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1446 _PyUnicode_WSTR(unicode) = NULL;
1447 }
1448 }
Victor Stinner8f825062012-04-27 13:55:39 +02001449#ifdef Py_DEBUG
Victor Stinnerafffce42012-10-03 23:03:17 +02001450 unicode_fill_invalid((PyObject*)unicode, 0);
Victor Stinner8f825062012-04-27 13:55:39 +02001451#endif
Victor Stinner7931d9a2011-11-04 00:22:48 +01001452 assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001453 return obj;
1454}
1455
#if SIZEOF_WCHAR_T == 2
/* Helper converting a 16-bit wchar_t buffer [begin, end) into the UCS4
   storage of `unicode`.  A high surrogate immediately followed by a low
   surrogate is joined into one code point; any other unit (including a lone
   surrogate) is copied through unchanged.  The other conversions are
   implemented as macros for efficiency.

   `unicode` must already be a 4-byte kind string whose length equals the
   number of code points produced (checked by assertions only).  This
   function assumes that unicode can hold one more code point than wstr
   characters for a terminating null character. */
static void
unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
                              PyObject *unicode)
{
    const wchar_t *src = begin;
    Py_UCS4 *dst;

    assert(unicode != NULL);
    assert(_PyUnicode_CHECK(unicode));
    assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
    dst = PyUnicode_4BYTE_DATA(unicode);

    while (src < end) {
        int is_pair;

        /* Never write past the declared length of the target. */
        assert(dst < (PyUnicode_4BYTE_DATA(unicode) +
                      _PyUnicode_GET_LENGTH(unicode)));

        is_pair = (Py_UNICODE_IS_HIGH_SURROGATE(src[0])
                   && (src + 1) < end
                   && Py_UNICODE_IS_LOW_SURROGATE(src[1]));
        if (is_pair) {
            *dst++ = Py_UNICODE_JOIN_SURROGATES(src[0], src[1]);
            src += 2;
        }
        else {
            *dst++ = *src;
            src += 1;
        }
    }

    /* The output must have been filled exactly. */
    assert(dst == (PyUnicode_4BYTE_DATA(unicode) +
                   _PyUnicode_GET_LENGTH(unicode)));
}
#endif
1495
Victor Stinnercd9950f2011-10-02 00:34:53 +02001496static int
Victor Stinner488fa492011-12-12 00:01:39 +01001497unicode_check_modifiable(PyObject *unicode)
Victor Stinnercd9950f2011-10-02 00:34:53 +02001498{
Victor Stinner488fa492011-12-12 00:01:39 +01001499 if (!unicode_modifiable(unicode)) {
Victor Stinner01698042011-10-04 00:04:26 +02001500 PyErr_SetString(PyExc_SystemError,
Victor Stinner488fa492011-12-12 00:01:39 +01001501 "Cannot modify a string currently used");
Victor Stinnercd9950f2011-10-02 00:34:53 +02001502 return -1;
1503 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02001504 return 0;
1505}
1506
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001507static int
1508_copy_characters(PyObject *to, Py_ssize_t to_start,
1509 PyObject *from, Py_ssize_t from_start,
1510 Py_ssize_t how_many, int check_maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001511{
Victor Stinnera0702ab2011-09-29 14:14:38 +02001512 unsigned int from_kind, to_kind;
1513 void *from_data, *to_data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001514
Victor Stinneree4544c2012-05-09 22:24:08 +02001515 assert(0 <= how_many);
1516 assert(0 <= from_start);
1517 assert(0 <= to_start);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001518 assert(PyUnicode_Check(from));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001519 assert(PyUnicode_IS_READY(from));
Victor Stinneree4544c2012-05-09 22:24:08 +02001520 assert(from_start + how_many <= PyUnicode_GET_LENGTH(from));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001521
Victor Stinnerd3f08822012-05-29 12:57:52 +02001522 assert(PyUnicode_Check(to));
1523 assert(PyUnicode_IS_READY(to));
1524 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1525
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001526 if (how_many == 0)
1527 return 0;
1528
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001529 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001530 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001531 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001532 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001533
Victor Stinnerf1852262012-06-16 16:38:26 +02001534#ifdef Py_DEBUG
1535 if (!check_maxchar
1536 && PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to))
1537 {
1538 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1539 Py_UCS4 ch;
1540 Py_ssize_t i;
1541 for (i=0; i < how_many; i++) {
1542 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1543 assert(ch <= to_maxchar);
1544 }
1545 }
1546#endif
1547
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001548 if (from_kind == to_kind) {
Victor Stinnerf1852262012-06-16 16:38:26 +02001549 if (check_maxchar
1550 && !PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))
1551 {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001552 /* Writing Latin-1 characters into an ASCII string requires to
1553 check that all written characters are pure ASCII */
Victor Stinnerf1852262012-06-16 16:38:26 +02001554 Py_UCS4 max_char;
1555 max_char = ucs1lib_find_max_char(from_data,
1556 (Py_UCS1*)from_data + how_many);
1557 if (max_char >= 128)
1558 return -1;
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001559 }
Christian Heimesf051e432016-09-13 20:22:02 +02001560 memcpy((char*)to_data + to_kind * to_start,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001561 (char*)from_data + from_kind * from_start,
1562 to_kind * how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001563 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001564 else if (from_kind == PyUnicode_1BYTE_KIND
1565 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001566 {
1567 _PyUnicode_CONVERT_BYTES(
1568 Py_UCS1, Py_UCS2,
1569 PyUnicode_1BYTE_DATA(from) + from_start,
1570 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1571 PyUnicode_2BYTE_DATA(to) + to_start
1572 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001573 }
Victor Stinner157f83f2011-09-28 21:41:31 +02001574 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001575 && to_kind == PyUnicode_4BYTE_KIND)
1576 {
1577 _PyUnicode_CONVERT_BYTES(
1578 Py_UCS1, Py_UCS4,
1579 PyUnicode_1BYTE_DATA(from) + from_start,
1580 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1581 PyUnicode_4BYTE_DATA(to) + to_start
1582 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001583 }
1584 else if (from_kind == PyUnicode_2BYTE_KIND
1585 && to_kind == PyUnicode_4BYTE_KIND)
1586 {
1587 _PyUnicode_CONVERT_BYTES(
1588 Py_UCS2, Py_UCS4,
1589 PyUnicode_2BYTE_DATA(from) + from_start,
1590 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1591 PyUnicode_4BYTE_DATA(to) + to_start
1592 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001593 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001594 else {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001595 assert (PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to));
1596
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001597 if (!check_maxchar) {
1598 if (from_kind == PyUnicode_2BYTE_KIND
1599 && to_kind == PyUnicode_1BYTE_KIND)
1600 {
1601 _PyUnicode_CONVERT_BYTES(
1602 Py_UCS2, Py_UCS1,
1603 PyUnicode_2BYTE_DATA(from) + from_start,
1604 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1605 PyUnicode_1BYTE_DATA(to) + to_start
1606 );
1607 }
1608 else if (from_kind == PyUnicode_4BYTE_KIND
1609 && to_kind == PyUnicode_1BYTE_KIND)
1610 {
1611 _PyUnicode_CONVERT_BYTES(
1612 Py_UCS4, Py_UCS1,
1613 PyUnicode_4BYTE_DATA(from) + from_start,
1614 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1615 PyUnicode_1BYTE_DATA(to) + to_start
1616 );
1617 }
1618 else if (from_kind == PyUnicode_4BYTE_KIND
1619 && to_kind == PyUnicode_2BYTE_KIND)
1620 {
1621 _PyUnicode_CONVERT_BYTES(
1622 Py_UCS4, Py_UCS2,
1623 PyUnicode_4BYTE_DATA(from) + from_start,
1624 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1625 PyUnicode_2BYTE_DATA(to) + to_start
1626 );
1627 }
1628 else {
Barry Warsawb2e57942017-09-14 18:13:16 -07001629 Py_UNREACHABLE();
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001630 }
1631 }
Victor Stinnerf1852262012-06-16 16:38:26 +02001632 else {
Victor Stinnera0702ab2011-09-29 14:14:38 +02001633 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001634 Py_UCS4 ch;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001635 Py_ssize_t i;
1636
Victor Stinnera0702ab2011-09-29 14:14:38 +02001637 for (i=0; i < how_many; i++) {
1638 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001639 if (ch > to_maxchar)
1640 return -1;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001641 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1642 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001643 }
1644 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001645 return 0;
1646}
1647
Victor Stinnerd3f08822012-05-29 12:57:52 +02001648void
1649_PyUnicode_FastCopyCharacters(
1650 PyObject *to, Py_ssize_t to_start,
1651 PyObject *from, Py_ssize_t from_start, Py_ssize_t how_many)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001652{
1653 (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1654}
1655
1656Py_ssize_t
1657PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1658 PyObject *from, Py_ssize_t from_start,
1659 Py_ssize_t how_many)
1660{
1661 int err;
1662
1663 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1664 PyErr_BadInternalCall();
1665 return -1;
1666 }
1667
Benjamin Petersonbac79492012-01-14 13:34:47 -05001668 if (PyUnicode_READY(from) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001669 return -1;
Benjamin Petersonbac79492012-01-14 13:34:47 -05001670 if (PyUnicode_READY(to) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001671 return -1;
1672
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001673 if ((size_t)from_start > (size_t)PyUnicode_GET_LENGTH(from)) {
Victor Stinnerd3f08822012-05-29 12:57:52 +02001674 PyErr_SetString(PyExc_IndexError, "string index out of range");
1675 return -1;
1676 }
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001677 if ((size_t)to_start > (size_t)PyUnicode_GET_LENGTH(to)) {
Victor Stinnerd3f08822012-05-29 12:57:52 +02001678 PyErr_SetString(PyExc_IndexError, "string index out of range");
1679 return -1;
1680 }
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001681 if (how_many < 0) {
1682 PyErr_SetString(PyExc_SystemError, "how_many cannot be negative");
1683 return -1;
1684 }
1685 how_many = Py_MIN(PyUnicode_GET_LENGTH(from)-from_start, how_many);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001686 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1687 PyErr_Format(PyExc_SystemError,
Victor Stinnera33bce02014-07-04 22:47:46 +02001688 "Cannot write %zi characters at %zi "
1689 "in a string of %zi characters",
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001690 how_many, to_start, PyUnicode_GET_LENGTH(to));
1691 return -1;
1692 }
1693
1694 if (how_many == 0)
1695 return 0;
1696
Victor Stinner488fa492011-12-12 00:01:39 +01001697 if (unicode_check_modifiable(to))
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001698 return -1;
1699
1700 err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1701 if (err) {
1702 PyErr_Format(PyExc_SystemError,
1703 "Cannot copy %s characters "
1704 "into a string of %s characters",
1705 unicode_kind_name(from),
1706 unicode_kind_name(to));
1707 return -1;
1708 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001709 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001710}
1711
Victor Stinner17222162011-09-28 22:15:37 +02001712/* Find the maximum code point and count the number of surrogate pairs so a
1713 correct string length can be computed before converting a string to UCS4.
1714 This function counts single surrogates as a character and not as a pair.
1715
1716 Return 0 on success, or -1 on error. */
1717static int
1718find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1719 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001720{
1721 const wchar_t *iter;
Victor Stinner8faf8212011-12-08 22:14:11 +01001722 Py_UCS4 ch;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001723
Victor Stinnerc53be962011-10-02 21:33:54 +02001724 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001725 *num_surrogates = 0;
1726 *maxchar = 0;
1727
1728 for (iter = begin; iter < end; ) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001729#if SIZEOF_WCHAR_T == 2
Victor Stinnercf77da92013-03-06 01:09:24 +01001730 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1731 && (iter+1) < end
1732 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
1733 {
1734 ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
1735 ++(*num_surrogates);
1736 iter += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001737 }
1738 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001739#endif
Victor Stinner8faf8212011-12-08 22:14:11 +01001740 {
1741 ch = *iter;
1742 iter++;
1743 }
1744 if (ch > *maxchar) {
1745 *maxchar = ch;
1746 if (*maxchar > MAX_UNICODE) {
1747 PyErr_Format(PyExc_ValueError,
1748 "character U+%x is not in range [U+0000; U+10ffff]",
1749 ch);
1750 return -1;
1751 }
1752 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001753 }
1754 return 0;
1755}
1756
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001757int
1758_PyUnicode_Ready(PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001759{
1760 wchar_t *end;
1761 Py_UCS4 maxchar = 0;
1762 Py_ssize_t num_surrogates;
1763#if SIZEOF_WCHAR_T == 2
1764 Py_ssize_t length_wo_surrogates;
1765#endif
1766
Georg Brandl7597add2011-10-05 16:36:47 +02001767 /* _PyUnicode_Ready() is only intended for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +02001768 strings were created using _PyObject_New() and where no canonical
1769 representation (the str field) has been set yet aka strings
1770 which are not yet ready. */
Victor Stinner910337b2011-10-03 03:20:16 +02001771 assert(_PyUnicode_CHECK(unicode));
1772 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001773 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerc3c74152011-10-02 20:39:55 +02001774 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001775 assert(_PyUnicode_UTF8(unicode) == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +02001776 /* Actually, it should neither be interned nor be anything else: */
1777 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001778
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001779 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +02001780 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +02001781 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001782 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001783
1784 if (maxchar < 256) {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001785 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1786 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001787 PyErr_NoMemory();
1788 return -1;
1789 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001790 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001791 _PyUnicode_WSTR(unicode), end,
1792 PyUnicode_1BYTE_DATA(unicode));
1793 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1794 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1795 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1796 if (maxchar < 128) {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001797 _PyUnicode_STATE(unicode).ascii = 1;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001798 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001799 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001800 }
1801 else {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001802 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001803 _PyUnicode_UTF8(unicode) = NULL;
1804 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001805 }
1806 PyObject_FREE(_PyUnicode_WSTR(unicode));
1807 _PyUnicode_WSTR(unicode) = NULL;
1808 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1809 }
1810 /* In this case we might have to convert down from 4-byte native
1811 wchar_t to 2-byte unicode. */
1812 else if (maxchar < 65536) {
1813 assert(num_surrogates == 0 &&
1814 "FindMaxCharAndNumSurrogatePairs() messed up");
1815
Victor Stinner506f5922011-09-28 22:34:18 +02001816#if SIZEOF_WCHAR_T == 2
1817 /* We can share representations and are done. */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001818 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Victor Stinner506f5922011-09-28 22:34:18 +02001819 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1820 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1821 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001822 _PyUnicode_UTF8(unicode) = NULL;
1823 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001824#else
1825 /* sizeof(wchar_t) == 4 */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001826 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
Victor Stinner506f5922011-09-28 22:34:18 +02001827 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
Victor Stinnerc3c74152011-10-02 20:39:55 +02001828 if (!_PyUnicode_DATA_ANY(unicode)) {
Victor Stinner506f5922011-09-28 22:34:18 +02001829 PyErr_NoMemory();
1830 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001831 }
Victor Stinner506f5922011-09-28 22:34:18 +02001832 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1833 _PyUnicode_WSTR(unicode), end,
1834 PyUnicode_2BYTE_DATA(unicode));
1835 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1836 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1837 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001838 _PyUnicode_UTF8(unicode) = NULL;
1839 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001840 PyObject_FREE(_PyUnicode_WSTR(unicode));
1841 _PyUnicode_WSTR(unicode) = NULL;
1842 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1843#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001844 }
1845 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
1846 else {
1847#if SIZEOF_WCHAR_T == 2
1848 /* in case the native representation is 2-bytes, we need to allocate a
1849 new normalized 4-byte version. */
1850 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
Serhiy Storchakae55181f2015-02-20 21:34:06 +02001851 if (length_wo_surrogates > PY_SSIZE_T_MAX / 4 - 1) {
1852 PyErr_NoMemory();
1853 return -1;
1854 }
Victor Stinnerc3c74152011-10-02 20:39:55 +02001855 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1856 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001857 PyErr_NoMemory();
1858 return -1;
1859 }
1860 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1861 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001862 _PyUnicode_UTF8(unicode) = NULL;
1863 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner126c5592011-10-03 04:17:10 +02001864 /* unicode_convert_wchar_to_ucs4() requires a ready string */
1865 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerc53be962011-10-02 21:33:54 +02001866 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001867 PyObject_FREE(_PyUnicode_WSTR(unicode));
1868 _PyUnicode_WSTR(unicode) = NULL;
1869 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1870#else
1871 assert(num_surrogates == 0);
1872
Victor Stinnerc3c74152011-10-02 20:39:55 +02001873 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001874 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001875 _PyUnicode_UTF8(unicode) = NULL;
1876 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001877 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1878#endif
1879 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1880 }
1881 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001882 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001883 return 0;
1884}
1885
Alexander Belopolsky40018472011-02-26 01:02:56 +00001886static void
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001887unicode_dealloc(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001888{
Walter Dörwald16807132007-05-25 13:52:07 +00001889 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001890 case SSTATE_NOT_INTERNED:
1891 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001892
Benjamin Peterson29060642009-01-31 22:14:21 +00001893 case SSTATE_INTERNED_MORTAL:
1894 /* revive dead object temporarily for DelItem */
1895 Py_REFCNT(unicode) = 3;
Victor Stinner7931d9a2011-11-04 00:22:48 +01001896 if (PyDict_DelItem(interned, unicode) != 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00001897 Py_FatalError(
1898 "deletion of interned string failed");
1899 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001900
Benjamin Peterson29060642009-01-31 22:14:21 +00001901 case SSTATE_INTERNED_IMMORTAL:
1902 Py_FatalError("Immortal interned string died.");
Stefan Krahf432a322017-08-21 13:09:59 +02001903 /* fall through */
Walter Dörwald16807132007-05-25 13:52:07 +00001904
Benjamin Peterson29060642009-01-31 22:14:21 +00001905 default:
1906 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +00001907 }
1908
Victor Stinner03490912011-10-03 23:45:12 +02001909 if (_PyUnicode_HAS_WSTR_MEMORY(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001910 PyObject_DEL(_PyUnicode_WSTR(unicode));
Victor Stinner829c0ad2011-10-03 01:08:02 +02001911 if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001912 PyObject_DEL(_PyUnicode_UTF8(unicode));
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001913 if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode))
1914 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001915
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001916 Py_TYPE(unicode)->tp_free(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001917}
1918
#ifdef Py_DEBUG
/* Debug helper: return 1 if `unicode` is one of the shared singletons
   (the empty string, or the cached one-character string stored in
   unicode_latin1[] for its character), 0 otherwise. */
static int
unicode_is_singleton(PyObject *unicode)
{
    PyASCIIObject *ascii = (PyASCIIObject *)unicode;
    Py_UCS4 ch;

    if (unicode == unicode_empty)
        return 1;

    /* Only ready strings of exactly one character can be cached. */
    if (ascii->state.kind == PyUnicode_WCHAR_KIND || ascii->length != 1)
        return 0;

    ch = PyUnicode_READ_CHAR(unicode, 0);
    if (ch < 256 && unicode_latin1[ch] == unicode)
        return 1;
    return 0;
}
#endif
1935
Alexander Belopolsky40018472011-02-26 01:02:56 +00001936static int
Victor Stinner488fa492011-12-12 00:01:39 +01001937unicode_modifiable(PyObject *unicode)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001938{
Victor Stinner488fa492011-12-12 00:01:39 +01001939 assert(_PyUnicode_CHECK(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001940 if (Py_REFCNT(unicode) != 1)
1941 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001942 if (_PyUnicode_HASH(unicode) != -1)
1943 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001944 if (PyUnicode_CHECK_INTERNED(unicode))
1945 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001946 if (!PyUnicode_CheckExact(unicode))
1947 return 0;
Victor Stinner77bb47b2011-10-03 20:06:05 +02001948#ifdef Py_DEBUG
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001949 /* singleton refcount is greater than 1 */
1950 assert(!unicode_is_singleton(unicode));
Victor Stinner77bb47b2011-10-03 20:06:05 +02001951#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001952 return 1;
1953}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001954
Victor Stinnerfe226c02011-10-03 03:52:20 +02001955static int
1956unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1957{
1958 PyObject *unicode;
1959 Py_ssize_t old_length;
1960
1961 assert(p_unicode != NULL);
1962 unicode = *p_unicode;
1963
1964 assert(unicode != NULL);
1965 assert(PyUnicode_Check(unicode));
1966 assert(0 <= length);
1967
Victor Stinner910337b2011-10-03 03:20:16 +02001968 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001969 old_length = PyUnicode_WSTR_LENGTH(unicode);
1970 else
1971 old_length = PyUnicode_GET_LENGTH(unicode);
1972 if (old_length == length)
1973 return 0;
1974
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001975 if (length == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +02001976 _Py_INCREF_UNICODE_EMPTY();
1977 if (!unicode_empty)
Benjamin Peterson29060642009-01-31 22:14:21 +00001978 return -1;
Serhiy Storchaka57a01d32016-04-10 18:05:40 +03001979 Py_SETREF(*p_unicode, unicode_empty);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001980 return 0;
1981 }
1982
Victor Stinner488fa492011-12-12 00:01:39 +01001983 if (!unicode_modifiable(unicode)) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001984 PyObject *copy = resize_copy(unicode, length);
1985 if (copy == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001986 return -1;
Serhiy Storchaka57a01d32016-04-10 18:05:40 +03001987 Py_SETREF(*p_unicode, copy);
Benjamin Peterson29060642009-01-31 22:14:21 +00001988 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001989 }
1990
Victor Stinnerfe226c02011-10-03 03:52:20 +02001991 if (PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001992 PyObject *new_unicode = resize_compact(unicode, length);
1993 if (new_unicode == NULL)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001994 return -1;
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001995 *p_unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001996 return 0;
Benjamin Peterson4bfce8f2011-10-03 19:35:07 -04001997 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001998 return resize_inplace(unicode, length);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001999}
2000
Alexander Belopolsky40018472011-02-26 01:02:56 +00002001int
Victor Stinnerfe226c02011-10-03 03:52:20 +02002002PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00002003{
Victor Stinnerfe226c02011-10-03 03:52:20 +02002004 PyObject *unicode;
2005 if (p_unicode == NULL) {
2006 PyErr_BadInternalCall();
2007 return -1;
2008 }
2009 unicode = *p_unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01002010 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
Victor Stinnerfe226c02011-10-03 03:52:20 +02002011 {
2012 PyErr_BadInternalCall();
2013 return -1;
2014 }
2015 return unicode_resize(p_unicode, length);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00002016}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002017
Serhiy Storchakad65c9492015-11-02 14:10:23 +02002018/* Copy an ASCII or latin1 char* string into a Python Unicode string.
Victor Stinnerc5166102012-02-22 13:55:02 +01002019
Victor Stinnerb429d3b2012-02-22 21:22:20 +01002020 WARNING: The function doesn't copy the terminating null character and
2021 doesn't check the maximum character (may write a latin1 character in an
2022 ASCII string). */
Victor Stinner184252a2012-06-16 02:57:41 +02002023static void
2024unicode_write_cstr(PyObject *unicode, Py_ssize_t index,
2025 const char *str, Py_ssize_t len)
Victor Stinnerc5166102012-02-22 13:55:02 +01002026{
2027 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
2028 void *data = PyUnicode_DATA(unicode);
Victor Stinner184252a2012-06-16 02:57:41 +02002029 const char *end = str + len;
Victor Stinnerc5166102012-02-22 13:55:02 +01002030
2031 switch (kind) {
2032 case PyUnicode_1BYTE_KIND: {
Victor Stinnerc5166102012-02-22 13:55:02 +01002033 assert(index + len <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner8c6db452012-10-06 00:40:45 +02002034#ifdef Py_DEBUG
2035 if (PyUnicode_IS_ASCII(unicode)) {
2036 Py_UCS4 maxchar = ucs1lib_find_max_char(
2037 (const Py_UCS1*)str,
2038 (const Py_UCS1*)str + len);
2039 assert(maxchar < 128);
2040 }
2041#endif
Antoine Pitrouba6bafc2012-02-22 16:41:50 +01002042 memcpy((char *) data + index, str, len);
Victor Stinner184252a2012-06-16 02:57:41 +02002043 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01002044 }
2045 case PyUnicode_2BYTE_KIND: {
2046 Py_UCS2 *start = (Py_UCS2 *)data + index;
2047 Py_UCS2 *ucs2 = start;
2048 assert(index <= PyUnicode_GET_LENGTH(unicode));
2049
Victor Stinner184252a2012-06-16 02:57:41 +02002050 for (; str < end; ++ucs2, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01002051 *ucs2 = (Py_UCS2)*str;
2052
2053 assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner184252a2012-06-16 02:57:41 +02002054 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01002055 }
2056 default: {
2057 Py_UCS4 *start = (Py_UCS4 *)data + index;
2058 Py_UCS4 *ucs4 = start;
2059 assert(kind == PyUnicode_4BYTE_KIND);
2060 assert(index <= PyUnicode_GET_LENGTH(unicode));
2061
Victor Stinner184252a2012-06-16 02:57:41 +02002062 for (; str < end; ++ucs4, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01002063 *ucs4 = (Py_UCS4)*str;
2064
2065 assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinnerc5166102012-02-22 13:55:02 +01002066 }
2067 }
2068}
2069
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002070static PyObject*
2071get_latin1_char(unsigned char ch)
2072{
Victor Stinnera464fc12011-10-02 20:39:30 +02002073 PyObject *unicode = unicode_latin1[ch];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002074 if (!unicode) {
Victor Stinnera464fc12011-10-02 20:39:30 +02002075 unicode = PyUnicode_New(1, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002076 if (!unicode)
2077 return NULL;
2078 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002079 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002080 unicode_latin1[ch] = unicode;
2081 }
2082 Py_INCREF(unicode);
Victor Stinnera464fc12011-10-02 20:39:30 +02002083 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002084}
2085
Victor Stinner985a82a2014-01-03 12:53:47 +01002086static PyObject*
2087unicode_char(Py_UCS4 ch)
2088{
2089 PyObject *unicode;
2090
2091 assert(ch <= MAX_UNICODE);
2092
Victor Stinnerf3b46b42014-01-03 13:16:00 +01002093 if (ch < 256)
2094 return get_latin1_char(ch);
2095
Victor Stinner985a82a2014-01-03 12:53:47 +01002096 unicode = PyUnicode_New(1, ch);
2097 if (unicode == NULL)
2098 return NULL;
Serhiy Storchaka2e58f1a2016-10-09 23:44:48 +03002099
2100 assert(PyUnicode_KIND(unicode) != PyUnicode_1BYTE_KIND);
2101 if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
Victor Stinner985a82a2014-01-03 12:53:47 +01002102 PyUnicode_2BYTE_DATA(unicode)[0] = (Py_UCS2)ch;
Serhiy Storchaka2e58f1a2016-10-09 23:44:48 +03002103 } else {
Victor Stinner985a82a2014-01-03 12:53:47 +01002104 assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
2105 PyUnicode_4BYTE_DATA(unicode)[0] = ch;
2106 }
2107 assert(_PyUnicode_CheckConsistency(unicode, 1));
2108 return unicode;
2109}
2110
Alexander Belopolsky40018472011-02-26 01:02:56 +00002111PyObject *
2112PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002113{
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02002114 if (u == NULL)
2115 return (PyObject*)_PyUnicode_New(size);
2116
2117 if (size < 0) {
2118 PyErr_BadInternalCall();
2119 return NULL;
2120 }
2121
2122 return PyUnicode_FromWideChar(u, size);
2123}
2124
2125PyObject *
2126PyUnicode_FromWideChar(const wchar_t *u, Py_ssize_t size)
2127{
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002128 PyObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002129 Py_UCS4 maxchar = 0;
2130 Py_ssize_t num_surrogates;
2131
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02002132 if (u == NULL && size != 0) {
2133 PyErr_BadInternalCall();
2134 return NULL;
2135 }
2136
2137 if (size == -1) {
2138 size = wcslen(u);
2139 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002140
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002141 /* If the Unicode data is known at construction time, we can apply
2142 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002143
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002144 /* Optimization for empty strings */
Serhiy Storchaka678db842013-01-26 12:16:36 +02002145 if (size == 0)
2146 _Py_RETURN_UNICODE_EMPTY();
Tim Petersced69f82003-09-16 20:30:58 +00002147
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002148 /* Single character Unicode objects in the Latin-1 range are
2149 shared when using this constructor */
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002150 if (size == 1 && (Py_UCS4)*u < 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002151 return get_latin1_char((unsigned char)*u);
2152
2153 /* If not empty and not single character, copy the Unicode data
2154 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02002155 if (find_maxchar_surrogates(u, u + size,
2156 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002157 return NULL;
2158
Victor Stinner8faf8212011-12-08 22:14:11 +01002159 unicode = PyUnicode_New(size - num_surrogates, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002160 if (!unicode)
2161 return NULL;
2162
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002163 switch (PyUnicode_KIND(unicode)) {
2164 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02002165 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002166 u, u + size, PyUnicode_1BYTE_DATA(unicode));
2167 break;
2168 case PyUnicode_2BYTE_KIND:
2169#if Py_UNICODE_SIZE == 2
Christian Heimesf051e432016-09-13 20:22:02 +02002170 memcpy(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002171#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02002172 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002173 u, u + size, PyUnicode_2BYTE_DATA(unicode));
2174#endif
2175 break;
2176 case PyUnicode_4BYTE_KIND:
2177#if SIZEOF_WCHAR_T == 2
2178 /* This is the only case which has to process surrogates, thus
2179 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02002180 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002181#else
2182 assert(num_surrogates == 0);
Christian Heimesf051e432016-09-13 20:22:02 +02002183 memcpy(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002184#endif
2185 break;
2186 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07002187 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002188 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002189
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002190 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002191}
2192
Alexander Belopolsky40018472011-02-26 01:02:56 +00002193PyObject *
2194PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00002195{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002196 if (size < 0) {
2197 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00002198 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00002199 return NULL;
2200 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002201 if (u != NULL)
2202 return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
2203 else
2204 return (PyObject *)_PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00002205}
2206
Alexander Belopolsky40018472011-02-26 01:02:56 +00002207PyObject *
2208PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00002209{
2210 size_t size = strlen(u);
2211 if (size > PY_SSIZE_T_MAX) {
2212 PyErr_SetString(PyExc_OverflowError, "input too long");
2213 return NULL;
2214 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002215 return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002216}
2217
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002218PyObject *
2219_PyUnicode_FromId(_Py_Identifier *id)
2220{
2221 if (!id->object) {
Victor Stinnerd1cd99b2012-02-07 23:05:55 +01002222 id->object = PyUnicode_DecodeUTF8Stateful(id->string,
2223 strlen(id->string),
2224 NULL, NULL);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002225 if (!id->object)
2226 return NULL;
2227 PyUnicode_InternInPlace(&id->object);
2228 assert(!id->next);
2229 id->next = static_strings;
2230 static_strings = id;
2231 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002232 return id->object;
2233}
2234
2235void
2236_PyUnicode_ClearStaticStrings()
2237{
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002238 _Py_Identifier *tmp, *s = static_strings;
2239 while (s) {
Serhiy Storchaka505ff752014-02-09 13:33:53 +02002240 Py_CLEAR(s->object);
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002241 tmp = s->next;
2242 s->next = NULL;
2243 s = tmp;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002244 }
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002245 static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002246}
2247
Benjamin Peterson0df54292012-03-26 14:50:32 -04002248/* Internal function, doesn't check maximum character */
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002249
Victor Stinnerd3f08822012-05-29 12:57:52 +02002250PyObject*
2251_PyUnicode_FromASCII(const char *buffer, Py_ssize_t size)
Victor Stinner702c7342011-10-05 13:50:52 +02002252{
Victor Stinnerd3f08822012-05-29 12:57:52 +02002253 const unsigned char *s = (const unsigned char *)buffer;
Victor Stinner785938e2011-12-11 20:09:03 +01002254 PyObject *unicode;
Victor Stinnere6b2d442011-12-11 21:54:30 +01002255 if (size == 1) {
Victor Stinner0617b6e2011-10-05 23:26:01 +02002256#ifdef Py_DEBUG
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002257 assert((unsigned char)s[0] < 128);
Victor Stinner0617b6e2011-10-05 23:26:01 +02002258#endif
Antoine Pitrou7c46da72011-10-06 22:07:51 +02002259 return get_latin1_char(s[0]);
Victor Stinnere6b2d442011-12-11 21:54:30 +01002260 }
Victor Stinner785938e2011-12-11 20:09:03 +01002261 unicode = PyUnicode_New(size, 127);
2262 if (!unicode)
Victor Stinner702c7342011-10-05 13:50:52 +02002263 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01002264 memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
2265 assert(_PyUnicode_CheckConsistency(unicode, 1));
2266 return unicode;
Victor Stinner702c7342011-10-05 13:50:52 +02002267}
2268
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002269static Py_UCS4
2270kind_maxchar_limit(unsigned int kind)
2271{
Benjamin Petersonead6b532011-12-20 17:23:42 -06002272 switch (kind) {
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002273 case PyUnicode_1BYTE_KIND:
2274 return 0x80;
2275 case PyUnicode_2BYTE_KIND:
2276 return 0x100;
2277 case PyUnicode_4BYTE_KIND:
2278 return 0x10000;
2279 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07002280 Py_UNREACHABLE();
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002281 }
2282}
2283
Victor Stinner702c7342011-10-05 13:50:52 +02002284static PyObject*
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002285_PyUnicode_FromUCS1(const Py_UCS1* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00002286{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002287 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002288 unsigned char max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002289
Serhiy Storchaka678db842013-01-26 12:16:36 +02002290 if (size == 0)
2291 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002292 assert(size > 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02002293 if (size == 1)
2294 return get_latin1_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002295
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002296 max_char = ucs1lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002297 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002298 if (!res)
2299 return NULL;
2300 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002301 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002302 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00002303}
2304
Victor Stinnere57b1c02011-09-28 22:20:48 +02002305static PyObject*
2306_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002307{
2308 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002309 Py_UCS2 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002310
Serhiy Storchaka678db842013-01-26 12:16:36 +02002311 if (size == 0)
2312 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002313 assert(size > 0);
Victor Stinner985a82a2014-01-03 12:53:47 +01002314 if (size == 1)
2315 return unicode_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002316
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002317 max_char = ucs2lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002318 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002319 if (!res)
2320 return NULL;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002321 if (max_char >= 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002322 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002323 else {
2324 _PyUnicode_CONVERT_BYTES(
2325 Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
2326 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002327 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002328 return res;
2329}
2330
Victor Stinnere57b1c02011-09-28 22:20:48 +02002331static PyObject*
2332_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002333{
2334 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002335 Py_UCS4 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002336
Serhiy Storchaka678db842013-01-26 12:16:36 +02002337 if (size == 0)
2338 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002339 assert(size > 0);
Victor Stinner985a82a2014-01-03 12:53:47 +01002340 if (size == 1)
2341 return unicode_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002342
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002343 max_char = ucs4lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002344 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002345 if (!res)
2346 return NULL;
Antoine Pitrou950468e2011-10-11 22:45:48 +02002347 if (max_char < 256)
2348 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
2349 PyUnicode_1BYTE_DATA(res));
2350 else if (max_char < 0x10000)
2351 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
2352 PyUnicode_2BYTE_DATA(res));
2353 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002354 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002355 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002356 return res;
2357}
2358
2359PyObject*
2360PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
2361{
Victor Stinnercfed46e2011-11-22 01:29:14 +01002362 if (size < 0) {
2363 PyErr_SetString(PyExc_ValueError, "size must be positive");
2364 return NULL;
2365 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002366 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002367 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002368 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002369 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002370 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002371 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002372 return _PyUnicode_FromUCS4(buffer, size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002373 default:
Victor Stinnerb9275c12011-10-05 14:01:42 +02002374 PyErr_SetString(PyExc_SystemError, "invalid kind");
2375 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002376 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002377}
2378
Victor Stinnerece58de2012-04-23 23:36:38 +02002379Py_UCS4
2380_PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end)
2381{
2382 enum PyUnicode_Kind kind;
2383 void *startptr, *endptr;
2384
2385 assert(PyUnicode_IS_READY(unicode));
2386 assert(0 <= start);
2387 assert(end <= PyUnicode_GET_LENGTH(unicode));
2388 assert(start <= end);
2389
2390 if (start == 0 && end == PyUnicode_GET_LENGTH(unicode))
2391 return PyUnicode_MAX_CHAR_VALUE(unicode);
2392
2393 if (start == end)
2394 return 127;
2395
Victor Stinner94d558b2012-04-27 22:26:58 +02002396 if (PyUnicode_IS_ASCII(unicode))
2397 return 127;
2398
Victor Stinnerece58de2012-04-23 23:36:38 +02002399 kind = PyUnicode_KIND(unicode);
Benjamin Petersonf3b7d862012-04-23 18:07:01 -04002400 startptr = PyUnicode_DATA(unicode);
Benjamin Petersonb9f4c9d2012-04-23 21:45:40 -04002401 endptr = (char *)startptr + end * kind;
2402 startptr = (char *)startptr + start * kind;
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002403 switch(kind) {
2404 case PyUnicode_1BYTE_KIND:
2405 return ucs1lib_find_max_char(startptr, endptr);
2406 case PyUnicode_2BYTE_KIND:
2407 return ucs2lib_find_max_char(startptr, endptr);
2408 case PyUnicode_4BYTE_KIND:
2409 return ucs4lib_find_max_char(startptr, endptr);
Victor Stinnerece58de2012-04-23 23:36:38 +02002410 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07002411 Py_UNREACHABLE();
Victor Stinnerece58de2012-04-23 23:36:38 +02002412 }
2413}
2414
Victor Stinner25a4b292011-10-06 12:31:55 +02002415/* Ensure that a string uses the most efficient storage, if it is not the
2416 case: create a new string with of the right kind. Write NULL into *p_unicode
2417 on error. */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02002418static void
Victor Stinner25a4b292011-10-06 12:31:55 +02002419unicode_adjust_maxchar(PyObject **p_unicode)
2420{
2421 PyObject *unicode, *copy;
2422 Py_UCS4 max_char;
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002423 Py_ssize_t len;
Victor Stinner25a4b292011-10-06 12:31:55 +02002424 unsigned int kind;
2425
2426 assert(p_unicode != NULL);
2427 unicode = *p_unicode;
2428 assert(PyUnicode_IS_READY(unicode));
2429 if (PyUnicode_IS_ASCII(unicode))
2430 return;
2431
2432 len = PyUnicode_GET_LENGTH(unicode);
2433 kind = PyUnicode_KIND(unicode);
2434 if (kind == PyUnicode_1BYTE_KIND) {
2435 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002436 max_char = ucs1lib_find_max_char(u, u + len);
2437 if (max_char >= 128)
2438 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002439 }
2440 else if (kind == PyUnicode_2BYTE_KIND) {
2441 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002442 max_char = ucs2lib_find_max_char(u, u + len);
2443 if (max_char >= 256)
2444 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002445 }
2446 else {
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002447 const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
Victor Stinner25a4b292011-10-06 12:31:55 +02002448 assert(kind == PyUnicode_4BYTE_KIND);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002449 max_char = ucs4lib_find_max_char(u, u + len);
2450 if (max_char >= 0x10000)
2451 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002452 }
Victor Stinner25a4b292011-10-06 12:31:55 +02002453 copy = PyUnicode_New(len, max_char);
Victor Stinnerca439ee2012-06-16 03:17:34 +02002454 if (copy != NULL)
2455 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, len);
Victor Stinner25a4b292011-10-06 12:31:55 +02002456 Py_DECREF(unicode);
2457 *p_unicode = copy;
2458}
2459
Victor Stinner034f6cf2011-09-30 02:26:44 +02002460PyObject*
Victor Stinnerbf6e5602011-12-12 01:53:47 +01002461_PyUnicode_Copy(PyObject *unicode)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002462{
Victor Stinner87af4f22011-11-21 23:03:47 +01002463 Py_ssize_t length;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002464 PyObject *copy;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002465
Victor Stinner034f6cf2011-09-30 02:26:44 +02002466 if (!PyUnicode_Check(unicode)) {
2467 PyErr_BadInternalCall();
2468 return NULL;
2469 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05002470 if (PyUnicode_READY(unicode) == -1)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002471 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002472
Victor Stinner87af4f22011-11-21 23:03:47 +01002473 length = PyUnicode_GET_LENGTH(unicode);
2474 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002475 if (!copy)
2476 return NULL;
2477 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
2478
Christian Heimesf051e432016-09-13 20:22:02 +02002479 memcpy(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
Victor Stinner87af4f22011-11-21 23:03:47 +01002480 length * PyUnicode_KIND(unicode));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002481 assert(_PyUnicode_CheckConsistency(copy, 1));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002482 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02002483}
2484
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002485
Victor Stinnerbc603d12011-10-02 01:00:40 +02002486/* Widen Unicode objects to larger buffers. Don't write terminating null
2487 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002488
2489void*
2490_PyUnicode_AsKind(PyObject *s, unsigned int kind)
2491{
Victor Stinnerbc603d12011-10-02 01:00:40 +02002492 Py_ssize_t len;
2493 void *result;
2494 unsigned int skind;
2495
Benjamin Petersonbac79492012-01-14 13:34:47 -05002496 if (PyUnicode_READY(s) == -1)
Victor Stinnerbc603d12011-10-02 01:00:40 +02002497 return NULL;
2498
2499 len = PyUnicode_GET_LENGTH(s);
2500 skind = PyUnicode_KIND(s);
2501 if (skind >= kind) {
Victor Stinner01698042011-10-04 00:04:26 +02002502 PyErr_SetString(PyExc_SystemError, "invalid widening attempt");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002503 return NULL;
2504 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002505 switch (kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02002506 case PyUnicode_2BYTE_KIND:
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002507 result = PyMem_New(Py_UCS2, len);
Victor Stinnerbc603d12011-10-02 01:00:40 +02002508 if (!result)
2509 return PyErr_NoMemory();
2510 assert(skind == PyUnicode_1BYTE_KIND);
2511 _PyUnicode_CONVERT_BYTES(
2512 Py_UCS1, Py_UCS2,
2513 PyUnicode_1BYTE_DATA(s),
2514 PyUnicode_1BYTE_DATA(s) + len,
2515 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002516 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002517 case PyUnicode_4BYTE_KIND:
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002518 result = PyMem_New(Py_UCS4, len);
Victor Stinnerbc603d12011-10-02 01:00:40 +02002519 if (!result)
2520 return PyErr_NoMemory();
2521 if (skind == PyUnicode_2BYTE_KIND) {
2522 _PyUnicode_CONVERT_BYTES(
2523 Py_UCS2, Py_UCS4,
2524 PyUnicode_2BYTE_DATA(s),
2525 PyUnicode_2BYTE_DATA(s) + len,
2526 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002527 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02002528 else {
2529 assert(skind == PyUnicode_1BYTE_KIND);
2530 _PyUnicode_CONVERT_BYTES(
2531 Py_UCS1, Py_UCS4,
2532 PyUnicode_1BYTE_DATA(s),
2533 PyUnicode_1BYTE_DATA(s) + len,
2534 result);
2535 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002536 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002537 default:
2538 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002539 }
Victor Stinner01698042011-10-04 00:04:26 +02002540 PyErr_SetString(PyExc_SystemError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002541 return NULL;
2542}
2543
2544static Py_UCS4*
2545as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2546 int copy_null)
2547{
2548 int kind;
2549 void *data;
2550 Py_ssize_t len, targetlen;
2551 if (PyUnicode_READY(string) == -1)
2552 return NULL;
2553 kind = PyUnicode_KIND(string);
2554 data = PyUnicode_DATA(string);
2555 len = PyUnicode_GET_LENGTH(string);
2556 targetlen = len;
2557 if (copy_null)
2558 targetlen++;
2559 if (!target) {
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002560 target = PyMem_New(Py_UCS4, targetlen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002561 if (!target) {
2562 PyErr_NoMemory();
2563 return NULL;
2564 }
2565 }
2566 else {
2567 if (targetsize < targetlen) {
2568 PyErr_Format(PyExc_SystemError,
2569 "string is longer than the buffer");
2570 if (copy_null && 0 < targetsize)
2571 target[0] = 0;
2572 return NULL;
2573 }
2574 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002575 if (kind == PyUnicode_1BYTE_KIND) {
2576 Py_UCS1 *start = (Py_UCS1 *) data;
2577 _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002578 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002579 else if (kind == PyUnicode_2BYTE_KIND) {
2580 Py_UCS2 *start = (Py_UCS2 *) data;
2581 _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2582 }
2583 else {
2584 assert(kind == PyUnicode_4BYTE_KIND);
Christian Heimesf051e432016-09-13 20:22:02 +02002585 memcpy(target, data, len * sizeof(Py_UCS4));
Antoine Pitrou950468e2011-10-11 22:45:48 +02002586 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002587 if (copy_null)
2588 target[len] = 0;
2589 return target;
2590}
2591
2592Py_UCS4*
2593PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2594 int copy_null)
2595{
Antoine Pitroude20b0b2011-11-10 21:47:38 +01002596 if (target == NULL || targetsize < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002597 PyErr_BadInternalCall();
2598 return NULL;
2599 }
2600 return as_ucs4(string, target, targetsize, copy_null);
2601}
2602
2603Py_UCS4*
2604PyUnicode_AsUCS4Copy(PyObject *string)
2605{
2606 return as_ucs4(string, NULL, 0, 1);
2607}
2608
Victor Stinner15a11362012-10-06 23:48:20 +02002609/* maximum number of characters required for output of %lld or %p.
Victor Stinnere215d962012-10-06 23:03:36 +02002610 We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
2611 plus 1 for the sign. 53/22 is an upper bound for log10(256). */
2612#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
Victor Stinner96865452011-03-01 23:44:09 +00002613
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002614static int
2615unicode_fromformat_write_str(_PyUnicodeWriter *writer, PyObject *str,
2616 Py_ssize_t width, Py_ssize_t precision)
2617{
2618 Py_ssize_t length, fill, arglen;
2619 Py_UCS4 maxchar;
2620
2621 if (PyUnicode_READY(str) == -1)
2622 return -1;
2623
2624 length = PyUnicode_GET_LENGTH(str);
2625 if ((precision == -1 || precision >= length)
2626 && width <= length)
2627 return _PyUnicodeWriter_WriteStr(writer, str);
2628
2629 if (precision != -1)
2630 length = Py_MIN(precision, length);
2631
2632 arglen = Py_MAX(length, width);
2633 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
2634 maxchar = _PyUnicode_FindMaxChar(str, 0, length);
2635 else
2636 maxchar = writer->maxchar;
2637
2638 if (_PyUnicodeWriter_Prepare(writer, arglen, maxchar) == -1)
2639 return -1;
2640
2641 if (width > length) {
2642 fill = width - length;
2643 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, ' ') == -1)
2644 return -1;
2645 writer->pos += fill;
2646 }
2647
2648 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
2649 str, 0, length);
2650 writer->pos += length;
2651 return 0;
2652}
2653
2654static int
Victor Stinner998b8062018-09-12 00:23:25 +02002655unicode_fromformat_write_cstr(_PyUnicodeWriter *writer, const char *str,
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002656 Py_ssize_t width, Py_ssize_t precision)
2657{
2658 /* UTF-8 */
2659 Py_ssize_t length;
2660 PyObject *unicode;
2661 int res;
2662
Serhiy Storchakad586ccb2019-01-12 10:30:35 +02002663 if (precision == -1) {
2664 length = strlen(str);
2665 }
2666 else {
2667 length = 0;
2668 while (length < precision && str[length]) {
2669 length++;
2670 }
2671 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002672 unicode = PyUnicode_DecodeUTF8Stateful(str, length, "replace", NULL);
2673 if (unicode == NULL)
2674 return -1;
2675
2676 res = unicode_fromformat_write_str(writer, unicode, width, -1);
2677 Py_DECREF(unicode);
2678 return res;
2679}
2680
Victor Stinner96865452011-03-01 23:44:09 +00002681static const char*
Victor Stinnere215d962012-10-06 23:03:36 +02002682unicode_fromformat_arg(_PyUnicodeWriter *writer,
2683 const char *f, va_list *vargs)
Victor Stinner96865452011-03-01 23:44:09 +00002684{
Victor Stinnere215d962012-10-06 23:03:36 +02002685 const char *p;
2686 Py_ssize_t len;
2687 int zeropad;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002688 Py_ssize_t width;
2689 Py_ssize_t precision;
Victor Stinnere215d962012-10-06 23:03:36 +02002690 int longflag;
2691 int longlongflag;
2692 int size_tflag;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002693 Py_ssize_t fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002694
2695 p = f;
2696 f++;
Victor Stinner4c63a972012-10-06 23:55:33 +02002697 zeropad = 0;
2698 if (*f == '0') {
2699 zeropad = 1;
2700 f++;
2701 }
Victor Stinner96865452011-03-01 23:44:09 +00002702
2703 /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002704 width = -1;
2705 if (Py_ISDIGIT((unsigned)*f)) {
2706 width = *f - '0';
Victor Stinner96865452011-03-01 23:44:09 +00002707 f++;
Victor Stinnere215d962012-10-06 23:03:36 +02002708 while (Py_ISDIGIT((unsigned)*f)) {
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002709 if (width > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
Victor Stinner3921e902012-10-06 23:05:00 +02002710 PyErr_SetString(PyExc_ValueError,
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002711 "width too big");
Victor Stinner3921e902012-10-06 23:05:00 +02002712 return NULL;
2713 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002714 width = (width * 10) + (*f - '0');
Victor Stinnere215d962012-10-06 23:03:36 +02002715 f++;
2716 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002717 }
2718 precision = -1;
2719 if (*f == '.') {
2720 f++;
2721 if (Py_ISDIGIT((unsigned)*f)) {
2722 precision = (*f - '0');
2723 f++;
2724 while (Py_ISDIGIT((unsigned)*f)) {
2725 if (precision > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
2726 PyErr_SetString(PyExc_ValueError,
2727 "precision too big");
2728 return NULL;
2729 }
2730 precision = (precision * 10) + (*f - '0');
2731 f++;
2732 }
2733 }
Victor Stinner96865452011-03-01 23:44:09 +00002734 if (*f == '%') {
2735 /* "%.3%s" => f points to "3" */
2736 f--;
2737 }
2738 }
2739 if (*f == '\0') {
Victor Stinnere215d962012-10-06 23:03:36 +02002740 /* bogus format "%.123" => go backward, f points to "3" */
Victor Stinner96865452011-03-01 23:44:09 +00002741 f--;
2742 }
Victor Stinner96865452011-03-01 23:44:09 +00002743
2744 /* Handle %ld, %lu, %lld and %llu. */
2745 longflag = 0;
2746 longlongflag = 0;
Victor Stinnere7faec12011-03-02 00:01:53 +00002747 size_tflag = 0;
Victor Stinner96865452011-03-01 23:44:09 +00002748 if (*f == 'l') {
Victor Stinner6d970f42011-03-02 00:04:25 +00002749 if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
Victor Stinner96865452011-03-01 23:44:09 +00002750 longflag = 1;
2751 ++f;
2752 }
Victor Stinner96865452011-03-01 23:44:09 +00002753 else if (f[1] == 'l' &&
Victor Stinner6d970f42011-03-02 00:04:25 +00002754 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002755 longlongflag = 1;
2756 f += 2;
2757 }
Victor Stinner96865452011-03-01 23:44:09 +00002758 }
2759 /* handle the size_t flag. */
Victor Stinner6d970f42011-03-02 00:04:25 +00002760 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002761 size_tflag = 1;
2762 ++f;
2763 }
Victor Stinnere215d962012-10-06 23:03:36 +02002764
2765 if (f[1] == '\0')
2766 writer->overallocate = 0;
2767
2768 switch (*f) {
2769 case 'c':
2770 {
2771 int ordinal = va_arg(*vargs, int);
Victor Stinnerff5a8482012-10-06 23:05:45 +02002772 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Serhiy Storchakac89533f2013-06-23 20:21:16 +03002773 PyErr_SetString(PyExc_OverflowError,
Victor Stinnerff5a8482012-10-06 23:05:45 +02002774 "character argument not in range(0x110000)");
2775 return NULL;
2776 }
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02002777 if (_PyUnicodeWriter_WriteCharInline(writer, ordinal) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002778 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002779 break;
2780 }
2781
2782 case 'i':
2783 case 'd':
2784 case 'u':
2785 case 'x':
2786 {
2787 /* used by sprintf */
Victor Stinner15a11362012-10-06 23:48:20 +02002788 char buffer[MAX_LONG_LONG_CHARS];
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002789 Py_ssize_t arglen;
Victor Stinnere215d962012-10-06 23:03:36 +02002790
2791 if (*f == 'u') {
Victor Stinnere215d962012-10-06 23:03:36 +02002792 if (longflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002793 len = sprintf(buffer, "%lu",
Victor Stinnere215d962012-10-06 23:03:36 +02002794 va_arg(*vargs, unsigned long));
Victor Stinnere215d962012-10-06 23:03:36 +02002795 else if (longlongflag)
Benjamin Peterson47ff0732016-09-08 09:15:54 -07002796 len = sprintf(buffer, "%llu",
Benjamin Petersonaf580df2016-09-06 10:46:49 -07002797 va_arg(*vargs, unsigned long long));
Victor Stinnere215d962012-10-06 23:03:36 +02002798 else if (size_tflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002799 len = sprintf(buffer, "%" PY_FORMAT_SIZE_T "u",
Victor Stinnere215d962012-10-06 23:03:36 +02002800 va_arg(*vargs, size_t));
2801 else
Victor Stinner3aa979e2014-11-18 21:40:51 +01002802 len = sprintf(buffer, "%u",
Victor Stinnere215d962012-10-06 23:03:36 +02002803 va_arg(*vargs, unsigned int));
2804 }
2805 else if (*f == 'x') {
Victor Stinner3aa979e2014-11-18 21:40:51 +01002806 len = sprintf(buffer, "%x", va_arg(*vargs, int));
Victor Stinnere215d962012-10-06 23:03:36 +02002807 }
2808 else {
Victor Stinnere215d962012-10-06 23:03:36 +02002809 if (longflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002810 len = sprintf(buffer, "%li",
Victor Stinnere215d962012-10-06 23:03:36 +02002811 va_arg(*vargs, long));
Victor Stinnere215d962012-10-06 23:03:36 +02002812 else if (longlongflag)
Benjamin Peterson47ff0732016-09-08 09:15:54 -07002813 len = sprintf(buffer, "%lli",
Benjamin Petersonaf580df2016-09-06 10:46:49 -07002814 va_arg(*vargs, long long));
Victor Stinnere215d962012-10-06 23:03:36 +02002815 else if (size_tflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002816 len = sprintf(buffer, "%" PY_FORMAT_SIZE_T "i",
Victor Stinnere215d962012-10-06 23:03:36 +02002817 va_arg(*vargs, Py_ssize_t));
2818 else
Victor Stinner3aa979e2014-11-18 21:40:51 +01002819 len = sprintf(buffer, "%i",
Victor Stinnere215d962012-10-06 23:03:36 +02002820 va_arg(*vargs, int));
2821 }
2822 assert(len >= 0);
2823
Victor Stinnere215d962012-10-06 23:03:36 +02002824 if (precision < len)
2825 precision = len;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002826
2827 arglen = Py_MAX(precision, width);
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002828 if (_PyUnicodeWriter_Prepare(writer, arglen, 127) == -1)
2829 return NULL;
2830
Victor Stinnere215d962012-10-06 23:03:36 +02002831 if (width > precision) {
2832 Py_UCS4 fillchar;
2833 fill = width - precision;
2834 fillchar = zeropad?'0':' ';
Victor Stinner15a11362012-10-06 23:48:20 +02002835 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, fillchar) == -1)
2836 return NULL;
2837 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002838 }
Victor Stinner15a11362012-10-06 23:48:20 +02002839 if (precision > len) {
Victor Stinnere215d962012-10-06 23:03:36 +02002840 fill = precision - len;
Victor Stinner15a11362012-10-06 23:48:20 +02002841 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, '0') == -1)
2842 return NULL;
2843 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002844 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002845
Victor Stinner4a587072013-11-19 12:54:53 +01002846 if (_PyUnicodeWriter_WriteASCIIString(writer, buffer, len) < 0)
2847 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002848 break;
2849 }
2850
2851 case 'p':
2852 {
2853 char number[MAX_LONG_LONG_CHARS];
2854
2855 len = sprintf(number, "%p", va_arg(*vargs, void*));
2856 assert(len >= 0);
2857
2858 /* %p is ill-defined: ensure leading 0x. */
2859 if (number[1] == 'X')
2860 number[1] = 'x';
2861 else if (number[1] != 'x') {
2862 memmove(number + 2, number,
2863 strlen(number) + 1);
2864 number[0] = '0';
2865 number[1] = 'x';
2866 len += 2;
2867 }
2868
Victor Stinner4a587072013-11-19 12:54:53 +01002869 if (_PyUnicodeWriter_WriteASCIIString(writer, number, len) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002870 return NULL;
2871 break;
2872 }
2873
2874 case 's':
2875 {
2876 /* UTF-8 */
2877 const char *s = va_arg(*vargs, const char*);
Victor Stinner998b8062018-09-12 00:23:25 +02002878 if (unicode_fromformat_write_cstr(writer, s, width, precision) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002879 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002880 break;
2881 }
2882
2883 case 'U':
2884 {
2885 PyObject *obj = va_arg(*vargs, PyObject *);
2886 assert(obj && _PyUnicode_CHECK(obj));
2887
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002888 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002889 return NULL;
2890 break;
2891 }
2892
2893 case 'V':
2894 {
2895 PyObject *obj = va_arg(*vargs, PyObject *);
2896 const char *str = va_arg(*vargs, const char *);
Victor Stinnere215d962012-10-06 23:03:36 +02002897 if (obj) {
2898 assert(_PyUnicode_CHECK(obj));
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002899 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002900 return NULL;
2901 }
2902 else {
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002903 assert(str != NULL);
Victor Stinner998b8062018-09-12 00:23:25 +02002904 if (unicode_fromformat_write_cstr(writer, str, width, precision) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002905 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002906 }
2907 break;
2908 }
2909
2910 case 'S':
2911 {
2912 PyObject *obj = va_arg(*vargs, PyObject *);
2913 PyObject *str;
2914 assert(obj);
2915 str = PyObject_Str(obj);
2916 if (!str)
2917 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002918 if (unicode_fromformat_write_str(writer, str, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002919 Py_DECREF(str);
2920 return NULL;
2921 }
2922 Py_DECREF(str);
2923 break;
2924 }
2925
2926 case 'R':
2927 {
2928 PyObject *obj = va_arg(*vargs, PyObject *);
2929 PyObject *repr;
2930 assert(obj);
2931 repr = PyObject_Repr(obj);
2932 if (!repr)
2933 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002934 if (unicode_fromformat_write_str(writer, repr, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002935 Py_DECREF(repr);
2936 return NULL;
2937 }
2938 Py_DECREF(repr);
2939 break;
2940 }
2941
2942 case 'A':
2943 {
2944 PyObject *obj = va_arg(*vargs, PyObject *);
2945 PyObject *ascii;
2946 assert(obj);
2947 ascii = PyObject_ASCII(obj);
2948 if (!ascii)
2949 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002950 if (unicode_fromformat_write_str(writer, ascii, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002951 Py_DECREF(ascii);
2952 return NULL;
2953 }
2954 Py_DECREF(ascii);
2955 break;
2956 }
2957
2958 case '%':
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02002959 if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002960 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002961 break;
2962
2963 default:
2964 /* if we stumble upon an unknown formatting code, copy the rest
2965 of the format string to the output string. (we cannot just
2966 skip the code, since there's no way to know what's in the
2967 argument list) */
2968 len = strlen(p);
Victor Stinner4a587072013-11-19 12:54:53 +01002969 if (_PyUnicodeWriter_WriteLatin1String(writer, p, len) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002970 return NULL;
2971 f = p+len;
2972 return f;
2973 }
2974
2975 f++;
Victor Stinner96865452011-03-01 23:44:09 +00002976 return f;
2977}
2978
Walter Dörwaldd2034312007-05-18 16:29:38 +00002979PyObject *
2980PyUnicode_FromFormatV(const char *format, va_list vargs)
2981{
Victor Stinnere215d962012-10-06 23:03:36 +02002982 va_list vargs2;
2983 const char *f;
2984 _PyUnicodeWriter writer;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002985
Victor Stinner8f674cc2013-04-17 23:02:17 +02002986 _PyUnicodeWriter_Init(&writer);
2987 writer.min_length = strlen(format) + 100;
2988 writer.overallocate = 1;
Victor Stinnere215d962012-10-06 23:03:36 +02002989
Benjamin Peterson0c212142016-09-20 20:39:33 -07002990 // Copy varags to be able to pass a reference to a subfunction.
2991 va_copy(vargs2, vargs);
Victor Stinnere215d962012-10-06 23:03:36 +02002992
2993 for (f = format; *f; ) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00002994 if (*f == '%') {
Victor Stinnere215d962012-10-06 23:03:36 +02002995 f = unicode_fromformat_arg(&writer, f, &vargs2);
2996 if (f == NULL)
2997 goto fail;
Victor Stinner1205f272010-09-11 00:54:47 +00002998 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002999 else {
Victor Stinnere215d962012-10-06 23:03:36 +02003000 const char *p;
3001 Py_ssize_t len;
Walter Dörwaldd2034312007-05-18 16:29:38 +00003002
Victor Stinnere215d962012-10-06 23:03:36 +02003003 p = f;
3004 do
3005 {
3006 if ((unsigned char)*p > 127) {
3007 PyErr_Format(PyExc_ValueError,
3008 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
3009 "string, got a non-ASCII byte: 0x%02x",
3010 (unsigned char)*p);
Victor Stinner1ddf53d2016-09-21 14:13:14 +02003011 goto fail;
Victor Stinnere215d962012-10-06 23:03:36 +02003012 }
3013 p++;
3014 }
3015 while (*p != '\0' && *p != '%');
3016 len = p - f;
3017
3018 if (*p == '\0')
3019 writer.overallocate = 0;
Victor Stinner4a587072013-11-19 12:54:53 +01003020
3021 if (_PyUnicodeWriter_WriteASCIIString(&writer, f, len) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02003022 goto fail;
Victor Stinnere215d962012-10-06 23:03:36 +02003023
3024 f = p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00003025 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00003026 }
Christian Heimes2f2fee12016-09-21 11:37:27 +02003027 va_end(vargs2);
Victor Stinnere215d962012-10-06 23:03:36 +02003028 return _PyUnicodeWriter_Finish(&writer);
3029
3030 fail:
Christian Heimes2f2fee12016-09-21 11:37:27 +02003031 va_end(vargs2);
Victor Stinnere215d962012-10-06 23:03:36 +02003032 _PyUnicodeWriter_Dealloc(&writer);
Benjamin Peterson14339b62009-01-31 16:36:08 +00003033 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00003034}
3035
Walter Dörwaldd2034312007-05-18 16:29:38 +00003036PyObject *
3037PyUnicode_FromFormat(const char *format, ...)
3038{
Benjamin Peterson14339b62009-01-31 16:36:08 +00003039 PyObject* ret;
3040 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00003041
3042#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00003043 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00003044#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00003045 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00003046#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00003047 ret = PyUnicode_FromFormatV(format, vargs);
3048 va_end(vargs);
3049 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00003050}
3051
Serhiy Storchakac46db922018-10-23 22:58:24 +03003052static Py_ssize_t
3053unicode_get_widechar_size(PyObject *unicode)
3054{
3055 Py_ssize_t res;
3056
3057 assert(unicode != NULL);
3058 assert(_PyUnicode_CHECK(unicode));
3059
3060 if (_PyUnicode_WSTR(unicode) != NULL) {
3061 return PyUnicode_WSTR_LENGTH(unicode);
3062 }
3063 assert(PyUnicode_IS_READY(unicode));
3064
3065 res = _PyUnicode_LENGTH(unicode);
3066#if SIZEOF_WCHAR_T == 2
3067 if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
3068 const Py_UCS4 *s = PyUnicode_4BYTE_DATA(unicode);
3069 const Py_UCS4 *end = s + res;
3070 for (; s < end; ++s) {
3071 if (*s > 0xFFFF) {
3072 ++res;
3073 }
3074 }
3075 }
3076#endif
3077 return res;
3078}
3079
3080static void
3081unicode_copy_as_widechar(PyObject *unicode, wchar_t *w, Py_ssize_t size)
3082{
3083 const wchar_t *wstr;
3084
3085 assert(unicode != NULL);
3086 assert(_PyUnicode_CHECK(unicode));
3087
3088 wstr = _PyUnicode_WSTR(unicode);
3089 if (wstr != NULL) {
3090 memcpy(w, wstr, size * sizeof(wchar_t));
3091 return;
3092 }
3093 assert(PyUnicode_IS_READY(unicode));
3094
3095 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
3096 const Py_UCS1 *s = PyUnicode_1BYTE_DATA(unicode);
3097 for (; size--; ++s, ++w) {
3098 *w = *s;
3099 }
3100 }
3101 else {
3102#if SIZEOF_WCHAR_T == 4
3103 assert(PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND);
3104 const Py_UCS2 *s = PyUnicode_2BYTE_DATA(unicode);
3105 for (; size--; ++s, ++w) {
3106 *w = *s;
3107 }
3108#else
3109 assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
3110 const Py_UCS4 *s = PyUnicode_4BYTE_DATA(unicode);
3111 for (; size--; ++s, ++w) {
3112 Py_UCS4 ch = *s;
3113 if (ch > 0xFFFF) {
3114 assert(ch <= MAX_UNICODE);
3115 /* encode surrogate pair in this case */
3116 *w++ = Py_UNICODE_HIGH_SURROGATE(ch);
3117 if (!size--)
3118 break;
3119 *w = Py_UNICODE_LOW_SURROGATE(ch);
3120 }
3121 else {
3122 *w = ch;
3123 }
3124 }
3125#endif
3126 }
3127}
3128
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003129#ifdef HAVE_WCHAR_H
3130
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003131/* Convert a Unicode object to a wide character string.
Victor Stinner5593d8a2010-10-02 11:11:27 +00003132
Victor Stinnerd88d9832011-09-06 02:00:05 +02003133 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00003134 character) required to convert the unicode object. Ignore size argument.
3135
Victor Stinnerd88d9832011-09-06 02:00:05 +02003136 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00003137 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02003138 the null character). */
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003139Py_ssize_t
3140PyUnicode_AsWideChar(PyObject *unicode,
3141 wchar_t *w,
3142 Py_ssize_t size)
Victor Stinner137c34c2010-09-29 10:25:54 +00003143{
Victor Stinner5593d8a2010-10-02 11:11:27 +00003144 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003145
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003146 if (unicode == NULL) {
3147 PyErr_BadInternalCall();
3148 return -1;
3149 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003150 if (!PyUnicode_Check(unicode)) {
3151 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003152 return -1;
Victor Stinner5593d8a2010-10-02 11:11:27 +00003153 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003154
3155 res = unicode_get_widechar_size(unicode);
3156 if (w == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003157 return res + 1;
Serhiy Storchakac46db922018-10-23 22:58:24 +03003158 }
3159
3160 if (size > res) {
3161 size = res + 1;
3162 }
3163 else {
3164 res = size;
3165 }
3166 unicode_copy_as_widechar(unicode, w, size);
3167 return res;
Victor Stinner137c34c2010-09-29 10:25:54 +00003168}
3169
Victor Stinner137c34c2010-09-29 10:25:54 +00003170wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00003171PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00003172 Py_ssize_t *size)
3173{
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003174 wchar_t *buffer;
Victor Stinner137c34c2010-09-29 10:25:54 +00003175 Py_ssize_t buflen;
3176
3177 if (unicode == NULL) {
3178 PyErr_BadInternalCall();
3179 return NULL;
3180 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003181 if (!PyUnicode_Check(unicode)) {
3182 PyErr_BadArgument();
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003183 return NULL;
3184 }
3185
Serhiy Storchakac46db922018-10-23 22:58:24 +03003186 buflen = unicode_get_widechar_size(unicode);
3187 buffer = (wchar_t *) PyMem_NEW(wchar_t, (buflen + 1));
Victor Stinner137c34c2010-09-29 10:25:54 +00003188 if (buffer == NULL) {
3189 PyErr_NoMemory();
3190 return NULL;
3191 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003192 unicode_copy_as_widechar(unicode, buffer, buflen + 1);
3193 if (size != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00003194 *size = buflen;
Serhiy Storchakac46db922018-10-23 22:58:24 +03003195 }
3196 else if (wcslen(buffer) != (size_t)buflen) {
3197 PyMem_FREE(buffer);
3198 PyErr_SetString(PyExc_ValueError,
3199 "embedded null character");
3200 return NULL;
3201 }
Victor Stinner137c34c2010-09-29 10:25:54 +00003202 return buffer;
3203}
3204
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003205#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003206
Alexander Belopolsky40018472011-02-26 01:02:56 +00003207PyObject *
3208PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003209{
Victor Stinner8faf8212011-12-08 22:14:11 +01003210 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003211 PyErr_SetString(PyExc_ValueError,
3212 "chr() arg not in range(0x110000)");
3213 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003214 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00003215
Victor Stinner985a82a2014-01-03 12:53:47 +01003216 return unicode_char((Py_UCS4)ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003217}
3218
Alexander Belopolsky40018472011-02-26 01:02:56 +00003219PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02003220PyUnicode_FromObject(PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003221{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003222 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00003223 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003224 if (PyUnicode_CheckExact(obj)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05003225 if (PyUnicode_READY(obj) == -1)
Victor Stinnerd3a83d52011-10-01 03:09:33 +02003226 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00003227 Py_INCREF(obj);
3228 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003229 }
3230 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003231 /* For a Unicode subtype that's not a Unicode object,
3232 return a true Unicode object with the same data. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +01003233 return _PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003234 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003235 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003236 "Can't convert '%.100s' object to str implicitly",
3237 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00003238 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003239}
3240
Alexander Belopolsky40018472011-02-26 01:02:56 +00003241PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02003242PyUnicode_FromEncodedObject(PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003243 const char *encoding,
3244 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003245{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003246 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003247 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00003248
Guido van Rossumd57fd912000-03-10 22:53:23 +00003249 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003250 PyErr_BadInternalCall();
3251 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003252 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003253
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003254 /* Decoding bytes objects is the most common case and should be fast */
3255 if (PyBytes_Check(obj)) {
Victor Stinner22eb6892019-06-26 00:51:05 +02003256 if (PyBytes_GET_SIZE(obj) == 0) {
3257 if (unicode_check_encoding_errors(encoding, errors) < 0) {
3258 return NULL;
3259 }
Serhiy Storchaka05997252013-01-26 12:14:02 +02003260 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner22eb6892019-06-26 00:51:05 +02003261 }
3262 return PyUnicode_Decode(
Serhiy Storchaka05997252013-01-26 12:14:02 +02003263 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
3264 encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003265 }
3266
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003267 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003268 PyErr_SetString(PyExc_TypeError,
3269 "decoding str is not supported");
3270 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00003271 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003272
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003273 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
3274 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
3275 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003276 "decoding to str: need a bytes-like object, %.80s found",
3277 Py_TYPE(obj)->tp_name);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003278 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00003279 }
Tim Petersced69f82003-09-16 20:30:58 +00003280
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003281 if (buffer.len == 0) {
Serhiy Storchaka05997252013-01-26 12:14:02 +02003282 PyBuffer_Release(&buffer);
Victor Stinner22eb6892019-06-26 00:51:05 +02003283 if (unicode_check_encoding_errors(encoding, errors) < 0) {
3284 return NULL;
3285 }
Serhiy Storchaka05997252013-01-26 12:14:02 +02003286 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +00003287 }
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00003288
Serhiy Storchaka05997252013-01-26 12:14:02 +02003289 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003290 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003291 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003292}
3293
/* Normalize an encoding name into `lower`, in the manner of
   encodings.normalize_encoding() but additionally lowercasing it.

   Alphanumeric characters and '.' are kept (lowercased); every run of other
   characters between two kept characters collapses into a single '_', and
   such runs at the very start or end of the name are dropped.

   Return 1 on success, or 0 if the result does not fit, i.e. it would be
   longer than lower_len-1 characters. */
int
_Py_normalize_encoding(const char *encoding,
                       char *lower,
                       size_t lower_len)
{
    const char *in;
    char *out = lower;
    char *const out_limit = &lower[lower_len - 1];
    int pending_sep = 0;

    assert(encoding != NULL);

    for (in = encoding; *in != 0; in++) {
        char c = *in;

        if (!(Py_ISALNUM(c) || c == '.')) {
            /* Remember the separator; it is only emitted if another kept
               character follows. */
            pending_sep = 1;
            continue;
        }

        if (pending_sep && out != lower) {
            if (out == out_limit) {
                return 0;
            }
            *out++ = '_';
        }
        pending_sep = 0;

        if (out == out_limit) {
            return 0;
        }
        *out++ = Py_TOLOWER(c);
    }

    *out = '\0';
    return 1;
}
3342
Alexander Belopolsky40018472011-02-26 01:02:56 +00003343PyObject *
3344PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003345 Py_ssize_t size,
3346 const char *encoding,
3347 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00003348{
3349 PyObject *buffer = NULL, *unicode;
3350 Py_buffer info;
Victor Stinner942889a2016-09-05 15:40:10 -07003351 char buflower[11]; /* strlen("iso-8859-1\0") == 11, longest shortcut */
3352
Victor Stinner22eb6892019-06-26 00:51:05 +02003353 if (unicode_check_encoding_errors(encoding, errors) < 0) {
3354 return NULL;
3355 }
3356
Victor Stinnered076ed2019-06-26 01:49:32 +02003357 if (size == 0) {
3358 _Py_RETURN_UNICODE_EMPTY();
3359 }
3360
Victor Stinner942889a2016-09-05 15:40:10 -07003361 if (encoding == NULL) {
3362 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3363 }
Victor Stinner600d3be2010-06-10 12:00:55 +00003364
Fred Drakee4315f52000-05-09 19:53:39 +00003365 /* Shortcuts for common default encodings */
Victor Stinner942889a2016-09-05 15:40:10 -07003366 if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3367 char *lower = buflower;
3368
3369 /* Fast paths */
3370 if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3371 lower += 3;
3372 if (*lower == '_') {
3373 /* Match "utf8" and "utf_8" */
3374 lower++;
3375 }
3376
3377 if (lower[0] == '8' && lower[1] == 0) {
3378 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3379 }
3380 else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3381 return PyUnicode_DecodeUTF16(s, size, errors, 0);
3382 }
3383 else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3384 return PyUnicode_DecodeUTF32(s, size, errors, 0);
3385 }
3386 }
3387 else {
3388 if (strcmp(lower, "ascii") == 0
3389 || strcmp(lower, "us_ascii") == 0) {
3390 return PyUnicode_DecodeASCII(s, size, errors);
3391 }
Steve Dowercc16be82016-09-08 10:35:16 -07003392 #ifdef MS_WINDOWS
Victor Stinner942889a2016-09-05 15:40:10 -07003393 else if (strcmp(lower, "mbcs") == 0) {
3394 return PyUnicode_DecodeMBCS(s, size, errors);
3395 }
3396 #endif
3397 else if (strcmp(lower, "latin1") == 0
3398 || strcmp(lower, "latin_1") == 0
3399 || strcmp(lower, "iso_8859_1") == 0
3400 || strcmp(lower, "iso8859_1") == 0) {
3401 return PyUnicode_DecodeLatin1(s, size, errors);
3402 }
3403 }
Victor Stinner37296e82010-06-10 13:36:23 +00003404 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003405
3406 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003407 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00003408 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003409 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00003410 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003411 if (buffer == NULL)
3412 goto onError;
Nick Coghlanc72e4e62013-11-22 22:39:36 +10003413 unicode = _PyCodec_DecodeText(buffer, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003414 if (unicode == NULL)
3415 goto onError;
3416 if (!PyUnicode_Check(unicode)) {
3417 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003418 "'%.400s' decoder returned '%.400s' instead of 'str'; "
Nick Coghlan8b097b42013-11-13 23:49:21 +10003419 "use codecs.decode() to decode to arbitrary types",
Victor Stinner998b8062018-09-12 00:23:25 +02003420 encoding,
3421 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003422 Py_DECREF(unicode);
3423 goto onError;
3424 }
3425 Py_DECREF(buffer);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003426 return unicode_result(unicode);
Tim Petersced69f82003-09-16 20:30:58 +00003427
Benjamin Peterson29060642009-01-31 22:14:21 +00003428 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003429 Py_XDECREF(buffer);
3430 return NULL;
3431}
3432
Alexander Belopolsky40018472011-02-26 01:02:56 +00003433PyObject *
3434PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003435 const char *encoding,
3436 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003437{
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003438 if (!PyUnicode_Check(unicode)) {
3439 PyErr_BadArgument();
Serhiy Storchaka77eede32016-10-25 10:07:51 +03003440 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003441 }
3442
Serhiy Storchaka00939072016-10-27 21:05:49 +03003443 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3444 "PyUnicode_AsDecodedObject() is deprecated; "
3445 "use PyCodec_Decode() to decode from str", 1) < 0)
3446 return NULL;
3447
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003448 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003449 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003450
3451 /* Decode via the codec registry */
Serhiy Storchaka77eede32016-10-25 10:07:51 +03003452 return PyCodec_Decode(unicode, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003453}
3454
Alexander Belopolsky40018472011-02-26 01:02:56 +00003455PyObject *
3456PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003457 const char *encoding,
3458 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003459{
3460 PyObject *v;
3461
3462 if (!PyUnicode_Check(unicode)) {
3463 PyErr_BadArgument();
3464 goto onError;
3465 }
3466
Serhiy Storchaka00939072016-10-27 21:05:49 +03003467 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3468 "PyUnicode_AsDecodedUnicode() is deprecated; "
3469 "use PyCodec_Decode() to decode from str to str", 1) < 0)
3470 return NULL;
3471
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003472 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003473 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003474
3475 /* Decode via the codec registry */
3476 v = PyCodec_Decode(unicode, encoding, errors);
3477 if (v == NULL)
3478 goto onError;
3479 if (!PyUnicode_Check(v)) {
3480 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003481 "'%.400s' decoder returned '%.400s' instead of 'str'; "
Nick Coghlan8b097b42013-11-13 23:49:21 +10003482 "use codecs.decode() to decode to arbitrary types",
Victor Stinner998b8062018-09-12 00:23:25 +02003483 encoding,
3484 Py_TYPE(unicode)->tp_name);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003485 Py_DECREF(v);
3486 goto onError;
3487 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003488 return unicode_result(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003489
Benjamin Peterson29060642009-01-31 22:14:21 +00003490 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003491 return NULL;
3492}
3493
Alexander Belopolsky40018472011-02-26 01:02:56 +00003494PyObject *
3495PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003496 Py_ssize_t size,
3497 const char *encoding,
3498 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003499{
3500 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00003501
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02003502 unicode = PyUnicode_FromWideChar(s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003503 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003504 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003505 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
3506 Py_DECREF(unicode);
3507 return v;
3508}
3509
Alexander Belopolsky40018472011-02-26 01:02:56 +00003510PyObject *
3511PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003512 const char *encoding,
3513 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003514{
3515 PyObject *v;
3516
3517 if (!PyUnicode_Check(unicode)) {
3518 PyErr_BadArgument();
3519 goto onError;
3520 }
3521
Serhiy Storchaka00939072016-10-27 21:05:49 +03003522 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3523 "PyUnicode_AsEncodedObject() is deprecated; "
3524 "use PyUnicode_AsEncodedString() to encode from str to bytes "
3525 "or PyCodec_Encode() for generic encoding", 1) < 0)
3526 return NULL;
3527
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003528 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003529 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003530
3531 /* Encode via the codec registry */
3532 v = PyCodec_Encode(unicode, encoding, errors);
3533 if (v == NULL)
3534 goto onError;
3535 return v;
3536
Benjamin Peterson29060642009-01-31 22:14:21 +00003537 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003538 return NULL;
3539}
3540
Victor Stinner1b579672011-12-17 05:47:23 +01003541
Victor Stinner2cba6b82018-01-10 22:46:15 +01003542static PyObject *
Victor Stinner709d23d2019-05-02 14:56:30 -04003543unicode_encode_locale(PyObject *unicode, _Py_error_handler error_handler,
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003544 int current_locale)
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003545{
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003546 Py_ssize_t wlen;
3547 wchar_t *wstr = PyUnicode_AsWideCharString(unicode, &wlen);
3548 if (wstr == NULL) {
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003549 return NULL;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003550 }
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003551
Victor Stinnerbde9d6b2018-11-28 10:26:20 +01003552 if ((size_t)wlen != wcslen(wstr)) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003553 PyErr_SetString(PyExc_ValueError, "embedded null character");
Victor Stinnerbde9d6b2018-11-28 10:26:20 +01003554 PyMem_Free(wstr);
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003555 return NULL;
3556 }
3557
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003558 char *str;
3559 size_t error_pos;
3560 const char *reason;
3561 int res = _Py_EncodeLocaleEx(wstr, &str, &error_pos, &reason,
Victor Stinner3d4226a2018-08-29 22:21:32 +02003562 current_locale, error_handler);
Victor Stinnerbde9d6b2018-11-28 10:26:20 +01003563 PyMem_Free(wstr);
3564
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003565 if (res != 0) {
3566 if (res == -2) {
3567 PyObject *exc;
3568 exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnns",
3569 "locale", unicode,
3570 (Py_ssize_t)error_pos,
3571 (Py_ssize_t)(error_pos+1),
3572 reason);
3573 if (exc != NULL) {
3574 PyCodec_StrictErrors(exc);
3575 Py_DECREF(exc);
3576 }
Victor Stinner2cba6b82018-01-10 22:46:15 +01003577 }
Victor Stinner3d4226a2018-08-29 22:21:32 +02003578 else if (res == -3) {
3579 PyErr_SetString(PyExc_ValueError, "unsupported error handler");
3580 }
Victor Stinner2cba6b82018-01-10 22:46:15 +01003581 else {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003582 PyErr_NoMemory();
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003583 }
Victor Stinnerbde9d6b2018-11-28 10:26:20 +01003584 return NULL;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003585 }
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003586
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003587 PyObject *bytes = PyBytes_FromString(str);
3588 PyMem_RawFree(str);
3589 return bytes;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003590}
3591
Victor Stinnerad158722010-10-27 00:25:46 +00003592PyObject *
Victor Stinner2cba6b82018-01-10 22:46:15 +01003593PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
3594{
Victor Stinner709d23d2019-05-02 14:56:30 -04003595 _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
3596 return unicode_encode_locale(unicode, error_handler, 1);
Victor Stinner2cba6b82018-01-10 22:46:15 +01003597}
3598
3599PyObject *
Victor Stinnerad158722010-10-27 00:25:46 +00003600PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00003601{
Victor Stinnercaba55b2018-08-03 15:33:52 +02003602 PyInterpreterState *interp = _PyInterpreterState_GET_UNSAFE();
Victor Stinnere2510952019-05-02 11:28:57 -04003603#ifdef _Py_FORCE_UTF8_FS_ENCODING
Victor Stinner709d23d2019-05-02 14:56:30 -04003604 if (interp->fs_codec.encoding) {
3605 return unicode_encode_utf8(unicode,
3606 interp->fs_codec.error_handler,
3607 interp->fs_codec.errors);
3608 }
3609 else {
Victor Stinner331a6a52019-05-27 16:39:22 +02003610 const wchar_t *filesystem_errors = interp->config.filesystem_errors;
Victor Stinner709d23d2019-05-02 14:56:30 -04003611 _Py_error_handler errors;
Victor Stinner331a6a52019-05-27 16:39:22 +02003612 errors = get_error_handler_wide(filesystem_errors);
Victor Stinner709d23d2019-05-02 14:56:30 -04003613 assert(errors != _Py_ERROR_UNKNOWN);
3614 return unicode_encode_utf8(unicode, errors, NULL);
3615 }
Victor Stinnerb2457ef2018-08-29 13:25:36 +02003616#else
Victor Stinner793b5312011-04-27 00:24:21 +02003617 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3618 cannot use it to encode and decode filenames before it is loaded. Load
3619 the Python codec requires to encode at least its own filename. Use the C
Victor Stinnerb2457ef2018-08-29 13:25:36 +02003620 implementation of the locale codec until the codec registry is
Victor Stinner22eb6892019-06-26 00:51:05 +02003621 initialized and the Python codec is loaded.
3622 See _PyUnicode_InitEncodings(). */
Victor Stinner709d23d2019-05-02 14:56:30 -04003623 if (interp->fs_codec.encoding) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00003624 return PyUnicode_AsEncodedString(unicode,
Victor Stinner709d23d2019-05-02 14:56:30 -04003625 interp->fs_codec.encoding,
3626 interp->fs_codec.errors);
Victor Stinnerc39211f2010-09-29 16:35:47 +00003627 }
3628 else {
Victor Stinner331a6a52019-05-27 16:39:22 +02003629 const wchar_t *filesystem_errors = interp->config.filesystem_errors;
Victor Stinner709d23d2019-05-02 14:56:30 -04003630 _Py_error_handler errors;
Victor Stinner331a6a52019-05-27 16:39:22 +02003631 errors = get_error_handler_wide(filesystem_errors);
Victor Stinner709d23d2019-05-02 14:56:30 -04003632 assert(errors != _Py_ERROR_UNKNOWN);
3633 return unicode_encode_locale(unicode, errors, 0);
Victor Stinnerc39211f2010-09-29 16:35:47 +00003634 }
Victor Stinnerad158722010-10-27 00:25:46 +00003635#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00003636}
3637
Alexander Belopolsky40018472011-02-26 01:02:56 +00003638PyObject *
3639PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003640 const char *encoding,
3641 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003642{
3643 PyObject *v;
Victor Stinner942889a2016-09-05 15:40:10 -07003644 char buflower[11]; /* strlen("iso_8859_1\0") == 11, longest shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00003645
Guido van Rossumd57fd912000-03-10 22:53:23 +00003646 if (!PyUnicode_Check(unicode)) {
3647 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003648 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003649 }
Fred Drakee4315f52000-05-09 19:53:39 +00003650
Victor Stinner22eb6892019-06-26 00:51:05 +02003651 if (unicode_check_encoding_errors(encoding, errors) < 0) {
3652 return NULL;
3653 }
3654
Victor Stinner942889a2016-09-05 15:40:10 -07003655 if (encoding == NULL) {
3656 return _PyUnicode_AsUTF8String(unicode, errors);
3657 }
3658
Fred Drakee4315f52000-05-09 19:53:39 +00003659 /* Shortcuts for common default encodings */
Victor Stinner942889a2016-09-05 15:40:10 -07003660 if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3661 char *lower = buflower;
3662
3663 /* Fast paths */
3664 if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3665 lower += 3;
3666 if (*lower == '_') {
3667 /* Match "utf8" and "utf_8" */
3668 lower++;
3669 }
3670
3671 if (lower[0] == '8' && lower[1] == 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003672 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinner942889a2016-09-05 15:40:10 -07003673 }
3674 else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3675 return _PyUnicode_EncodeUTF16(unicode, errors, 0);
3676 }
3677 else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3678 return _PyUnicode_EncodeUTF32(unicode, errors, 0);
3679 }
Victor Stinnera5c68c32011-03-02 01:03:14 +00003680 }
Victor Stinner942889a2016-09-05 15:40:10 -07003681 else {
3682 if (strcmp(lower, "ascii") == 0
3683 || strcmp(lower, "us_ascii") == 0) {
3684 return _PyUnicode_AsASCIIString(unicode, errors);
3685 }
Steve Dowercc16be82016-09-08 10:35:16 -07003686#ifdef MS_WINDOWS
Victor Stinner942889a2016-09-05 15:40:10 -07003687 else if (strcmp(lower, "mbcs") == 0) {
3688 return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
3689 }
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003690#endif
Victor Stinner942889a2016-09-05 15:40:10 -07003691 else if (strcmp(lower, "latin1") == 0 ||
3692 strcmp(lower, "latin_1") == 0 ||
3693 strcmp(lower, "iso_8859_1") == 0 ||
3694 strcmp(lower, "iso8859_1") == 0) {
3695 return _PyUnicode_AsLatin1String(unicode, errors);
3696 }
3697 }
Victor Stinner37296e82010-06-10 13:36:23 +00003698 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003699
3700 /* Encode via the codec registry */
Nick Coghlanc72e4e62013-11-22 22:39:36 +10003701 v = _PyCodec_EncodeText(unicode, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003702 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003703 return NULL;
3704
3705 /* The normal path */
3706 if (PyBytes_Check(v))
3707 return v;
3708
3709 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003710 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003711 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003712 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003713
3714 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003715 "encoder %s returned bytearray instead of bytes; "
3716 "use codecs.encode() to encode to arbitrary types",
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003717 encoding);
3718 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003719 Py_DECREF(v);
3720 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003721 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003722
Serhiy Storchakafff9a312017-03-21 08:53:25 +02003723 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v),
3724 PyByteArray_GET_SIZE(v));
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003725 Py_DECREF(v);
3726 return b;
3727 }
3728
3729 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003730 "'%.400s' encoder returned '%.400s' instead of 'bytes'; "
Nick Coghlan8b097b42013-11-13 23:49:21 +10003731 "use codecs.encode() to encode to arbitrary types",
Victor Stinner998b8062018-09-12 00:23:25 +02003732 encoding,
3733 Py_TYPE(v)->tp_name);
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003734 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003735 return NULL;
3736}
3737
Alexander Belopolsky40018472011-02-26 01:02:56 +00003738PyObject *
3739PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003740 const char *encoding,
3741 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003742{
3743 PyObject *v;
3744
3745 if (!PyUnicode_Check(unicode)) {
3746 PyErr_BadArgument();
3747 goto onError;
3748 }
3749
Serhiy Storchaka00939072016-10-27 21:05:49 +03003750 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3751 "PyUnicode_AsEncodedUnicode() is deprecated; "
3752 "use PyCodec_Encode() to encode from str to str", 1) < 0)
3753 return NULL;
3754
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003755 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003756 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003757
3758 /* Encode via the codec registry */
3759 v = PyCodec_Encode(unicode, encoding, errors);
3760 if (v == NULL)
3761 goto onError;
3762 if (!PyUnicode_Check(v)) {
3763 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003764 "'%.400s' encoder returned '%.400s' instead of 'str'; "
Nick Coghlan8b097b42013-11-13 23:49:21 +10003765 "use codecs.encode() to encode to arbitrary types",
Victor Stinner998b8062018-09-12 00:23:25 +02003766 encoding,
3767 Py_TYPE(v)->tp_name);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003768 Py_DECREF(v);
3769 goto onError;
3770 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003771 return v;
Tim Petersced69f82003-09-16 20:30:58 +00003772
Benjamin Peterson29060642009-01-31 22:14:21 +00003773 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003774 return NULL;
3775}
3776
Victor Stinner2cba6b82018-01-10 22:46:15 +01003777static PyObject*
Victor Stinner709d23d2019-05-02 14:56:30 -04003778unicode_decode_locale(const char *str, Py_ssize_t len,
3779 _Py_error_handler errors, int current_locale)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003780{
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003781 if (str[len] != '\0' || (size_t)len != strlen(str)) {
3782 PyErr_SetString(PyExc_ValueError, "embedded null byte");
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003783 return NULL;
3784 }
3785
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003786 wchar_t *wstr;
3787 size_t wlen;
3788 const char *reason;
3789 int res = _Py_DecodeLocaleEx(str, &wstr, &wlen, &reason,
Victor Stinner709d23d2019-05-02 14:56:30 -04003790 current_locale, errors);
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003791 if (res != 0) {
3792 if (res == -2) {
3793 PyObject *exc;
3794 exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nns",
3795 "locale", str, len,
3796 (Py_ssize_t)wlen,
3797 (Py_ssize_t)(wlen + 1),
3798 reason);
3799 if (exc != NULL) {
3800 PyCodec_StrictErrors(exc);
3801 Py_DECREF(exc);
3802 }
Victor Stinner2cba6b82018-01-10 22:46:15 +01003803 }
Victor Stinner3d4226a2018-08-29 22:21:32 +02003804 else if (res == -3) {
3805 PyErr_SetString(PyExc_ValueError, "unsupported error handler");
3806 }
Victor Stinner2cba6b82018-01-10 22:46:15 +01003807 else {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003808 PyErr_NoMemory();
Victor Stinner2cba6b82018-01-10 22:46:15 +01003809 }
Victor Stinner2f197072011-12-17 07:08:30 +01003810 return NULL;
Victor Stinner2f197072011-12-17 07:08:30 +01003811 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003812
3813 PyObject *unicode = PyUnicode_FromWideChar(wstr, wlen);
3814 PyMem_RawFree(wstr);
3815 return unicode;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003816}
3817
3818PyObject*
Victor Stinner2cba6b82018-01-10 22:46:15 +01003819PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
3820 const char *errors)
3821{
Victor Stinner709d23d2019-05-02 14:56:30 -04003822 _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
3823 return unicode_decode_locale(str, len, error_handler, 1);
Victor Stinner2cba6b82018-01-10 22:46:15 +01003824}
3825
3826PyObject*
Victor Stinner1b579672011-12-17 05:47:23 +01003827PyUnicode_DecodeLocale(const char *str, const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003828{
3829 Py_ssize_t size = (Py_ssize_t)strlen(str);
Victor Stinner709d23d2019-05-02 14:56:30 -04003830 _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
3831 return unicode_decode_locale(str, size, error_handler, 1);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003832}
3833
3834
3835PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00003836PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003837 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00003838 return PyUnicode_DecodeFSDefaultAndSize(s, size);
3839}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003840
Christian Heimes5894ba72007-11-04 11:43:14 +00003841PyObject*
3842PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3843{
Victor Stinnercaba55b2018-08-03 15:33:52 +02003844 PyInterpreterState *interp = _PyInterpreterState_GET_UNSAFE();
Victor Stinnere2510952019-05-02 11:28:57 -04003845#ifdef _Py_FORCE_UTF8_FS_ENCODING
Victor Stinner709d23d2019-05-02 14:56:30 -04003846 if (interp->fs_codec.encoding) {
3847 return unicode_decode_utf8(s, size,
3848 interp->fs_codec.error_handler,
3849 interp->fs_codec.errors,
3850 NULL);
3851 }
3852 else {
Victor Stinner331a6a52019-05-27 16:39:22 +02003853 const wchar_t *filesystem_errors = interp->config.filesystem_errors;
Victor Stinner709d23d2019-05-02 14:56:30 -04003854 _Py_error_handler errors;
Victor Stinner331a6a52019-05-27 16:39:22 +02003855 errors = get_error_handler_wide(filesystem_errors);
Victor Stinner709d23d2019-05-02 14:56:30 -04003856 assert(errors != _Py_ERROR_UNKNOWN);
3857 return unicode_decode_utf8(s, size, errors, NULL, NULL);
3858 }
Victor Stinnerb2457ef2018-08-29 13:25:36 +02003859#else
Victor Stinner793b5312011-04-27 00:24:21 +02003860 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3861 cannot use it to encode and decode filenames before it is loaded. Load
3862 the Python codec requires to encode at least its own filename. Use the C
Victor Stinnerb2457ef2018-08-29 13:25:36 +02003863 implementation of the locale codec until the codec registry is
Victor Stinner22eb6892019-06-26 00:51:05 +02003864 initialized and the Python codec is loaded.
3865 See _PyUnicode_InitEncodings(). */
Victor Stinner709d23d2019-05-02 14:56:30 -04003866 if (interp->fs_codec.encoding) {
Steve Dower78057b42016-11-06 19:35:08 -08003867 return PyUnicode_Decode(s, size,
Victor Stinner709d23d2019-05-02 14:56:30 -04003868 interp->fs_codec.encoding,
3869 interp->fs_codec.errors);
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003870 }
3871 else {
Victor Stinner331a6a52019-05-27 16:39:22 +02003872 const wchar_t *filesystem_errors = interp->config.filesystem_errors;
Victor Stinner709d23d2019-05-02 14:56:30 -04003873 _Py_error_handler errors;
Victor Stinner331a6a52019-05-27 16:39:22 +02003874 errors = get_error_handler_wide(filesystem_errors);
Victor Stinner709d23d2019-05-02 14:56:30 -04003875 return unicode_decode_locale(s, size, errors, 0);
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003876 }
Victor Stinnerad158722010-10-27 00:25:46 +00003877#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003878}
3879
Martin v. Löwis011e8422009-05-05 04:43:17 +00003880
3881int
3882PyUnicode_FSConverter(PyObject* arg, void* addr)
3883{
Brett Cannonec6ce872016-09-06 15:50:29 -07003884 PyObject *path = NULL;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003885 PyObject *output = NULL;
3886 Py_ssize_t size;
3887 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003888 if (arg == NULL) {
3889 Py_DECREF(*(PyObject**)addr);
Benjamin Petersona4d33b32015-11-15 21:57:39 -08003890 *(PyObject**)addr = NULL;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003891 return 1;
3892 }
Brett Cannonec6ce872016-09-06 15:50:29 -07003893 path = PyOS_FSPath(arg);
3894 if (path == NULL) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03003895 return 0;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003896 }
Brett Cannonec6ce872016-09-06 15:50:29 -07003897 if (PyBytes_Check(path)) {
3898 output = path;
3899 }
3900 else { // PyOS_FSPath() guarantees its returned value is bytes or str.
3901 output = PyUnicode_EncodeFSDefault(path);
3902 Py_DECREF(path);
3903 if (!output) {
3904 return 0;
3905 }
3906 assert(PyBytes_Check(output));
3907 }
3908
Victor Stinner0ea2a462010-04-30 00:22:08 +00003909 size = PyBytes_GET_SIZE(output);
3910 data = PyBytes_AS_STRING(output);
Victor Stinner12174a52014-08-15 23:17:38 +02003911 if ((size_t)size != strlen(data)) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003912 PyErr_SetString(PyExc_ValueError, "embedded null byte");
Martin v. Löwis011e8422009-05-05 04:43:17 +00003913 Py_DECREF(output);
3914 return 0;
3915 }
3916 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003917 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003918}
3919
3920
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003921int
3922PyUnicode_FSDecoder(PyObject* arg, void* addr)
3923{
Brett Cannona5711202016-09-06 19:36:01 -07003924 int is_buffer = 0;
3925 PyObject *path = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003926 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003927 if (arg == NULL) {
3928 Py_DECREF(*(PyObject**)addr);
Serhiy Storchaka40db90c2017-04-20 21:19:31 +03003929 *(PyObject**)addr = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003930 return 1;
3931 }
Brett Cannona5711202016-09-06 19:36:01 -07003932
3933 is_buffer = PyObject_CheckBuffer(arg);
3934 if (!is_buffer) {
3935 path = PyOS_FSPath(arg);
3936 if (path == NULL) {
Serhiy Storchakafebc3322016-08-06 23:29:29 +03003937 return 0;
3938 }
Brett Cannona5711202016-09-06 19:36:01 -07003939 }
3940 else {
3941 path = arg;
3942 Py_INCREF(arg);
3943 }
3944
3945 if (PyUnicode_Check(path)) {
Brett Cannona5711202016-09-06 19:36:01 -07003946 output = path;
3947 }
3948 else if (PyBytes_Check(path) || is_buffer) {
3949 PyObject *path_bytes = NULL;
3950
3951 if (!PyBytes_Check(path) &&
3952 PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
Victor Stinner998b8062018-09-12 00:23:25 +02003953 "path should be string, bytes, or os.PathLike, not %.200s",
3954 Py_TYPE(arg)->tp_name)) {
3955 Py_DECREF(path);
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003956 return 0;
Brett Cannona5711202016-09-06 19:36:01 -07003957 }
3958 path_bytes = PyBytes_FromObject(path);
3959 Py_DECREF(path);
3960 if (!path_bytes) {
3961 return 0;
3962 }
3963 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(path_bytes),
3964 PyBytes_GET_SIZE(path_bytes));
3965 Py_DECREF(path_bytes);
3966 if (!output) {
3967 return 0;
3968 }
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003969 }
Serhiy Storchaka9305d832016-06-18 13:53:36 +03003970 else {
3971 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003972 "path should be string, bytes, or os.PathLike, not %.200s",
3973 Py_TYPE(arg)->tp_name);
Brett Cannona5711202016-09-06 19:36:01 -07003974 Py_DECREF(path);
Serhiy Storchaka9305d832016-06-18 13:53:36 +03003975 return 0;
3976 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05003977 if (PyUnicode_READY(output) == -1) {
Victor Stinner065836e2011-10-27 01:56:33 +02003978 Py_DECREF(output);
3979 return 0;
3980 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003981 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02003982 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003983 PyErr_SetString(PyExc_ValueError, "embedded null character");
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003984 Py_DECREF(output);
3985 return 0;
3986 }
3987 *(PyObject**)addr = output;
3988 return Py_CLEANUP_SUPPORTED;
3989}
3990
3991
Serhiy Storchaka2a404b62017-01-22 23:07:07 +02003992const char *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003993PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003994{
Christian Heimesf3863112007-11-22 07:46:41 +00003995 PyObject *bytes;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003996
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00003997 if (!PyUnicode_Check(unicode)) {
3998 PyErr_BadArgument();
3999 return NULL;
4000 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004001 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00004002 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004003
Victor Stinnere90fe6a2011-10-01 16:48:13 +02004004 if (PyUnicode_UTF8(unicode) == NULL) {
4005 assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03004006 bytes = _PyUnicode_AsUTF8String(unicode, NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004007 if (bytes == NULL)
4008 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004009 _PyUnicode_UTF8(unicode) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
4010 if (_PyUnicode_UTF8(unicode) == NULL) {
Victor Stinnera5afb582013-10-29 01:28:23 +01004011 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004012 Py_DECREF(bytes);
4013 return NULL;
4014 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004015 _PyUnicode_UTF8_LENGTH(unicode) = PyBytes_GET_SIZE(bytes);
Christian Heimesf051e432016-09-13 20:22:02 +02004016 memcpy(_PyUnicode_UTF8(unicode),
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004017 PyBytes_AS_STRING(bytes),
4018 _PyUnicode_UTF8_LENGTH(unicode) + 1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004019 Py_DECREF(bytes);
4020 }
4021
4022 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02004023 *psize = PyUnicode_UTF8_LENGTH(unicode);
4024 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00004025}
4026
Serhiy Storchaka2a404b62017-01-22 23:07:07 +02004027const char *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004028PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00004029{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004030 return PyUnicode_AsUTF8AndSize(unicode, NULL);
4031}
4032
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004033Py_UNICODE *
4034PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
4035{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004036 if (!PyUnicode_Check(unicode)) {
4037 PyErr_BadArgument();
4038 return NULL;
4039 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03004040 Py_UNICODE *w = _PyUnicode_WSTR(unicode);
4041 if (w == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004042 /* Non-ASCII compact unicode object */
Serhiy Storchakac46db922018-10-23 22:58:24 +03004043 assert(_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND);
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004044 assert(PyUnicode_IS_READY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004045
Serhiy Storchakac46db922018-10-23 22:58:24 +03004046 Py_ssize_t wlen = unicode_get_widechar_size(unicode);
4047 if ((size_t)wlen > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
4048 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004049 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004050 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03004051 w = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) * (wlen + 1));
4052 if (w == NULL) {
4053 PyErr_NoMemory();
4054 return NULL;
4055 }
4056 unicode_copy_as_widechar(unicode, w, wlen + 1);
4057 _PyUnicode_WSTR(unicode) = w;
4058 if (!PyUnicode_IS_COMPACT_ASCII(unicode)) {
4059 _PyUnicode_WSTR_LENGTH(unicode) = wlen;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004060 }
4061 }
4062 if (size != NULL)
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004063 *size = PyUnicode_WSTR_LENGTH(unicode);
Serhiy Storchakac46db922018-10-23 22:58:24 +03004064 return w;
Martin v. Löwis5b222132007-06-10 09:51:05 +00004065}
4066
Alexander Belopolsky40018472011-02-26 01:02:56 +00004067Py_UNICODE *
4068PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004069{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004070 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004071}
4072
Serhiy Storchakaf7eae0a2017-06-28 08:30:06 +03004073const Py_UNICODE *
4074_PyUnicode_AsUnicode(PyObject *unicode)
4075{
4076 Py_ssize_t size;
4077 const Py_UNICODE *wstr;
4078
4079 wstr = PyUnicode_AsUnicodeAndSize(unicode, &size);
4080 if (wstr && wcslen(wstr) != (size_t)size) {
4081 PyErr_SetString(PyExc_ValueError, "embedded null character");
4082 return NULL;
4083 }
4084 return wstr;
4085}
4086
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004087
Alexander Belopolsky40018472011-02-26 01:02:56 +00004088Py_ssize_t
4089PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004090{
4091 if (!PyUnicode_Check(unicode)) {
4092 PyErr_BadArgument();
4093 goto onError;
4094 }
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02004095 if (_PyUnicode_WSTR(unicode) == NULL) {
4096 if (PyUnicode_AsUnicode(unicode) == NULL)
4097 goto onError;
4098 }
4099 return PyUnicode_WSTR_LENGTH(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004100
Benjamin Peterson29060642009-01-31 22:14:21 +00004101 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004102 return -1;
4103}
4104
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004105Py_ssize_t
4106PyUnicode_GetLength(PyObject *unicode)
4107{
Victor Stinner07621332012-06-16 04:53:46 +02004108 if (!PyUnicode_Check(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004109 PyErr_BadArgument();
4110 return -1;
4111 }
Victor Stinner07621332012-06-16 04:53:46 +02004112 if (PyUnicode_READY(unicode) == -1)
4113 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004114 return PyUnicode_GET_LENGTH(unicode);
4115}
4116
4117Py_UCS4
4118PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
4119{
Victor Stinner69ed0f42013-04-09 21:48:24 +02004120 void *data;
4121 int kind;
4122
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +03004123 if (!PyUnicode_Check(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02004124 PyErr_BadArgument();
4125 return (Py_UCS4)-1;
4126 }
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +03004127 if (PyUnicode_READY(unicode) == -1) {
4128 return (Py_UCS4)-1;
4129 }
Victor Stinnerc4b49542011-12-11 22:44:26 +01004130 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02004131 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004132 return (Py_UCS4)-1;
4133 }
Victor Stinner69ed0f42013-04-09 21:48:24 +02004134 data = PyUnicode_DATA(unicode);
4135 kind = PyUnicode_KIND(unicode);
4136 return PyUnicode_READ(kind, data, index);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004137}
4138
4139int
4140PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
4141{
4142 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02004143 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004144 return -1;
4145 }
Victor Stinner488fa492011-12-12 00:01:39 +01004146 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerc4b49542011-12-11 22:44:26 +01004147 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02004148 PyErr_SetString(PyExc_IndexError, "string index out of range");
4149 return -1;
4150 }
Victor Stinner488fa492011-12-12 00:01:39 +01004151 if (unicode_check_modifiable(unicode))
Victor Stinnercd9950f2011-10-02 00:34:53 +02004152 return -1;
Victor Stinnerc9590ad2012-03-04 01:34:37 +01004153 if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
4154 PyErr_SetString(PyExc_ValueError, "character out of range");
4155 return -1;
4156 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004157 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
4158 index, ch);
4159 return 0;
4160}
4161
/* Return the name of the default encoding as a NUL-terminated C string.
   The returned pointer refers to static storage; the value is always
   "utf-8". */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
4167
Victor Stinner554f3f02010-06-16 23:33:54 +00004168/* create or adjust a UnicodeDecodeError */
4169static void
4170make_decode_exception(PyObject **exceptionObject,
4171 const char *encoding,
4172 const char *input, Py_ssize_t length,
4173 Py_ssize_t startpos, Py_ssize_t endpos,
4174 const char *reason)
4175{
4176 if (*exceptionObject == NULL) {
4177 *exceptionObject = PyUnicodeDecodeError_Create(
4178 encoding, input, length, startpos, endpos, reason);
4179 }
4180 else {
4181 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
4182 goto onError;
4183 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
4184 goto onError;
4185 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
4186 goto onError;
4187 }
4188 return;
4189
4190onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02004191 Py_CLEAR(*exceptionObject);
Victor Stinner554f3f02010-06-16 23:33:54 +00004192}
4193
Steve Dowercc16be82016-09-08 10:35:16 -07004194#ifdef MS_WINDOWS
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004195static int
4196widechar_resize(wchar_t **buf, Py_ssize_t *size, Py_ssize_t newsize)
4197{
4198 if (newsize > *size) {
4199 wchar_t *newbuf = *buf;
4200 if (PyMem_Resize(newbuf, wchar_t, newsize) == NULL) {
4201 PyErr_NoMemory();
4202 return -1;
4203 }
4204 *buf = newbuf;
4205 }
4206 *size = newsize;
4207 return 0;
4208}
4209
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004210/* error handling callback helper:
4211 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00004212 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004213 and adjust various state variables.
4214 return 0 on success, -1 on error
4215*/
4216
Alexander Belopolsky40018472011-02-26 01:02:56 +00004217static int
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004218unicode_decode_call_errorhandler_wchar(
4219 const char *errors, PyObject **errorHandler,
4220 const char *encoding, const char *reason,
4221 const char **input, const char **inend, Py_ssize_t *startinpos,
4222 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004223 wchar_t **buf, Py_ssize_t *bufsize, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004224{
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004225 static const char *argparse = "Un;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004226
4227 PyObject *restuple = NULL;
4228 PyObject *repunicode = NULL;
Victor Stinner596a6c42011-11-09 00:02:18 +01004229 Py_ssize_t outsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004230 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004231 Py_ssize_t requiredsize;
4232 Py_ssize_t newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004233 PyObject *inputobj = NULL;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004234 wchar_t *repwstr;
4235 Py_ssize_t repwlen;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004236
4237 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004238 *errorHandler = PyCodec_LookupError(errors);
4239 if (*errorHandler == NULL)
4240 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004241 }
4242
Victor Stinner554f3f02010-06-16 23:33:54 +00004243 make_decode_exception(exceptionObject,
4244 encoding,
4245 *input, *inend - *input,
4246 *startinpos, *endinpos,
4247 reason);
4248 if (*exceptionObject == NULL)
4249 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004250
Jeroen Demeyer196a5302019-07-04 12:31:34 +02004251 restuple = _PyObject_CallOneArg(*errorHandler, *exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004252 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004253 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004254 if (!PyTuple_Check(restuple)) {
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004255 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00004256 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004257 }
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004258 if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00004259 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004260
4261 /* Copy back the bytes variables, which might have been modified by the
4262 callback */
4263 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4264 if (!inputobj)
4265 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004266 *input = PyBytes_AS_STRING(inputobj);
4267 insize = PyBytes_GET_SIZE(inputobj);
4268 *inend = *input + insize;
4269 /* we can DECREF safely, as the exception has another reference,
4270 so the object won't go away. */
4271 Py_DECREF(inputobj);
4272
4273 if (newpos<0)
4274 newpos = insize+newpos;
4275 if (newpos<0 || newpos>insize) {
Victor Stinnera33bce02014-07-04 22:47:46 +02004276 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004277 goto onError;
4278 }
4279
4280 repwstr = PyUnicode_AsUnicodeAndSize(repunicode, &repwlen);
4281 if (repwstr == NULL)
4282 goto onError;
4283 /* need more space? (at least enough for what we
4284 have+the replacement+the rest of the string (starting
4285 at the new input position), so we won't have to check space
4286 when there are no errors in the rest of the string) */
Benjamin Peterson2b76ce62014-09-29 18:50:06 -04004287 requiredsize = *outpos;
4288 if (requiredsize > PY_SSIZE_T_MAX - repwlen)
4289 goto overflow;
4290 requiredsize += repwlen;
4291 if (requiredsize > PY_SSIZE_T_MAX - (insize - newpos))
4292 goto overflow;
4293 requiredsize += insize - newpos;
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004294 outsize = *bufsize;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004295 if (requiredsize > outsize) {
Benjamin Peterson2b76ce62014-09-29 18:50:06 -04004296 if (outsize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*outsize)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004297 requiredsize = 2*outsize;
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004298 if (widechar_resize(buf, bufsize, requiredsize) < 0) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004299 goto onError;
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004300 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004301 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004302 wcsncpy(*buf + *outpos, repwstr, repwlen);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004303 *outpos += repwlen;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004304 *endinpos = newpos;
4305 *inptr = *input + newpos;
4306
4307 /* we made it! */
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004308 Py_DECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004309 return 0;
4310
Benjamin Peterson2b76ce62014-09-29 18:50:06 -04004311 overflow:
4312 PyErr_SetString(PyExc_OverflowError,
4313 "decoded result is too long for a Python string");
4314
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004315 onError:
4316 Py_XDECREF(restuple);
4317 return -1;
4318}
Steve Dowercc16be82016-09-08 10:35:16 -07004319#endif /* MS_WINDOWS */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004320
4321static int
4322unicode_decode_call_errorhandler_writer(
4323 const char *errors, PyObject **errorHandler,
4324 const char *encoding, const char *reason,
4325 const char **input, const char **inend, Py_ssize_t *startinpos,
4326 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
4327 _PyUnicodeWriter *writer /* PyObject **output, Py_ssize_t *outpos */)
4328{
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004329 static const char *argparse = "Un;decoding error handler must return (str, int) tuple";
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004330
4331 PyObject *restuple = NULL;
4332 PyObject *repunicode = NULL;
4333 Py_ssize_t insize;
4334 Py_ssize_t newpos;
Victor Stinner170ca6f2013-04-18 00:25:28 +02004335 Py_ssize_t replen;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004336 Py_ssize_t remain;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004337 PyObject *inputobj = NULL;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004338 int need_to_grow = 0;
4339 const char *new_inptr;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004340
4341 if (*errorHandler == NULL) {
4342 *errorHandler = PyCodec_LookupError(errors);
4343 if (*errorHandler == NULL)
4344 goto onError;
4345 }
4346
4347 make_decode_exception(exceptionObject,
4348 encoding,
4349 *input, *inend - *input,
4350 *startinpos, *endinpos,
4351 reason);
4352 if (*exceptionObject == NULL)
4353 goto onError;
4354
Jeroen Demeyer196a5302019-07-04 12:31:34 +02004355 restuple = _PyObject_CallOneArg(*errorHandler, *exceptionObject);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004356 if (restuple == NULL)
4357 goto onError;
4358 if (!PyTuple_Check(restuple)) {
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004359 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004360 goto onError;
4361 }
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004362 if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004363 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004364
4365 /* Copy back the bytes variables, which might have been modified by the
4366 callback */
4367 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4368 if (!inputobj)
4369 goto onError;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004370 remain = *inend - *input - *endinpos;
Christian Heimes72b710a2008-05-26 13:28:38 +00004371 *input = PyBytes_AS_STRING(inputobj);
4372 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004373 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00004374 /* we can DECREF safely, as the exception has another reference,
4375 so the object won't go away. */
4376 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004377
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004378 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004379 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004380 if (newpos<0 || newpos>insize) {
Victor Stinnera33bce02014-07-04 22:47:46 +02004381 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00004382 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004383 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004384
Victor Stinner170ca6f2013-04-18 00:25:28 +02004385 replen = PyUnicode_GET_LENGTH(repunicode);
Serhiy Storchaka7e4b9052015-01-26 01:22:54 +02004386 if (replen > 1) {
4387 writer->min_length += replen - 1;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004388 need_to_grow = 1;
4389 }
4390 new_inptr = *input + newpos;
4391 if (*inend - new_inptr > remain) {
4392 /* We don't know the decoding algorithm here so we make the worst
4393 assumption that one byte decodes to one unicode character.
4394 If unfortunately one byte could decode to more unicode characters,
4395 the decoder may write out-of-bound then. Is it possible for the
4396 algorithms using this function? */
4397 writer->min_length += *inend - new_inptr - remain;
4398 need_to_grow = 1;
4399 }
4400 if (need_to_grow) {
Victor Stinner8f674cc2013-04-17 23:02:17 +02004401 writer->overallocate = 1;
Serhiy Storchakab7e2d672018-02-13 08:27:33 +02004402 if (_PyUnicodeWriter_Prepare(writer, writer->min_length - writer->pos,
Serhiy Storchaka7e4b9052015-01-26 01:22:54 +02004403 PyUnicode_MAX_CHAR_VALUE(repunicode)) == -1)
4404 goto onError;
4405 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004406 if (_PyUnicodeWriter_WriteStr(writer, repunicode) == -1)
Victor Stinner376cfa12013-04-17 23:58:16 +02004407 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004408
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004409 *endinpos = newpos;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004410 *inptr = new_inptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004411
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004412 /* we made it! */
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004413 Py_DECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004414 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004415
Benjamin Peterson29060642009-01-31 22:14:21 +00004416 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004417 Py_XDECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004418 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004419}
4420
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004421/* --- UTF-7 Codec -------------------------------------------------------- */
4422
Antoine Pitrou244651a2009-05-04 18:56:13 +00004423/* See RFC2152 for details. We encode conservatively and decode liberally. */
4424
/* Base-64 helper macros for the UTF-7 codec.  All of them evaluate
   their argument more than once or index with it, so pass only
   side-effect-free expressions. */

/* Does c belong to the base-64 alphabet (A-Z, a-z, 0-9, '+', '/')? */

#define IS_BASE64(c) \
    (('A' <= (c) && (c) <= 'Z') ||     \
     ('a' <= (c) && (c) <= 'z') ||     \
     ('0' <= (c) && (c) <= '9') ||     \
     (c) == '+' || (c) == '/')

/* Map a base-64 character to its 6-bit value.  The result is only
   meaningful when IS_BASE64(c) holds: anything not matched by the first
   four cases is treated as '/' (63). */

#define FROM_BASE64(c) \
    (('A' <= (c) && (c) <= 'Z') ? (c) - 'A' :                           \
     ('a' <= (c) && (c) <= 'z') ? (c) - 'a' + 26 :                      \
     ('0' <= (c) && (c) <= '9') ? (c) - '0' + 52 :                      \
     (c) == '+' ? 62 : 63)

/* Base-64 character for the low 6 bits of n (higher bits are ignored). */

#define TO_BASE64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: a byte met outside a shift sequence that decodes to
 * itself.  Decoding is permissive: every ASCII byte qualifies except
 * '+', which opens a base-64 section. */

#define DECODE_DIRECT(c)                                \
    ((c) < 128 && (c) != '+')
4455
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is indexed by the ASCII code (0..127) and is never written
 * to, hence const.
 */

static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * c must be side-effect free (it is evaluated several times).  The
 * range test comes first so the table is only indexed with 1..127.
 * directO and directWS are parenthesized so that compound expressions
 * passed as arguments keep their meaning next to the && operator. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004501
Alexander Belopolsky40018472011-02-26 01:02:56 +00004502PyObject *
4503PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004504 Py_ssize_t size,
4505 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004506{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004507 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
4508}
4509
/* The decoder.  The only state we preserve is our read position,
 * i.e. how many characters we have consumed.  So if we end in the
 * middle of a shift sequence we have to back off the read position
 * and the output to the beginning of the sequence, otherwise we lose
 * all the shift state (seen bits, number of bits seen, high
 * surrogate).
 *
 * s, size:  the UTF-7 input.  An empty input returns the empty string
 *           (and stores 0 in *consumed when consumed is given).
 * errors:   error handler name, forwarded to
 *           unicode_decode_call_errorhandler_writer().
 * consumed: if NULL, an input that ends inside a shift sequence in an
 *           inconsistent state is reported as "unterminated shift
 *           sequence".  If non-NULL, it receives the number of input
 *           bytes consumed; when the input ends inside a shift sequence
 *           this is the position of the opening '+', and the output is
 *           cut back to what was written before that sequence.
 *
 * Returns the decoded string, or NULL with an exception set. */

PyObject *
PyUnicode_DecodeUTF7Stateful(const char *s,
                             Py_ssize_t size,
                             const char *errors,
                             Py_ssize_t *consumed)
{
    const char *starts = s;
    Py_ssize_t startinpos;
    Py_ssize_t endinpos;
    const char *e;
    _PyUnicodeWriter writer;
    const char *errmsg = "";
    int inShift = 0;                  /* non-zero while inside "+...-" */
    Py_ssize_t shiftOutStart;         /* writer.pos when the shift began */
    unsigned int base64bits = 0;      /* number of pending bits in base64buffer */
    unsigned long base64buffer = 0;   /* pending bits, newest in the low end */
    Py_UCS4 surrogate = 0;            /* pending high surrogate, 0 if none */
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;

    if (size == 0) {
        if (consumed)
            *consumed = 0;
        _Py_RETURN_UNICODE_EMPTY();
    }

    /* Start off assuming it's all ASCII. Widen later as necessary. */
    _PyUnicodeWriter_Init(&writer);
    writer.min_length = size;

    shiftOutStart = 0;
    e = s + size;

    while (s < e) {
        Py_UCS4 ch;
        /* Also entered by goto from the end-of-input error path below,
           when the error handler moved the read position back before e. */
      restart:
        ch = (unsigned char) *s;

        if (inShift) { /* in a base-64 section */
            if (IS_BASE64(ch)) { /* consume a base-64 character */
                base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
                base64bits += 6;
                s++;
                if (base64bits >= 16) {
                    /* we have enough bits for a UTF-16 value */
                    Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
                    base64bits -= 16;
                    base64buffer &= (1 << base64bits) - 1; /* clear high bits */
                    assert(outCh <= 0xffff);
                    if (surrogate) {
                        /* expecting a second surrogate */
                        if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
                            Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
                            if (_PyUnicodeWriter_WriteCharInline(&writer, ch2) < 0)
                                goto onError;
                            surrogate = 0;
                            continue;
                        }
                        else {
                            /* not a low surrogate: emit the pending high
                               surrogate on its own, then handle outCh below */
                            if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
                                goto onError;
                            surrogate = 0;
                        }
                    }
                    if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
                        /* first surrogate */
                        surrogate = outCh;
                    }
                    else {
                        if (_PyUnicodeWriter_WriteCharInline(&writer, outCh) < 0)
                            goto onError;
                    }
                }
            }
            else { /* now leaving a base-64 section */
                inShift = 0;
                if (base64bits > 0) { /* left-over bits */
                    if (base64bits >= 6) {
                        /* We've seen at least one base-64 character */
                        s++;
                        errmsg = "partial character in shift sequence";
                        goto utf7Error;
                    }
                    else {
                        /* Some bits remain; they should be zero */
                        if (base64buffer != 0) {
                            s++;
                            errmsg = "non-zero padding bits in shift sequence";
                            goto utf7Error;
                        }
                    }
                }
                /* a pending high surrogate is only emitted when the
                   terminating byte decodes as itself; otherwise it is
                   dropped by the reset below */
                if (surrogate && DECODE_DIRECT(ch)) {
                    if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
                        goto onError;
                }
                surrogate = 0;
                if (ch == '-') {
                    /* '-' is absorbed; other terminating
                       characters are preserved */
                    s++;
                }
            }
        }
        else if ( ch == '+' ) {
            /* remember where the sequence starts: used for error
               reporting and for *consumed when the input ends mid-shift */
            startinpos = s-starts;
            s++; /* consume '+' */
            if (s < e && *s == '-') { /* '+-' encodes '+' */
                s++;
                if (_PyUnicodeWriter_WriteCharInline(&writer, '+') < 0)
                    goto onError;
            }
            else if (s < e && !IS_BASE64(*s)) {
                /* '+' followed by a byte that is neither '-' nor base-64 */
                s++;
                errmsg = "ill-formed sequence";
                goto utf7Error;
            }
            else { /* begin base64-encoded section */
                /* also reached when '+' is the last input byte */
                inShift = 1;
                surrogate = 0;
                shiftOutStart = writer.pos;
                base64bits = 0;
                base64buffer = 0;
            }
        }
        else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
            s++;
            if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
                goto onError;
        }
        else {
            /* byte >= 128 outside a shift sequence */
            startinpos = s-starts;
            s++;
            errmsg = "unexpected special character";
            goto utf7Error;
        }
        continue;
utf7Error:
        /* the error range is [startinpos, s); the handler may replace the
           input and updates starts, e and s through the pointers */
        endinpos = s-starts;
        if (unicode_decode_call_errorhandler_writer(
                errors, &errorHandler,
                "utf7", errmsg,
                &starts, &e, &startinpos, &endinpos, &exc, &s,
                &writer))
            goto onError;
    }

    /* end of string */

    if (inShift && !consumed) { /* in shift sequence, no more to follow */
        /* if we're in an inconsistent state, that's an error */
        inShift = 0;
        if (surrogate ||
                (base64bits >= 6) ||
                (base64bits > 0 && base64buffer != 0)) {
            endinpos = size;
            if (unicode_decode_call_errorhandler_writer(
                    errors, &errorHandler,
                    "utf7", "unterminated shift sequence",
                    &starts, &e, &startinpos, &endinpos, &exc, &s,
                    &writer))
                goto onError;
            /* the handler may have moved s back before e: keep decoding */
            if (s < e)
                goto restart;
        }
    }

    /* return state */
    if (consumed) {
        if (inShift) {
            *consumed = startinpos;
            /* NOTE(review): when non-ASCII output was written inside the
               unfinished sequence, the result is rebuilt from the first
               shiftOutStart characters instead of truncating the writer;
               presumably so the discarded characters do not influence the
               result's representation -- confirm. */
            if (writer.pos != shiftOutStart && writer.maxchar > 127) {
                PyObject *result = PyUnicode_FromKindAndData(
                        writer.kind, writer.data, shiftOutStart);
                Py_XDECREF(errorHandler);
                Py_XDECREF(exc);
                _PyUnicodeWriter_Dealloc(&writer);
                return result;
            }
            writer.pos = shiftOutStart; /* back off output */
        }
        else {
            *consumed = s-starts;
        }
    }

    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return _PyUnicodeWriter_Finish(&writer);

  onError:
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    _PyUnicodeWriter_Dealloc(&writer);
    return NULL;
}
4713
4714
Alexander Belopolsky40018472011-02-26 01:02:56 +00004715PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004716_PyUnicode_EncodeUTF7(PyObject *str,
4717 int base64SetO,
4718 int base64WhiteSpace,
4719 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004720{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004721 int kind;
4722 void *data;
4723 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004724 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004725 int inShift = 0;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004726 Py_ssize_t i;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004727 unsigned int base64bits = 0;
4728 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004729 char * out;
4730 char * start;
4731
Benjamin Petersonbac79492012-01-14 13:34:47 -05004732 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004733 return NULL;
4734 kind = PyUnicode_KIND(str);
4735 data = PyUnicode_DATA(str);
4736 len = PyUnicode_GET_LENGTH(str);
4737
4738 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004739 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004740
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004741 /* It might be possible to tighten this worst case */
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004742 if (len > PY_SSIZE_T_MAX / 8)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004743 return PyErr_NoMemory();
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004744 v = PyBytes_FromStringAndSize(NULL, len * 8);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004745 if (v == NULL)
4746 return NULL;
4747
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004748 start = out = PyBytes_AS_STRING(v);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004749 for (i = 0; i < len; ++i) {
Victor Stinner0e368262011-11-10 20:12:49 +01004750 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004751
Antoine Pitrou244651a2009-05-04 18:56:13 +00004752 if (inShift) {
4753 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4754 /* shifting out */
4755 if (base64bits) { /* output remaining bits */
4756 *out++ = TO_BASE64(base64buffer << (6-base64bits));
4757 base64buffer = 0;
4758 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004759 }
4760 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004761 /* Characters not in the BASE64 set implicitly unshift the sequence
4762 so no '-' is required, except if the character is itself a '-' */
4763 if (IS_BASE64(ch) || ch == '-') {
4764 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004765 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004766 *out++ = (char) ch;
4767 }
4768 else {
4769 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00004770 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004771 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004772 else { /* not in a shift sequence */
4773 if (ch == '+') {
4774 *out++ = '+';
4775 *out++ = '-';
4776 }
4777 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4778 *out++ = (char) ch;
4779 }
4780 else {
4781 *out++ = '+';
4782 inShift = 1;
4783 goto encode_char;
4784 }
4785 }
4786 continue;
4787encode_char:
Antoine Pitrou244651a2009-05-04 18:56:13 +00004788 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01004789 assert(ch <= MAX_UNICODE);
Victor Stinner0d3721d2011-11-22 03:27:53 +01004790
Antoine Pitrou244651a2009-05-04 18:56:13 +00004791 /* code first surrogate */
4792 base64bits += 16;
Victor Stinner76df43d2012-10-30 01:42:39 +01004793 base64buffer = (base64buffer << 16) | Py_UNICODE_HIGH_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004794 while (base64bits >= 6) {
4795 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4796 base64bits -= 6;
4797 }
4798 /* prepare second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004799 ch = Py_UNICODE_LOW_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004800 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004801 base64bits += 16;
4802 base64buffer = (base64buffer << 16) | ch;
4803 while (base64bits >= 6) {
4804 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4805 base64bits -= 6;
4806 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00004807 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004808 if (base64bits)
4809 *out++= TO_BASE64(base64buffer << (6-base64bits) );
4810 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004811 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004812 if (_PyBytes_Resize(&v, out - start) < 0)
4813 return NULL;
4814 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004815}
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004816PyObject *
4817PyUnicode_EncodeUTF7(const Py_UNICODE *s,
4818 Py_ssize_t size,
4819 int base64SetO,
4820 int base64WhiteSpace,
4821 const char *errors)
4822{
4823 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02004824 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004825 if (tmp == NULL)
4826 return NULL;
Victor Stinner0e368262011-11-10 20:12:49 +01004827 result = _PyUnicode_EncodeUTF7(tmp, base64SetO,
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004828 base64WhiteSpace, errors);
4829 Py_DECREF(tmp);
4830 return result;
4831}
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004832
Antoine Pitrou244651a2009-05-04 18:56:13 +00004833#undef IS_BASE64
4834#undef FROM_BASE64
4835#undef TO_BASE64
4836#undef DECODE_DIRECT
4837#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004838
Guido van Rossumd57fd912000-03-10 22:53:23 +00004839/* --- UTF-8 Codec -------------------------------------------------------- */
4840
Alexander Belopolsky40018472011-02-26 01:02:56 +00004841PyObject *
4842PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004843 Py_ssize_t size,
4844 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004845{
Walter Dörwald69652032004-09-07 20:24:22 +00004846 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4847}
4848
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004849#include "stringlib/asciilib.h"
4850#include "stringlib/codecs.h"
4851#include "stringlib/undef.h"
4852
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004853#include "stringlib/ucs1lib.h"
4854#include "stringlib/codecs.h"
4855#include "stringlib/undef.h"
4856
4857#include "stringlib/ucs2lib.h"
4858#include "stringlib/codecs.h"
4859#include "stringlib/undef.h"
4860
4861#include "stringlib/ucs4lib.h"
4862#include "stringlib/codecs.h"
4863#include "stringlib/undef.h"
4864
Antoine Pitrouab868312009-01-10 15:40:25 +00004865/* Mask to quickly check whether a C 'long' contains a
4866 non-ASCII, UTF8-encoded char. */
4867#if (SIZEOF_LONG == 8)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004868# define ASCII_CHAR_MASK 0x8080808080808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004869#elif (SIZEOF_LONG == 4)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004870# define ASCII_CHAR_MASK 0x80808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004871#else
4872# error C 'long' size should be either 4 or 8!
4873#endif
4874
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004875static Py_ssize_t
4876ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004877{
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004878 const char *p = start;
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004879 const char *aligned_end = (const char *) _Py_ALIGN_DOWN(end, SIZEOF_LONG);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004880
Antoine Pitrou8b0e9842013-05-11 15:58:34 +02004881 /*
4882 * Issue #17237: m68k is a bit different from most architectures in
4883 * that objects do not use "natural alignment" - for example, int and
4884 * long are only aligned at 2-byte boundaries. Therefore the assert()
4885 * won't work; also, tests have shown that skipping the "optimised
4886 * version" will even speed up m68k.
4887 */
4888#if !defined(__m68k__)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004889#if SIZEOF_LONG <= SIZEOF_VOID_P
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004890 assert(_Py_IS_ALIGNED(dest, SIZEOF_LONG));
4891 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004892 /* Fast path, see in STRINGLIB(utf8_decode) for
4893 an explanation. */
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02004894 /* Help allocation */
4895 const char *_p = p;
4896 Py_UCS1 * q = dest;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004897 while (_p < aligned_end) {
4898 unsigned long value = *(const unsigned long *) _p;
4899 if (value & ASCII_CHAR_MASK)
Benjamin Peterson29060642009-01-31 22:14:21 +00004900 break;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004901 *((unsigned long *)q) = value;
4902 _p += SIZEOF_LONG;
4903 q += SIZEOF_LONG;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004904 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004905 p = _p;
4906 while (p < end) {
4907 if ((unsigned char)*p & 0x80)
4908 break;
4909 *q++ = *p++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004910 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004911 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004912 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004913#endif
Antoine Pitrou8b0e9842013-05-11 15:58:34 +02004914#endif
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004915 while (p < end) {
4916 /* Fast path, see in STRINGLIB(utf8_decode) in stringlib/codecs.h
4917 for an explanation. */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004918 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02004919 /* Help allocation */
4920 const char *_p = p;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004921 while (_p < aligned_end) {
4922 unsigned long value = *(unsigned long *) _p;
4923 if (value & ASCII_CHAR_MASK)
4924 break;
4925 _p += SIZEOF_LONG;
4926 }
4927 p = _p;
4928 if (_p == end)
4929 break;
4930 }
4931 if ((unsigned char)*p & 0x80)
4932 break;
4933 ++p;
4934 }
4935 memcpy(dest, start, p - start);
4936 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004937}
Antoine Pitrouab868312009-01-10 15:40:25 +00004938
Victor Stinner709d23d2019-05-02 14:56:30 -04004939static PyObject *
4940unicode_decode_utf8(const char *s, Py_ssize_t size,
4941 _Py_error_handler error_handler, const char *errors,
4942 Py_ssize_t *consumed)
Victor Stinner785938e2011-12-11 20:09:03 +01004943{
Victor Stinner785938e2011-12-11 20:09:03 +01004944 if (size == 0) {
4945 if (consumed)
4946 *consumed = 0;
Serhiy Storchaka678db842013-01-26 12:16:36 +02004947 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner785938e2011-12-11 20:09:03 +01004948 }
4949
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004950 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
4951 if (size == 1 && (unsigned char)s[0] < 128) {
Victor Stinner785938e2011-12-11 20:09:03 +01004952 if (consumed)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004953 *consumed = 1;
4954 return get_latin1_char((unsigned char)s[0]);
Victor Stinner785938e2011-12-11 20:09:03 +01004955 }
4956
Inada Naoki770847a2019-06-24 12:30:24 +09004957 const char *starts = s;
4958 const char *end = s + size;
Victor Stinner785938e2011-12-11 20:09:03 +01004959
Inada Naoki770847a2019-06-24 12:30:24 +09004960 // fast path: try ASCII string.
4961 PyObject *u = PyUnicode_New(size, 127);
4962 if (u == NULL) {
4963 return NULL;
4964 }
4965 s += ascii_decode(s, end, PyUnicode_DATA(u));
4966 if (s == end) {
4967 return u;
4968 }
4969
4970 // Use _PyUnicodeWriter after fast path is failed.
4971 _PyUnicodeWriter writer;
4972 _PyUnicodeWriter_InitWithBuffer(&writer, u);
4973 writer.pos = s - starts;
4974
4975 Py_ssize_t startinpos, endinpos;
4976 const char *errmsg = "";
4977 PyObject *error_handler_obj = NULL;
4978 PyObject *exc = NULL;
4979
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004980 while (s < end) {
4981 Py_UCS4 ch;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004982 int kind = writer.kind;
Victor Stinner1d65d912015-10-05 13:43:50 +02004983
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004984 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004985 if (PyUnicode_IS_ASCII(writer.buffer))
4986 ch = asciilib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004987 else
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004988 ch = ucs1lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004989 } else if (kind == PyUnicode_2BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004990 ch = ucs2lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004991 } else {
4992 assert(kind == PyUnicode_4BYTE_KIND);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004993 ch = ucs4lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004994 }
4995
4996 switch (ch) {
4997 case 0:
4998 if (s == end || consumed)
4999 goto End;
5000 errmsg = "unexpected end of data";
5001 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02005002 endinpos = end - starts;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005003 break;
5004 case 1:
5005 errmsg = "invalid start byte";
5006 startinpos = s - starts;
5007 endinpos = startinpos + 1;
5008 break;
5009 case 2:
Serhiy Storchaka894263b2019-06-25 11:54:18 +03005010 if (consumed && (unsigned char)s[0] == 0xED && end - s == 2
5011 && (unsigned char)s[1] >= 0xA0 && (unsigned char)s[1] <= 0xBF)
5012 {
5013 /* Truncated surrogate code in range D800-DFFF */
Serhiy Storchaka7a465cb2019-03-30 08:23:38 +02005014 goto End;
5015 }
Serhiy Storchaka894263b2019-06-25 11:54:18 +03005016 /* fall through */
5017 case 3:
5018 case 4:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005019 errmsg = "invalid continuation byte";
5020 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02005021 endinpos = startinpos + ch - 1;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005022 break;
5023 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005024 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005025 goto onError;
5026 continue;
5027 }
5028
Victor Stinner1d65d912015-10-05 13:43:50 +02005029 if (error_handler == _Py_ERROR_UNKNOWN)
Victor Stinner3d4226a2018-08-29 22:21:32 +02005030 error_handler = _Py_GetErrorHandler(errors);
Victor Stinner1d65d912015-10-05 13:43:50 +02005031
5032 switch (error_handler) {
5033 case _Py_ERROR_IGNORE:
5034 s += (endinpos - startinpos);
5035 break;
5036
5037 case _Py_ERROR_REPLACE:
5038 if (_PyUnicodeWriter_WriteCharInline(&writer, 0xfffd) < 0)
5039 goto onError;
5040 s += (endinpos - startinpos);
5041 break;
5042
5043 case _Py_ERROR_SURROGATEESCAPE:
Victor Stinner74e8fac2015-10-05 13:49:26 +02005044 {
5045 Py_ssize_t i;
5046
Victor Stinner1d65d912015-10-05 13:43:50 +02005047 if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
5048 goto onError;
Victor Stinner74e8fac2015-10-05 13:49:26 +02005049 for (i=startinpos; i<endinpos; i++) {
Victor Stinner1d65d912015-10-05 13:43:50 +02005050 ch = (Py_UCS4)(unsigned char)(starts[i]);
5051 PyUnicode_WRITE(writer.kind, writer.data, writer.pos,
5052 ch + 0xdc00);
5053 writer.pos++;
5054 }
5055 s += (endinpos - startinpos);
5056 break;
Victor Stinner74e8fac2015-10-05 13:49:26 +02005057 }
Victor Stinner1d65d912015-10-05 13:43:50 +02005058
5059 default:
5060 if (unicode_decode_call_errorhandler_writer(
5061 errors, &error_handler_obj,
5062 "utf-8", errmsg,
5063 &starts, &end, &startinpos, &endinpos, &exc, &s,
5064 &writer))
5065 goto onError;
5066 }
Victor Stinner785938e2011-12-11 20:09:03 +01005067 }
5068
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005069End:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005070 if (consumed)
5071 *consumed = s - starts;
5072
Victor Stinner1d65d912015-10-05 13:43:50 +02005073 Py_XDECREF(error_handler_obj);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005074 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005075 return _PyUnicodeWriter_Finish(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005076
5077onError:
Victor Stinner1d65d912015-10-05 13:43:50 +02005078 Py_XDECREF(error_handler_obj);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005079 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005080 _PyUnicodeWriter_Dealloc(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005081 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01005082}
5083
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005084
Victor Stinner709d23d2019-05-02 14:56:30 -04005085PyObject *
5086PyUnicode_DecodeUTF8Stateful(const char *s,
5087 Py_ssize_t size,
5088 const char *errors,
5089 Py_ssize_t *consumed)
5090{
5091 return unicode_decode_utf8(s, size, _Py_ERROR_UNKNOWN, errors, consumed);
5092}
5093
5094
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005095/* UTF-8 decoder: use surrogateescape error handler if 'surrogateescape' is
5096 non-zero, use strict error handler otherwise.
Victor Stinner0d92c4f2012-11-12 23:32:21 +01005097
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005098 On success, write a pointer to a newly allocated wide character string into
5099 *wstr (use PyMem_RawFree() to free the memory) and write the output length
5100 (in number of wchar_t units) into *wlen (if wlen is set).
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005101
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005102 On memory allocation failure, return -1.
5103
5104 On decoding error (if surrogateescape is zero), return -2. If wlen is
5105 non-NULL, write the start of the illegal byte sequence into *wlen. If reason
5106 is not NULL, write the decoding error message into *reason. */
5107int
5108_Py_DecodeUTF8Ex(const char *s, Py_ssize_t size, wchar_t **wstr, size_t *wlen,
Victor Stinner3d4226a2018-08-29 22:21:32 +02005109 const char **reason, _Py_error_handler errors)
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005110{
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005111 const char *orig_s = s;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005112 const char *e;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005113 wchar_t *unicode;
5114 Py_ssize_t outpos;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005115
Victor Stinner3d4226a2018-08-29 22:21:32 +02005116 int surrogateescape = 0;
5117 int surrogatepass = 0;
5118 switch (errors)
5119 {
5120 case _Py_ERROR_STRICT:
5121 break;
5122 case _Py_ERROR_SURROGATEESCAPE:
5123 surrogateescape = 1;
5124 break;
5125 case _Py_ERROR_SURROGATEPASS:
5126 surrogatepass = 1;
5127 break;
5128 default:
5129 return -3;
5130 }
5131
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005132 /* Note: size will always be longer than the resulting Unicode
5133 character count */
Victor Stinner91106cd2017-12-13 12:29:09 +01005134 if (PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) < (size + 1)) {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005135 return -1;
Victor Stinner91106cd2017-12-13 12:29:09 +01005136 }
5137
Victor Stinner6f8eeee2013-07-07 22:57:45 +02005138 unicode = PyMem_RawMalloc((size + 1) * sizeof(wchar_t));
Victor Stinner91106cd2017-12-13 12:29:09 +01005139 if (!unicode) {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005140 return -1;
Victor Stinner91106cd2017-12-13 12:29:09 +01005141 }
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005142
5143 /* Unpack UTF-8 encoded data */
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005144 e = s + size;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005145 outpos = 0;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005146 while (s < e) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005147 Py_UCS4 ch;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005148#if SIZEOF_WCHAR_T == 4
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005149 ch = ucs4lib_utf8_decode(&s, e, (Py_UCS4 *)unicode, &outpos);
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005150#else
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005151 ch = ucs2lib_utf8_decode(&s, e, (Py_UCS2 *)unicode, &outpos);
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005152#endif
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005153 if (ch > 0xFF) {
5154#if SIZEOF_WCHAR_T == 4
Barry Warsawb2e57942017-09-14 18:13:16 -07005155 Py_UNREACHABLE();
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005156#else
Serhiy Storchakab6266432016-11-12 14:28:06 +02005157 assert(ch > 0xFFFF && ch <= MAX_UNICODE);
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005158 /* write a surrogate pair */
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005159 unicode[outpos++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
5160 unicode[outpos++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
5161#endif
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005162 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005163 else {
Victor Stinner3d4226a2018-08-29 22:21:32 +02005164 if (!ch && s == e) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005165 break;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005166 }
Victor Stinner3d4226a2018-08-29 22:21:32 +02005167
5168 if (surrogateescape) {
5169 unicode[outpos++] = 0xDC00 + (unsigned char)*s++;
5170 }
5171 else {
5172 /* Is it a valid three-byte code? */
5173 if (surrogatepass
5174 && (e - s) >= 3
5175 && (s[0] & 0xf0) == 0xe0
5176 && (s[1] & 0xc0) == 0x80
5177 && (s[2] & 0xc0) == 0x80)
5178 {
5179 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
5180 s += 3;
5181 unicode[outpos++] = ch;
5182 }
5183 else {
5184 PyMem_RawFree(unicode );
5185 if (reason != NULL) {
5186 switch (ch) {
5187 case 0:
5188 *reason = "unexpected end of data";
5189 break;
5190 case 1:
5191 *reason = "invalid start byte";
5192 break;
5193 /* 2, 3, 4 */
5194 default:
5195 *reason = "invalid continuation byte";
5196 break;
5197 }
5198 }
5199 if (wlen != NULL) {
5200 *wlen = s - orig_s;
5201 }
5202 return -2;
5203 }
5204 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005205 }
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005206 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005207 unicode[outpos] = L'\0';
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005208 if (wlen) {
5209 *wlen = outpos;
Victor Stinner91106cd2017-12-13 12:29:09 +01005210 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005211 *wstr = unicode;
5212 return 0;
5213}
5214
Victor Stinner5f9cf232019-03-19 01:46:25 +01005215
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005216wchar_t*
Victor Stinner5f9cf232019-03-19 01:46:25 +01005217_Py_DecodeUTF8_surrogateescape(const char *arg, Py_ssize_t arglen,
5218 size_t *wlen)
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005219{
5220 wchar_t *wstr;
Victor Stinner5f9cf232019-03-19 01:46:25 +01005221 int res = _Py_DecodeUTF8Ex(arg, arglen,
5222 &wstr, wlen,
5223 NULL, _Py_ERROR_SURROGATEESCAPE);
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005224 if (res != 0) {
Victor Stinner5f9cf232019-03-19 01:46:25 +01005225 /* _Py_DecodeUTF8Ex() must support _Py_ERROR_SURROGATEESCAPE */
5226 assert(res != -3);
5227 if (wlen) {
5228 *wlen = (size_t)res;
5229 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005230 return NULL;
5231 }
5232 return wstr;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005233}
5234
Antoine Pitrouab868312009-01-10 15:40:25 +00005235
Victor Stinnere47e6982017-12-21 15:45:16 +01005236/* UTF-8 encoder using the surrogateescape error handler .
5237
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005238 On success, return 0 and write the newly allocated character string (use
5239 PyMem_Free() to free the memory) into *str.
Victor Stinnere47e6982017-12-21 15:45:16 +01005240
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005241 On encoding failure, return -2 and write the position of the invalid
5242 surrogate character into *error_pos (if error_pos is set) and the decoding
5243 error message into *reason (if reason is set).
Victor Stinnere47e6982017-12-21 15:45:16 +01005244
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005245 On memory allocation failure, return -1. */
5246int
5247_Py_EncodeUTF8Ex(const wchar_t *text, char **str, size_t *error_pos,
Victor Stinner3d4226a2018-08-29 22:21:32 +02005248 const char **reason, int raw_malloc, _Py_error_handler errors)
Victor Stinnere47e6982017-12-21 15:45:16 +01005249{
5250 const Py_ssize_t max_char_size = 4;
5251 Py_ssize_t len = wcslen(text);
5252
5253 assert(len >= 0);
5254
Victor Stinner3d4226a2018-08-29 22:21:32 +02005255 int surrogateescape = 0;
5256 int surrogatepass = 0;
5257 switch (errors)
5258 {
5259 case _Py_ERROR_STRICT:
5260 break;
5261 case _Py_ERROR_SURROGATEESCAPE:
5262 surrogateescape = 1;
5263 break;
5264 case _Py_ERROR_SURROGATEPASS:
5265 surrogatepass = 1;
5266 break;
5267 default:
5268 return -3;
5269 }
5270
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005271 if (len > PY_SSIZE_T_MAX / max_char_size - 1) {
5272 return -1;
5273 }
Victor Stinnere47e6982017-12-21 15:45:16 +01005274 char *bytes;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005275 if (raw_malloc) {
5276 bytes = PyMem_RawMalloc((len + 1) * max_char_size);
Victor Stinnere47e6982017-12-21 15:45:16 +01005277 }
5278 else {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005279 bytes = PyMem_Malloc((len + 1) * max_char_size);
Victor Stinnere47e6982017-12-21 15:45:16 +01005280 }
5281 if (bytes == NULL) {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005282 return -1;
Victor Stinnere47e6982017-12-21 15:45:16 +01005283 }
5284
5285 char *p = bytes;
5286 Py_ssize_t i;
Victor Stinner3d4226a2018-08-29 22:21:32 +02005287 for (i = 0; i < len; ) {
5288 Py_ssize_t ch_pos = i;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005289 Py_UCS4 ch = text[i];
Victor Stinner3d4226a2018-08-29 22:21:32 +02005290 i++;
5291#if Py_UNICODE_SIZE == 2
5292 if (Py_UNICODE_IS_HIGH_SURROGATE(ch)
5293 && i < len
5294 && Py_UNICODE_IS_LOW_SURROGATE(text[i]))
5295 {
5296 ch = Py_UNICODE_JOIN_SURROGATES(ch, text[i]);
5297 i++;
5298 }
5299#endif
Victor Stinnere47e6982017-12-21 15:45:16 +01005300
5301 if (ch < 0x80) {
5302 /* Encode ASCII */
5303 *p++ = (char) ch;
5304
5305 }
5306 else if (ch < 0x0800) {
5307 /* Encode Latin-1 */
5308 *p++ = (char)(0xc0 | (ch >> 6));
5309 *p++ = (char)(0x80 | (ch & 0x3f));
5310 }
Victor Stinner3d4226a2018-08-29 22:21:32 +02005311 else if (Py_UNICODE_IS_SURROGATE(ch) && !surrogatepass) {
Victor Stinnere47e6982017-12-21 15:45:16 +01005312 /* surrogateescape error handler */
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005313 if (!surrogateescape || !(0xDC80 <= ch && ch <= 0xDCFF)) {
Victor Stinnere47e6982017-12-21 15:45:16 +01005314 if (error_pos != NULL) {
Victor Stinner3d4226a2018-08-29 22:21:32 +02005315 *error_pos = (size_t)ch_pos;
Victor Stinnere47e6982017-12-21 15:45:16 +01005316 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005317 if (reason != NULL) {
5318 *reason = "encoding error";
5319 }
5320 if (raw_malloc) {
5321 PyMem_RawFree(bytes);
5322 }
5323 else {
5324 PyMem_Free(bytes);
5325 }
5326 return -2;
Victor Stinnere47e6982017-12-21 15:45:16 +01005327 }
5328 *p++ = (char)(ch & 0xff);
5329 }
5330 else if (ch < 0x10000) {
5331 *p++ = (char)(0xe0 | (ch >> 12));
5332 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
5333 *p++ = (char)(0x80 | (ch & 0x3f));
5334 }
5335 else { /* ch >= 0x10000 */
5336 assert(ch <= MAX_UNICODE);
5337 /* Encode UCS4 Unicode ordinals */
5338 *p++ = (char)(0xf0 | (ch >> 18));
5339 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
5340 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
5341 *p++ = (char)(0x80 | (ch & 0x3f));
5342 }
5343 }
5344 *p++ = '\0';
5345
5346 size_t final_size = (p - bytes);
Victor Stinner9dd76202017-12-21 16:20:32 +01005347 char *bytes2;
5348 if (raw_malloc) {
5349 bytes2 = PyMem_RawRealloc(bytes, final_size);
5350 }
5351 else {
5352 bytes2 = PyMem_Realloc(bytes, final_size);
5353 }
Victor Stinnere47e6982017-12-21 15:45:16 +01005354 if (bytes2 == NULL) {
5355 if (error_pos != NULL) {
5356 *error_pos = (size_t)-1;
5357 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005358 if (raw_malloc) {
5359 PyMem_RawFree(bytes);
5360 }
5361 else {
5362 PyMem_Free(bytes);
5363 }
5364 return -1;
Victor Stinnere47e6982017-12-21 15:45:16 +01005365 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005366 *str = bytes2;
5367 return 0;
Victor Stinnere47e6982017-12-21 15:45:16 +01005368}
5369
5370
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005371/* Primary internal function which creates utf8 encoded bytes objects.
5372
5373 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00005374 and allocate exactly as much space needed at the end. Else allocate the
5375 maximum possible needed (4 result bytes per Unicode character), and return
5376 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00005377*/
Victor Stinner709d23d2019-05-02 14:56:30 -04005378static PyObject *
5379unicode_encode_utf8(PyObject *unicode, _Py_error_handler error_handler,
5380 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005381{
Victor Stinner6099a032011-12-18 14:22:26 +01005382 enum PyUnicode_Kind kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005383 void *data;
5384 Py_ssize_t size;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00005385
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005386 if (!PyUnicode_Check(unicode)) {
5387 PyErr_BadArgument();
5388 return NULL;
5389 }
5390
5391 if (PyUnicode_READY(unicode) == -1)
5392 return NULL;
5393
Victor Stinnere90fe6a2011-10-01 16:48:13 +02005394 if (PyUnicode_UTF8(unicode))
5395 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
5396 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005397
5398 kind = PyUnicode_KIND(unicode);
5399 data = PyUnicode_DATA(unicode);
5400 size = PyUnicode_GET_LENGTH(unicode);
5401
Benjamin Petersonead6b532011-12-20 17:23:42 -06005402 switch (kind) {
Victor Stinner6099a032011-12-18 14:22:26 +01005403 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07005404 Py_UNREACHABLE();
Victor Stinner6099a032011-12-18 14:22:26 +01005405 case PyUnicode_1BYTE_KIND:
5406 /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
5407 assert(!PyUnicode_IS_ASCII(unicode));
Victor Stinner709d23d2019-05-02 14:56:30 -04005408 return ucs1lib_utf8_encoder(unicode, data, size, error_handler, errors);
Victor Stinner6099a032011-12-18 14:22:26 +01005409 case PyUnicode_2BYTE_KIND:
Victor Stinner709d23d2019-05-02 14:56:30 -04005410 return ucs2lib_utf8_encoder(unicode, data, size, error_handler, errors);
Victor Stinner6099a032011-12-18 14:22:26 +01005411 case PyUnicode_4BYTE_KIND:
Victor Stinner709d23d2019-05-02 14:56:30 -04005412 return ucs4lib_utf8_encoder(unicode, data, size, error_handler, errors);
Tim Peters602f7402002-04-27 18:03:26 +00005413 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005414}
5415
Alexander Belopolsky40018472011-02-26 01:02:56 +00005416PyObject *
Victor Stinner709d23d2019-05-02 14:56:30 -04005417_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
5418{
5419 return unicode_encode_utf8(unicode, _Py_ERROR_UNKNOWN, errors);
5420}
5421
5422
5423PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005424PyUnicode_EncodeUTF8(const Py_UNICODE *s,
5425 Py_ssize_t size,
5426 const char *errors)
5427{
5428 PyObject *v, *unicode;
5429
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02005430 unicode = PyUnicode_FromWideChar(s, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005431 if (unicode == NULL)
5432 return NULL;
5433 v = _PyUnicode_AsUTF8String(unicode, errors);
5434 Py_DECREF(unicode);
5435 return v;
5436}
5437
5438PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005439PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005440{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005441 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005442}
5443
Walter Dörwald41980ca2007-08-16 21:55:45 +00005444/* --- UTF-32 Codec ------------------------------------------------------- */
5445
5446PyObject *
5447PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005448 Py_ssize_t size,
5449 const char *errors,
5450 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005451{
5452 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
5453}
5454
5455PyObject *
5456PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005457 Py_ssize_t size,
5458 const char *errors,
5459 int *byteorder,
5460 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005461{
5462 const char *starts = s;
5463 Py_ssize_t startinpos;
5464 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005465 _PyUnicodeWriter writer;
Mark Dickinson7db923c2010-06-12 09:10:14 +00005466 const unsigned char *q, *e;
Victor Stinnere64322e2012-10-30 23:12:47 +01005467 int le, bo = 0; /* assume native ordering by default */
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005468 const char *encoding;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005469 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00005470 PyObject *errorHandler = NULL;
5471 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00005472
Walter Dörwald41980ca2007-08-16 21:55:45 +00005473 q = (unsigned char *)s;
5474 e = q + size;
5475
5476 if (byteorder)
5477 bo = *byteorder;
5478
5479 /* Check for BOM marks (U+FEFF) in the input and adjust current
5480 byte order setting accordingly. In native mode, the leading BOM
5481 mark is skipped, in all other modes, it is copied to the output
5482 stream as-is (giving a ZWNBSP character). */
Victor Stinnere64322e2012-10-30 23:12:47 +01005483 if (bo == 0 && size >= 4) {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005484 Py_UCS4 bom = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
Victor Stinnere64322e2012-10-30 23:12:47 +01005485 if (bom == 0x0000FEFF) {
5486 bo = -1;
5487 q += 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00005488 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005489 else if (bom == 0xFFFE0000) {
5490 bo = 1;
5491 q += 4;
5492 }
5493 if (byteorder)
5494 *byteorder = bo;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005495 }
5496
Victor Stinnere64322e2012-10-30 23:12:47 +01005497 if (q == e) {
5498 if (consumed)
5499 *consumed = size;
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02005500 _Py_RETURN_UNICODE_EMPTY();
Walter Dörwald41980ca2007-08-16 21:55:45 +00005501 }
5502
Victor Stinnere64322e2012-10-30 23:12:47 +01005503#ifdef WORDS_BIGENDIAN
5504 le = bo < 0;
5505#else
5506 le = bo <= 0;
5507#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005508 encoding = le ? "utf-32-le" : "utf-32-be";
Victor Stinnere64322e2012-10-30 23:12:47 +01005509
Victor Stinner8f674cc2013-04-17 23:02:17 +02005510 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02005511 writer.min_length = (e - q + 3) / 4;
5512 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005513 goto onError;
Victor Stinnere64322e2012-10-30 23:12:47 +01005514
Victor Stinnere64322e2012-10-30 23:12:47 +01005515 while (1) {
5516 Py_UCS4 ch = 0;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005517 Py_UCS4 maxch = PyUnicode_MAX_CHAR_VALUE(writer.buffer);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005518
Victor Stinnere64322e2012-10-30 23:12:47 +01005519 if (e - q >= 4) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005520 enum PyUnicode_Kind kind = writer.kind;
5521 void *data = writer.data;
Victor Stinnere64322e2012-10-30 23:12:47 +01005522 const unsigned char *last = e - 4;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005523 Py_ssize_t pos = writer.pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01005524 if (le) {
5525 do {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005526 ch = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
Victor Stinnere64322e2012-10-30 23:12:47 +01005527 if (ch > maxch)
5528 break;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005529 if (kind != PyUnicode_1BYTE_KIND &&
5530 Py_UNICODE_IS_SURROGATE(ch))
5531 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005532 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01005533 q += 4;
5534 } while (q <= last);
5535 }
5536 else {
5537 do {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005538 ch = ((unsigned int)q[0] << 24) | (q[1] << 16) | (q[2] << 8) | q[3];
Victor Stinnere64322e2012-10-30 23:12:47 +01005539 if (ch > maxch)
5540 break;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005541 if (kind != PyUnicode_1BYTE_KIND &&
5542 Py_UNICODE_IS_SURROGATE(ch))
5543 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005544 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01005545 q += 4;
5546 } while (q <= last);
5547 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005548 writer.pos = pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01005549 }
5550
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005551 if (Py_UNICODE_IS_SURROGATE(ch)) {
Serhiy Storchakad3faf432015-01-18 11:28:37 +02005552 errmsg = "code point in surrogate code point range(0xd800, 0xe000)";
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005553 startinpos = ((const char *)q) - starts;
5554 endinpos = startinpos + 4;
5555 }
5556 else if (ch <= maxch) {
Victor Stinnere64322e2012-10-30 23:12:47 +01005557 if (q == e || consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005558 break;
Victor Stinnere64322e2012-10-30 23:12:47 +01005559 /* remaining bytes at the end? (size should be divisible by 4) */
Benjamin Peterson29060642009-01-31 22:14:21 +00005560 errmsg = "truncated data";
Victor Stinnere64322e2012-10-30 23:12:47 +01005561 startinpos = ((const char *)q) - starts;
5562 endinpos = ((const char *)e) - starts;
Benjamin Peterson29060642009-01-31 22:14:21 +00005563 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005564 else {
5565 if (ch < 0x110000) {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005566 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinnere64322e2012-10-30 23:12:47 +01005567 goto onError;
5568 q += 4;
5569 continue;
5570 }
Serhiy Storchakad3faf432015-01-18 11:28:37 +02005571 errmsg = "code point not in range(0x110000)";
Victor Stinnere64322e2012-10-30 23:12:47 +01005572 startinpos = ((const char *)q) - starts;
5573 endinpos = startinpos + 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00005574 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005575
5576 /* The remaining input chars are ignored if the callback
5577 chooses to skip the input */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005578 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00005579 errors, &errorHandler,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005580 encoding, errmsg,
Benjamin Peterson29060642009-01-31 22:14:21 +00005581 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005582 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005583 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005584 }
5585
Walter Dörwald41980ca2007-08-16 21:55:45 +00005586 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005587 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005588
Walter Dörwald41980ca2007-08-16 21:55:45 +00005589 Py_XDECREF(errorHandler);
5590 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005591 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005592
Benjamin Peterson29060642009-01-31 22:14:21 +00005593 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005594 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005595 Py_XDECREF(errorHandler);
5596 Py_XDECREF(exc);
5597 return NULL;
5598}
5599
5600PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005601_PyUnicode_EncodeUTF32(PyObject *str,
5602 const char *errors,
5603 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005604{
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005605 enum PyUnicode_Kind kind;
5606 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005607 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005608 PyObject *v;
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005609 uint32_t *out;
Christian Heimes743e0cd2012-10-17 23:52:17 +02005610#if PY_LITTLE_ENDIAN
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005611 int native_ordering = byteorder <= 0;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005612#else
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005613 int native_ordering = byteorder >= 0;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005614#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005615 const char *encoding;
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005616 Py_ssize_t nsize, pos;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005617 PyObject *errorHandler = NULL;
5618 PyObject *exc = NULL;
5619 PyObject *rep = NULL;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005620
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005621 if (!PyUnicode_Check(str)) {
5622 PyErr_BadArgument();
5623 return NULL;
5624 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005625 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005626 return NULL;
5627 kind = PyUnicode_KIND(str);
5628 data = PyUnicode_DATA(str);
5629 len = PyUnicode_GET_LENGTH(str);
5630
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005631 if (len > PY_SSIZE_T_MAX / 4 - (byteorder == 0))
Serhiy Storchaka30793282014-01-04 22:44:01 +02005632 return PyErr_NoMemory();
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005633 nsize = len + (byteorder == 0);
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01005634 v = PyBytes_FromStringAndSize(NULL, nsize * 4);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005635 if (v == NULL)
5636 return NULL;
5637
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005638 /* output buffer is 4-bytes aligned */
5639 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 4));
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005640 out = (uint32_t *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005641 if (byteorder == 0)
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005642 *out++ = 0xFEFF;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005643 if (len == 0)
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005644 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005645
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005646 if (byteorder == -1)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005647 encoding = "utf-32-le";
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005648 else if (byteorder == 1)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005649 encoding = "utf-32-be";
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005650 else
5651 encoding = "utf-32";
5652
5653 if (kind == PyUnicode_1BYTE_KIND) {
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005654 ucs1lib_utf32_encode((const Py_UCS1 *)data, len, &out, native_ordering);
5655 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005656 }
5657
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005658 pos = 0;
5659 while (pos < len) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005660 Py_ssize_t repsize, moreunits;
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005661
5662 if (kind == PyUnicode_2BYTE_KIND) {
5663 pos += ucs2lib_utf32_encode((const Py_UCS2 *)data + pos, len - pos,
5664 &out, native_ordering);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005665 }
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005666 else {
5667 assert(kind == PyUnicode_4BYTE_KIND);
5668 pos += ucs4lib_utf32_encode((const Py_UCS4 *)data + pos, len - pos,
5669 &out, native_ordering);
5670 }
5671 if (pos == len)
5672 break;
Guido van Rossum98297ee2007-11-06 21:34:58 +00005673
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005674 rep = unicode_encode_call_errorhandler(
5675 errors, &errorHandler,
5676 encoding, "surrogates not allowed",
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005677 str, &exc, pos, pos + 1, &pos);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005678 if (!rep)
5679 goto error;
5680
5681 if (PyBytes_Check(rep)) {
5682 repsize = PyBytes_GET_SIZE(rep);
5683 if (repsize & 3) {
5684 raise_encode_exception(&exc, encoding,
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005685 str, pos - 1, pos,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005686 "surrogates not allowed");
5687 goto error;
5688 }
5689 moreunits = repsize / 4;
5690 }
5691 else {
5692 assert(PyUnicode_Check(rep));
5693 if (PyUnicode_READY(rep) < 0)
5694 goto error;
5695 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5696 if (!PyUnicode_IS_ASCII(rep)) {
5697 raise_encode_exception(&exc, encoding,
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005698 str, pos - 1, pos,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005699 "surrogates not allowed");
5700 goto error;
5701 }
5702 }
5703
5704 /* four bytes are reserved for each surrogate */
5705 if (moreunits > 1) {
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005706 Py_ssize_t outpos = out - (uint32_t*) PyBytes_AS_STRING(v);
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03005707 if (moreunits >= (PY_SSIZE_T_MAX - PyBytes_GET_SIZE(v)) / 4) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005708 /* integer overflow */
5709 PyErr_NoMemory();
5710 goto error;
5711 }
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03005712 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + 4 * (moreunits - 1)) < 0)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005713 goto error;
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005714 out = (uint32_t*) PyBytes_AS_STRING(v) + outpos;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005715 }
5716
5717 if (PyBytes_Check(rep)) {
Christian Heimesf051e432016-09-13 20:22:02 +02005718 memcpy(out, PyBytes_AS_STRING(rep), repsize);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005719 out += moreunits;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005720 } else /* rep is unicode */ {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005721 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005722 ucs1lib_utf32_encode(PyUnicode_1BYTE_DATA(rep), repsize,
5723 &out, native_ordering);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005724 }
5725
5726 Py_CLEAR(rep);
5727 }
5728
5729 /* Cut back to size actually needed. This is necessary for, for example,
5730 encoding of a string containing isolated surrogates and the 'ignore'
5731 handler is used. */
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005732 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005733 if (nsize != PyBytes_GET_SIZE(v))
5734 _PyBytes_Resize(&v, nsize);
5735 Py_XDECREF(errorHandler);
5736 Py_XDECREF(exc);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005737 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005738 return v;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005739 error:
5740 Py_XDECREF(rep);
5741 Py_XDECREF(errorHandler);
5742 Py_XDECREF(exc);
5743 Py_XDECREF(v);
5744 return NULL;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005745}
5746
Alexander Belopolsky40018472011-02-26 01:02:56 +00005747PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005748PyUnicode_EncodeUTF32(const Py_UNICODE *s,
5749 Py_ssize_t size,
5750 const char *errors,
5751 int byteorder)
5752{
5753 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02005754 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005755 if (tmp == NULL)
5756 return NULL;
5757 result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder);
5758 Py_DECREF(tmp);
5759 return result;
5760}
5761
5762PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005763PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005764{
Victor Stinnerb960b342011-11-20 19:12:52 +01005765 return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005766}
5767
Guido van Rossumd57fd912000-03-10 22:53:23 +00005768/* --- UTF-16 Codec ------------------------------------------------------- */
5769
Tim Peters772747b2001-08-09 22:21:55 +00005770PyObject *
5771PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005772 Py_ssize_t size,
5773 const char *errors,
5774 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005775{
Walter Dörwald69652032004-09-07 20:24:22 +00005776 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5777}
5778
5779PyObject *
5780PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005781 Py_ssize_t size,
5782 const char *errors,
5783 int *byteorder,
5784 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00005785{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005786 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005787 Py_ssize_t startinpos;
5788 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005789 _PyUnicodeWriter writer;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005790 const unsigned char *q, *e;
Tim Peters772747b2001-08-09 22:21:55 +00005791 int bo = 0; /* assume native ordering by default */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005792 int native_ordering;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00005793 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005794 PyObject *errorHandler = NULL;
5795 PyObject *exc = NULL;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005796 const char *encoding;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005797
Tim Peters772747b2001-08-09 22:21:55 +00005798 q = (unsigned char *)s;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005799 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005800
5801 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00005802 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005803
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005804 /* Check for BOM marks (U+FEFF) in the input and adjust current
5805 byte order setting accordingly. In native mode, the leading BOM
5806 mark is skipped, in all other modes, it is copied to the output
5807 stream as-is (giving a ZWNBSP character). */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005808 if (bo == 0 && size >= 2) {
5809 const Py_UCS4 bom = (q[1] << 8) | q[0];
5810 if (bom == 0xFEFF) {
5811 q += 2;
5812 bo = -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00005813 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005814 else if (bom == 0xFFFE) {
5815 q += 2;
5816 bo = 1;
5817 }
5818 if (byteorder)
5819 *byteorder = bo;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005820 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005821
Antoine Pitrou63065d72012-05-15 23:48:04 +02005822 if (q == e) {
5823 if (consumed)
5824 *consumed = size;
Serhiy Storchaka678db842013-01-26 12:16:36 +02005825 _Py_RETURN_UNICODE_EMPTY();
Tim Peters772747b2001-08-09 22:21:55 +00005826 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005827
Christian Heimes743e0cd2012-10-17 23:52:17 +02005828#if PY_LITTLE_ENDIAN
Antoine Pitrou63065d72012-05-15 23:48:04 +02005829 native_ordering = bo <= 0;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005830 encoding = bo <= 0 ? "utf-16-le" : "utf-16-be";
Antoine Pitrouab868312009-01-10 15:40:25 +00005831#else
Antoine Pitrou63065d72012-05-15 23:48:04 +02005832 native_ordering = bo >= 0;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005833 encoding = bo >= 0 ? "utf-16-be" : "utf-16-le";
Antoine Pitrouab868312009-01-10 15:40:25 +00005834#endif
Tim Peters772747b2001-08-09 22:21:55 +00005835
Antoine Pitrou63065d72012-05-15 23:48:04 +02005836 /* Note: size will always be longer than the resulting Unicode
Xiang Zhang2c7fd462018-01-31 20:48:05 +08005837 character count normally. Error handler will take care of
5838 resizing when needed. */
Victor Stinner8f674cc2013-04-17 23:02:17 +02005839 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02005840 writer.min_length = (e - q + 1) / 2;
5841 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005842 goto onError;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005843
Antoine Pitrou63065d72012-05-15 23:48:04 +02005844 while (1) {
5845 Py_UCS4 ch = 0;
5846 if (e - q >= 2) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005847 int kind = writer.kind;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005848 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005849 if (PyUnicode_IS_ASCII(writer.buffer))
Antoine Pitrou63065d72012-05-15 23:48:04 +02005850 ch = asciilib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005851 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005852 native_ordering);
5853 else
5854 ch = ucs1lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005855 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005856 native_ordering);
5857 } else if (kind == PyUnicode_2BYTE_KIND) {
5858 ch = ucs2lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005859 (Py_UCS2*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005860 native_ordering);
5861 } else {
5862 assert(kind == PyUnicode_4BYTE_KIND);
5863 ch = ucs4lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005864 (Py_UCS4*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005865 native_ordering);
Antoine Pitrouab868312009-01-10 15:40:25 +00005866 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005867 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005868
Antoine Pitrou63065d72012-05-15 23:48:04 +02005869 switch (ch)
5870 {
5871 case 0:
5872 /* remaining byte at the end? (size should be even) */
5873 if (q == e || consumed)
5874 goto End;
5875 errmsg = "truncated data";
5876 startinpos = ((const char *)q) - starts;
5877 endinpos = ((const char *)e) - starts;
5878 break;
5879 /* The remaining input chars are ignored if the callback
5880 chooses to skip the input */
5881 case 1:
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02005882 q -= 2;
5883 if (consumed)
Serhiy Storchakaae3b32a2013-01-08 23:40:52 +02005884 goto End;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005885 errmsg = "unexpected end of data";
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02005886 startinpos = ((const char *)q) - starts;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005887 endinpos = ((const char *)e) - starts;
5888 break;
5889 case 2:
5890 errmsg = "illegal encoding";
5891 startinpos = ((const char *)q) - 2 - starts;
5892 endinpos = startinpos + 2;
5893 break;
5894 case 3:
5895 errmsg = "illegal UTF-16 surrogate";
5896 startinpos = ((const char *)q) - 4 - starts;
5897 endinpos = startinpos + 2;
5898 break;
5899 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005900 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005901 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005902 continue;
5903 }
5904
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005905 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrouab868312009-01-10 15:40:25 +00005906 errors,
5907 &errorHandler,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005908 encoding, errmsg,
Antoine Pitrouab868312009-01-10 15:40:25 +00005909 &starts,
5910 (const char **)&e,
5911 &startinpos,
5912 &endinpos,
5913 &exc,
5914 (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005915 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005916 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005917 }
5918
Antoine Pitrou63065d72012-05-15 23:48:04 +02005919End:
Walter Dörwald69652032004-09-07 20:24:22 +00005920 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005921 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00005922
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005923 Py_XDECREF(errorHandler);
5924 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005925 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005926
Benjamin Peterson29060642009-01-31 22:14:21 +00005927 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005928 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005929 Py_XDECREF(errorHandler);
5930 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005931 return NULL;
5932}
5933
Tim Peters772747b2001-08-09 22:21:55 +00005934PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005935_PyUnicode_EncodeUTF16(PyObject *str,
5936 const char *errors,
5937 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005938{
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005939 enum PyUnicode_Kind kind;
5940 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005941 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005942 PyObject *v;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005943 unsigned short *out;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005944 Py_ssize_t pairs;
Christian Heimes743e0cd2012-10-17 23:52:17 +02005945#if PY_BIG_ENDIAN
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005946 int native_ordering = byteorder >= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005947#else
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005948 int native_ordering = byteorder <= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005949#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005950 const char *encoding;
5951 Py_ssize_t nsize, pos;
5952 PyObject *errorHandler = NULL;
5953 PyObject *exc = NULL;
5954 PyObject *rep = NULL;
Tim Peters772747b2001-08-09 22:21:55 +00005955
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005956 if (!PyUnicode_Check(str)) {
5957 PyErr_BadArgument();
5958 return NULL;
5959 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005960 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005961 return NULL;
5962 kind = PyUnicode_KIND(str);
5963 data = PyUnicode_DATA(str);
5964 len = PyUnicode_GET_LENGTH(str);
Victor Stinner0e368262011-11-10 20:12:49 +01005965
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005966 pairs = 0;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005967 if (kind == PyUnicode_4BYTE_KIND) {
5968 const Py_UCS4 *in = (const Py_UCS4 *)data;
5969 const Py_UCS4 *end = in + len;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005970 while (in < end) {
5971 if (*in++ >= 0x10000) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005972 pairs++;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005973 }
5974 }
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005975 }
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005976 if (len > PY_SSIZE_T_MAX / 2 - pairs - (byteorder == 0)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005977 return PyErr_NoMemory();
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005978 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005979 nsize = len + pairs + (byteorder == 0);
5980 v = PyBytes_FromStringAndSize(NULL, nsize * 2);
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005981 if (v == NULL) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005982 return NULL;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005983 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005984
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005985 /* output buffer is 2-bytes aligned */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02005986 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 2));
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005987 out = (unsigned short *)PyBytes_AS_STRING(v);
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005988 if (byteorder == 0) {
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005989 *out++ = 0xFEFF;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005990 }
5991 if (len == 0) {
Guido van Rossum98297ee2007-11-06 21:34:58 +00005992 goto done;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005993 }
Tim Peters772747b2001-08-09 22:21:55 +00005994
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005995 if (kind == PyUnicode_1BYTE_KIND) {
5996 ucs1lib_utf16_encode((const Py_UCS1 *)data, len, &out, native_ordering);
5997 goto done;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005998 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005999
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006000 if (byteorder < 0) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006001 encoding = "utf-16-le";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006002 }
6003 else if (byteorder > 0) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006004 encoding = "utf-16-be";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006005 }
6006 else {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006007 encoding = "utf-16";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006008 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006009
6010 pos = 0;
6011 while (pos < len) {
6012 Py_ssize_t repsize, moreunits;
6013
6014 if (kind == PyUnicode_2BYTE_KIND) {
6015 pos += ucs2lib_utf16_encode((const Py_UCS2 *)data + pos, len - pos,
6016 &out, native_ordering);
6017 }
6018 else {
6019 assert(kind == PyUnicode_4BYTE_KIND);
6020 pos += ucs4lib_utf16_encode((const Py_UCS4 *)data + pos, len - pos,
6021 &out, native_ordering);
6022 }
6023 if (pos == len)
6024 break;
6025
6026 rep = unicode_encode_call_errorhandler(
6027 errors, &errorHandler,
6028 encoding, "surrogates not allowed",
6029 str, &exc, pos, pos + 1, &pos);
6030 if (!rep)
6031 goto error;
6032
6033 if (PyBytes_Check(rep)) {
6034 repsize = PyBytes_GET_SIZE(rep);
6035 if (repsize & 1) {
6036 raise_encode_exception(&exc, encoding,
6037 str, pos - 1, pos,
6038 "surrogates not allowed");
6039 goto error;
6040 }
6041 moreunits = repsize / 2;
6042 }
6043 else {
6044 assert(PyUnicode_Check(rep));
6045 if (PyUnicode_READY(rep) < 0)
6046 goto error;
6047 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
6048 if (!PyUnicode_IS_ASCII(rep)) {
6049 raise_encode_exception(&exc, encoding,
6050 str, pos - 1, pos,
6051 "surrogates not allowed");
6052 goto error;
6053 }
6054 }
6055
6056 /* two bytes are reserved for each surrogate */
6057 if (moreunits > 1) {
6058 Py_ssize_t outpos = out - (unsigned short*) PyBytes_AS_STRING(v);
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03006059 if (moreunits >= (PY_SSIZE_T_MAX - PyBytes_GET_SIZE(v)) / 2) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006060 /* integer overflow */
6061 PyErr_NoMemory();
6062 goto error;
6063 }
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03006064 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + 2 * (moreunits - 1)) < 0)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006065 goto error;
6066 out = (unsigned short*) PyBytes_AS_STRING(v) + outpos;
6067 }
6068
6069 if (PyBytes_Check(rep)) {
Christian Heimesf051e432016-09-13 20:22:02 +02006070 memcpy(out, PyBytes_AS_STRING(rep), repsize);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006071 out += moreunits;
6072 } else /* rep is unicode */ {
6073 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
6074 ucs1lib_utf16_encode(PyUnicode_1BYTE_DATA(rep), repsize,
6075 &out, native_ordering);
6076 }
6077
6078 Py_CLEAR(rep);
6079 }
6080
6081 /* Cut back to size actually needed. This is necessary for, for example,
6082 encoding of a string containing isolated surrogates and the 'ignore' handler
6083 is used. */
6084 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
6085 if (nsize != PyBytes_GET_SIZE(v))
6086 _PyBytes_Resize(&v, nsize);
6087 Py_XDECREF(errorHandler);
6088 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00006089 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006090 return v;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006091 error:
6092 Py_XDECREF(rep);
6093 Py_XDECREF(errorHandler);
6094 Py_XDECREF(exc);
6095 Py_XDECREF(v);
6096 return NULL;
6097#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00006098}
6099
Alexander Belopolsky40018472011-02-26 01:02:56 +00006100PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006101PyUnicode_EncodeUTF16(const Py_UNICODE *s,
6102 Py_ssize_t size,
6103 const char *errors,
6104 int byteorder)
6105{
6106 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02006107 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006108 if (tmp == NULL)
6109 return NULL;
6110 result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder);
6111 Py_DECREF(tmp);
6112 return result;
6113}
6114
6115PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00006116PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006117{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006118 return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006119}
6120
6121/* --- Unicode Escape Codec ----------------------------------------------- */
6122
/* File-scope pointer to a _PyUnicode_Name_CAPI table; NULL until assigned.
   NOTE(review): presumably filled in lazily by the escape decoder below when
   it needs character-name lookups -- confirm against its users. */
static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00006124
Alexander Belopolsky40018472011-02-26 01:02:56 +00006125PyObject *
Eric V. Smith42454af2016-10-31 09:22:08 -04006126_PyUnicode_DecodeUnicodeEscape(const char *s,
6127 Py_ssize_t size,
6128 const char *errors,
6129 const char **first_invalid_escape)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006130{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006131 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006132 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006133 const char *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006134 PyObject *errorHandler = NULL;
6135 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006136
Eric V. Smith42454af2016-10-31 09:22:08 -04006137 // so we can remember if we've seen an invalid escape char or not
6138 *first_invalid_escape = NULL;
6139
Victor Stinner62ec3312016-09-06 17:04:34 -07006140 if (size == 0) {
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02006141 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner62ec3312016-09-06 17:04:34 -07006142 }
6143 /* Escaped strings will always be longer than the resulting
6144 Unicode string, so we start with size here and then reduce the
6145 length after conversion to the true value.
6146 (but if the error callback returns a long replacement string
6147 we'll have to allocate more space) */
Victor Stinner8f674cc2013-04-17 23:02:17 +02006148 _PyUnicodeWriter_Init(&writer);
Victor Stinner62ec3312016-09-06 17:04:34 -07006149 writer.min_length = size;
6150 if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
6151 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006152 }
6153
Guido van Rossumd57fd912000-03-10 22:53:23 +00006154 end = s + size;
6155 while (s < end) {
Victor Stinner62ec3312016-09-06 17:04:34 -07006156 unsigned char c = (unsigned char) *s++;
6157 Py_UCS4 ch;
6158 int count;
6159 Py_ssize_t startinpos;
6160 Py_ssize_t endinpos;
6161 const char *message;
6162
6163#define WRITE_ASCII_CHAR(ch) \
6164 do { \
6165 assert(ch <= 127); \
6166 assert(writer.pos < writer.size); \
6167 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6168 } while(0)
6169
6170#define WRITE_CHAR(ch) \
6171 do { \
6172 if (ch <= writer.maxchar) { \
6173 assert(writer.pos < writer.size); \
6174 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6175 } \
6176 else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
6177 goto onError; \
6178 } \
6179 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006180
6181 /* Non-escape characters are interpreted as Unicode ordinals */
Victor Stinner62ec3312016-09-06 17:04:34 -07006182 if (c != '\\') {
6183 WRITE_CHAR(c);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006184 continue;
6185 }
6186
Victor Stinner62ec3312016-09-06 17:04:34 -07006187 startinpos = s - starts - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006188 /* \ - Escapes */
Victor Stinner62ec3312016-09-06 17:04:34 -07006189 if (s >= end) {
6190 message = "\\ at end of string";
6191 goto error;
6192 }
6193 c = (unsigned char) *s++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006194
Victor Stinner62ec3312016-09-06 17:04:34 -07006195 assert(writer.pos < writer.size);
Guido van Rossum8ce8a782007-11-01 19:42:39 +00006196 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006197
Benjamin Peterson29060642009-01-31 22:14:21 +00006198 /* \x escapes */
Victor Stinner62ec3312016-09-06 17:04:34 -07006199 case '\n': continue;
6200 case '\\': WRITE_ASCII_CHAR('\\'); continue;
6201 case '\'': WRITE_ASCII_CHAR('\''); continue;
6202 case '\"': WRITE_ASCII_CHAR('\"'); continue;
6203 case 'b': WRITE_ASCII_CHAR('\b'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006204 /* FF */
Victor Stinner62ec3312016-09-06 17:04:34 -07006205 case 'f': WRITE_ASCII_CHAR('\014'); continue;
6206 case 't': WRITE_ASCII_CHAR('\t'); continue;
6207 case 'n': WRITE_ASCII_CHAR('\n'); continue;
6208 case 'r': WRITE_ASCII_CHAR('\r'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006209 /* VT */
Victor Stinner62ec3312016-09-06 17:04:34 -07006210 case 'v': WRITE_ASCII_CHAR('\013'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006211 /* BEL, not classic C */
Victor Stinner62ec3312016-09-06 17:04:34 -07006212 case 'a': WRITE_ASCII_CHAR('\007'); continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006213
Benjamin Peterson29060642009-01-31 22:14:21 +00006214 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006215 case '0': case '1': case '2': case '3':
6216 case '4': case '5': case '6': case '7':
Victor Stinner62ec3312016-09-06 17:04:34 -07006217 ch = c - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00006218 if (s < end && '0' <= *s && *s <= '7') {
Victor Stinner62ec3312016-09-06 17:04:34 -07006219 ch = (ch<<3) + *s++ - '0';
6220 if (s < end && '0' <= *s && *s <= '7') {
6221 ch = (ch<<3) + *s++ - '0';
6222 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006223 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006224 WRITE_CHAR(ch);
6225 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006226
Benjamin Peterson29060642009-01-31 22:14:21 +00006227 /* hex escapes */
6228 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006229 case 'x':
Victor Stinner62ec3312016-09-06 17:04:34 -07006230 count = 2;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006231 message = "truncated \\xXX escape";
6232 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006233
Benjamin Peterson29060642009-01-31 22:14:21 +00006234 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006235 case 'u':
Victor Stinner62ec3312016-09-06 17:04:34 -07006236 count = 4;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006237 message = "truncated \\uXXXX escape";
6238 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006239
Benjamin Peterson29060642009-01-31 22:14:21 +00006240 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00006241 case 'U':
Victor Stinner62ec3312016-09-06 17:04:34 -07006242 count = 8;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006243 message = "truncated \\UXXXXXXXX escape";
6244 hexescape:
Victor Stinner62ec3312016-09-06 17:04:34 -07006245 for (ch = 0; count && s < end; ++s, --count) {
Serhiy Storchakad6793772013-01-29 10:20:44 +02006246 c = (unsigned char)*s;
Victor Stinner62ec3312016-09-06 17:04:34 -07006247 ch <<= 4;
6248 if (c >= '0' && c <= '9') {
6249 ch += c - '0';
6250 }
6251 else if (c >= 'a' && c <= 'f') {
6252 ch += c - ('a' - 10);
6253 }
6254 else if (c >= 'A' && c <= 'F') {
6255 ch += c - ('A' - 10);
6256 }
6257 else {
6258 break;
6259 }
Fredrik Lundhdf846752000-09-03 11:29:49 +00006260 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006261 if (count) {
Serhiy Storchakad6793772013-01-29 10:20:44 +02006262 goto error;
Victor Stinner62ec3312016-09-06 17:04:34 -07006263 }
6264
6265 /* when we get here, ch is a 32-bit unicode character */
6266 if (ch > MAX_UNICODE) {
6267 message = "illegal Unicode character";
6268 goto error;
6269 }
6270
6271 WRITE_CHAR(ch);
6272 continue;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006273
Benjamin Peterson29060642009-01-31 22:14:21 +00006274 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00006275 case 'N':
Fredrik Lundhccc74732001-02-18 22:13:49 +00006276 if (ucnhash_CAPI == NULL) {
6277 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006278 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
6279 PyUnicodeData_CAPSULE_NAME, 1);
Victor Stinner62ec3312016-09-06 17:04:34 -07006280 if (ucnhash_CAPI == NULL) {
6281 PyErr_SetString(
6282 PyExc_UnicodeError,
6283 "\\N escapes not supported (can't load unicodedata module)"
6284 );
6285 goto onError;
6286 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00006287 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006288
6289 message = "malformed \\N character escape";
Gregory P. Smith746b2d32018-11-13 13:16:54 -08006290 if (s < end && *s == '{') {
Victor Stinner62ec3312016-09-06 17:04:34 -07006291 const char *start = ++s;
6292 size_t namelen;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006293 /* look for the closing brace */
Victor Stinner62ec3312016-09-06 17:04:34 -07006294 while (s < end && *s != '}')
Fredrik Lundhccc74732001-02-18 22:13:49 +00006295 s++;
Victor Stinner62ec3312016-09-06 17:04:34 -07006296 namelen = s - start;
6297 if (namelen && s < end) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00006298 /* found a name. look it up in the unicode database */
Fredrik Lundhccc74732001-02-18 22:13:49 +00006299 s++;
Victor Stinner62ec3312016-09-06 17:04:34 -07006300 ch = 0xffffffff; /* in case 'getcode' messes up */
6301 if (namelen <= INT_MAX &&
6302 ucnhash_CAPI->getcode(NULL, start, (int)namelen,
6303 &ch, 0)) {
6304 assert(ch <= MAX_UNICODE);
6305 WRITE_CHAR(ch);
6306 continue;
6307 }
6308 message = "unknown Unicode character name";
Fredrik Lundhccc74732001-02-18 22:13:49 +00006309 }
6310 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02006311 goto error;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006312
6313 default:
Eric V. Smith42454af2016-10-31 09:22:08 -04006314 if (*first_invalid_escape == NULL) {
6315 *first_invalid_escape = s-1; /* Back up one char, since we've
6316 already incremented s. */
6317 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006318 WRITE_ASCII_CHAR('\\');
6319 WRITE_CHAR(c);
6320 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006321 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02006322
6323 error:
6324 endinpos = s-starts;
Victor Stinner62ec3312016-09-06 17:04:34 -07006325 writer.min_length = end - s + writer.pos;
Serhiy Storchaka8fe5a9f2013-01-29 10:37:39 +02006326 if (unicode_decode_call_errorhandler_writer(
Serhiy Storchakad6793772013-01-29 10:20:44 +02006327 errors, &errorHandler,
6328 "unicodeescape", message,
6329 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinner62ec3312016-09-06 17:04:34 -07006330 &writer)) {
Serhiy Storchakad6793772013-01-29 10:20:44 +02006331 goto onError;
Victor Stinner62ec3312016-09-06 17:04:34 -07006332 }
Serhiy Storchakab7e2d672018-02-13 08:27:33 +02006333 assert(end - s <= writer.size - writer.pos);
Victor Stinner62ec3312016-09-06 17:04:34 -07006334
6335#undef WRITE_ASCII_CHAR
6336#undef WRITE_CHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00006337 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006338
Walter Dörwaldd4ade082003-08-15 15:00:26 +00006339 Py_XDECREF(errorHandler);
6340 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006341 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald8c077222002-03-25 11:16:18 +00006342
Benjamin Peterson29060642009-01-31 22:14:21 +00006343 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006344 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006345 Py_XDECREF(errorHandler);
6346 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006347 return NULL;
6348}
6349
Eric V. Smith42454af2016-10-31 09:22:08 -04006350PyObject *
6351PyUnicode_DecodeUnicodeEscape(const char *s,
6352 Py_ssize_t size,
6353 const char *errors)
6354{
6355 const char *first_invalid_escape;
6356 PyObject *result = _PyUnicode_DecodeUnicodeEscape(s, size, errors,
6357 &first_invalid_escape);
6358 if (result == NULL)
6359 return NULL;
6360 if (first_invalid_escape != NULL) {
6361 if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
6362 "invalid escape sequence '\\%c'",
Serhiy Storchaka56cb4652017-10-20 17:08:15 +03006363 (unsigned char)*first_invalid_escape) < 0) {
Eric V. Smith42454af2016-10-31 09:22:08 -04006364 Py_DECREF(result);
6365 return NULL;
6366 }
6367 }
6368 return result;
6369}
6370
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006371/* Return a Unicode-Escape string version of the Unicode object. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006372
Alexander Belopolsky40018472011-02-26 01:02:56 +00006373PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006374PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006375{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006376 Py_ssize_t i, len;
Victor Stinner62ec3312016-09-06 17:04:34 -07006377 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006378 char *p;
Victor Stinner62ec3312016-09-06 17:04:34 -07006379 enum PyUnicode_Kind kind;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006380 void *data;
Victor Stinner62ec3312016-09-06 17:04:34 -07006381 Py_ssize_t expandsize;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006382
Ezio Melottie7f90372012-10-05 03:33:31 +03006383 /* Initial allocation is based on the longest-possible character
Thomas Wouters89f507f2006-12-13 04:49:30 +00006384 escape.
6385
Ezio Melottie7f90372012-10-05 03:33:31 +03006386 For UCS1 strings it's '\xxx', 4 bytes per source character.
6387 For UCS2 strings it's '\uxxxx', 6 bytes per source character.
6388 For UCS4 strings it's '\U00xxxxxx', 10 bytes per source character.
Thomas Wouters89f507f2006-12-13 04:49:30 +00006389 */
6390
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006391 if (!PyUnicode_Check(unicode)) {
6392 PyErr_BadArgument();
6393 return NULL;
6394 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006395 if (PyUnicode_READY(unicode) == -1) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006396 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006397 }
Victor Stinner358af132015-10-12 22:36:57 +02006398
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006399 len = PyUnicode_GET_LENGTH(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006400 if (len == 0) {
6401 return PyBytes_FromStringAndSize(NULL, 0);
6402 }
6403
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006404 kind = PyUnicode_KIND(unicode);
6405 data = PyUnicode_DATA(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006406 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6407 bytes, and 1 byte characters 4. */
6408 expandsize = kind * 2 + 2;
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006409 if (len > PY_SSIZE_T_MAX / expandsize) {
Victor Stinner62ec3312016-09-06 17:04:34 -07006410 return PyErr_NoMemory();
6411 }
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006412 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
Victor Stinner62ec3312016-09-06 17:04:34 -07006413 if (repr == NULL) {
6414 return NULL;
6415 }
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006416
Victor Stinner62ec3312016-09-06 17:04:34 -07006417 p = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006418 for (i = 0; i < len; i++) {
Victor Stinner3326cb62011-11-10 20:15:25 +01006419 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006420
Victor Stinner62ec3312016-09-06 17:04:34 -07006421 /* U+0000-U+00ff range */
6422 if (ch < 0x100) {
6423 if (ch >= ' ' && ch < 127) {
6424 if (ch != '\\') {
6425 /* Copy printable US ASCII as-is */
6426 *p++ = (char) ch;
6427 }
6428 /* Escape backslashes */
6429 else {
6430 *p++ = '\\';
6431 *p++ = '\\';
6432 }
6433 }
Victor Stinner358af132015-10-12 22:36:57 +02006434
Victor Stinner62ec3312016-09-06 17:04:34 -07006435 /* Map special whitespace to '\t', \n', '\r' */
6436 else if (ch == '\t') {
6437 *p++ = '\\';
6438 *p++ = 't';
6439 }
6440 else if (ch == '\n') {
6441 *p++ = '\\';
6442 *p++ = 'n';
6443 }
6444 else if (ch == '\r') {
6445 *p++ = '\\';
6446 *p++ = 'r';
6447 }
6448
6449 /* Map non-printable US ASCII and 8-bit characters to '\xHH' */
6450 else {
6451 *p++ = '\\';
6452 *p++ = 'x';
6453 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6454 *p++ = Py_hexdigits[ch & 0x000F];
6455 }
Tim Petersced69f82003-09-16 20:30:58 +00006456 }
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006457 /* U+0100-U+ffff range: Map 16-bit characters to '\uHHHH' */
Victor Stinner62ec3312016-09-06 17:04:34 -07006458 else if (ch < 0x10000) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006459 *p++ = '\\';
6460 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006461 *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
6462 *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
6463 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6464 *p++ = Py_hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006465 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006466 /* U+010000-U+10ffff range: Map 21-bit characters to '\U00HHHHHH' */
6467 else {
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006468
Victor Stinner62ec3312016-09-06 17:04:34 -07006469 /* Make sure that the first two digits are zero */
6470 assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006471 *p++ = '\\';
Victor Stinner62ec3312016-09-06 17:04:34 -07006472 *p++ = 'U';
6473 *p++ = '0';
6474 *p++ = '0';
6475 *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
6476 *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
6477 *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
6478 *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
6479 *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
6480 *p++ = Py_hexdigits[ch & 0x0000000F];
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006481 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006482 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006483
Victor Stinner62ec3312016-09-06 17:04:34 -07006484 assert(p - PyBytes_AS_STRING(repr) > 0);
6485 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) {
6486 return NULL;
6487 }
6488 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006489}
6490
Alexander Belopolsky40018472011-02-26 01:02:56 +00006491PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006492PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
6493 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006494{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006495 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02006496 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Victor Stinner62ec3312016-09-06 17:04:34 -07006497 if (tmp == NULL) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006498 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006499 }
6500
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006501 result = PyUnicode_AsUnicodeEscapeString(tmp);
6502 Py_DECREF(tmp);
6503 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006504}
6505
/* --- Raw Unicode Escape Codec ------------------------------------------- */
6507
Alexander Belopolsky40018472011-02-26 01:02:56 +00006508PyObject *
6509PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006510 Py_ssize_t size,
6511 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006512{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006513 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006514 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006515 const char *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006516 PyObject *errorHandler = NULL;
6517 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006518
Victor Stinner62ec3312016-09-06 17:04:34 -07006519 if (size == 0) {
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02006520 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner62ec3312016-09-06 17:04:34 -07006521 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006522
Guido van Rossumd57fd912000-03-10 22:53:23 +00006523 /* Escaped strings will always be longer than the resulting
6524 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006525 length after conversion to the true value. (But decoding error
6526 handler might have to resize the string) */
Victor Stinner8f674cc2013-04-17 23:02:17 +02006527 _PyUnicodeWriter_Init(&writer);
Inada Naoki770847a2019-06-24 12:30:24 +09006528 writer.min_length = size;
Victor Stinner62ec3312016-09-06 17:04:34 -07006529 if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
6530 goto onError;
6531 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006532
Guido van Rossumd57fd912000-03-10 22:53:23 +00006533 end = s + size;
6534 while (s < end) {
Victor Stinner62ec3312016-09-06 17:04:34 -07006535 unsigned char c = (unsigned char) *s++;
6536 Py_UCS4 ch;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006537 int count;
Victor Stinner62ec3312016-09-06 17:04:34 -07006538 Py_ssize_t startinpos;
6539 Py_ssize_t endinpos;
6540 const char *message;
6541
6542#define WRITE_CHAR(ch) \
6543 do { \
6544 if (ch <= writer.maxchar) { \
6545 assert(writer.pos < writer.size); \
6546 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6547 } \
6548 else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
6549 goto onError; \
6550 } \
6551 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006552
Benjamin Peterson29060642009-01-31 22:14:21 +00006553 /* Non-escape characters are interpreted as Unicode ordinals */
Victor Stinner62ec3312016-09-06 17:04:34 -07006554 if (c != '\\' || s >= end) {
6555 WRITE_CHAR(c);
Benjamin Peterson29060642009-01-31 22:14:21 +00006556 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006557 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006558
Victor Stinner62ec3312016-09-06 17:04:34 -07006559 c = (unsigned char) *s++;
6560 if (c == 'u') {
6561 count = 4;
6562 message = "truncated \\uXXXX escape";
Benjamin Peterson29060642009-01-31 22:14:21 +00006563 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006564 else if (c == 'U') {
6565 count = 8;
6566 message = "truncated \\UXXXXXXXX escape";
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006567 }
6568 else {
Victor Stinner62ec3312016-09-06 17:04:34 -07006569 assert(writer.pos < writer.size);
6570 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, '\\');
6571 WRITE_CHAR(c);
6572 continue;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006573 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006574 startinpos = s - starts - 2;
6575
6576 /* \uHHHH with 4 hex digits, \U00HHHHHH with 8 */
6577 for (ch = 0; count && s < end; ++s, --count) {
6578 c = (unsigned char)*s;
6579 ch <<= 4;
6580 if (c >= '0' && c <= '9') {
6581 ch += c - '0';
6582 }
6583 else if (c >= 'a' && c <= 'f') {
6584 ch += c - ('a' - 10);
6585 }
6586 else if (c >= 'A' && c <= 'F') {
6587 ch += c - ('A' - 10);
6588 }
6589 else {
6590 break;
6591 }
6592 }
6593 if (!count) {
6594 if (ch <= MAX_UNICODE) {
6595 WRITE_CHAR(ch);
6596 continue;
6597 }
6598 message = "\\Uxxxxxxxx out of range";
6599 }
6600
6601 endinpos = s-starts;
6602 writer.min_length = end - s + writer.pos;
6603 if (unicode_decode_call_errorhandler_writer(
6604 errors, &errorHandler,
6605 "rawunicodeescape", message,
6606 &starts, &end, &startinpos, &endinpos, &exc, &s,
6607 &writer)) {
6608 goto onError;
6609 }
Serhiy Storchakab7e2d672018-02-13 08:27:33 +02006610 assert(end - s <= writer.size - writer.pos);
Victor Stinner62ec3312016-09-06 17:04:34 -07006611
6612#undef WRITE_CHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00006613 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006614 Py_XDECREF(errorHandler);
6615 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006616 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00006617
Benjamin Peterson29060642009-01-31 22:14:21 +00006618 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006619 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006620 Py_XDECREF(errorHandler);
6621 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006622 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006623
Guido van Rossumd57fd912000-03-10 22:53:23 +00006624}
6625
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006626
Alexander Belopolsky40018472011-02-26 01:02:56 +00006627PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006628PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006629{
Victor Stinner62ec3312016-09-06 17:04:34 -07006630 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006631 char *p;
Victor Stinner62ec3312016-09-06 17:04:34 -07006632 Py_ssize_t expandsize, pos;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006633 int kind;
6634 void *data;
6635 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006636
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006637 if (!PyUnicode_Check(unicode)) {
6638 PyErr_BadArgument();
6639 return NULL;
6640 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006641 if (PyUnicode_READY(unicode) == -1) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006642 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006643 }
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006644 kind = PyUnicode_KIND(unicode);
6645 data = PyUnicode_DATA(unicode);
6646 len = PyUnicode_GET_LENGTH(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006647 if (kind == PyUnicode_1BYTE_KIND) {
6648 return PyBytes_FromStringAndSize(data, len);
6649 }
Victor Stinner0e368262011-11-10 20:12:49 +01006650
Victor Stinner62ec3312016-09-06 17:04:34 -07006651 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6652 bytes, and 1 byte characters 4. */
6653 expandsize = kind * 2 + 2;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006654
Victor Stinner62ec3312016-09-06 17:04:34 -07006655 if (len > PY_SSIZE_T_MAX / expandsize) {
6656 return PyErr_NoMemory();
6657 }
6658 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
6659 if (repr == NULL) {
6660 return NULL;
6661 }
6662 if (len == 0) {
6663 return repr;
6664 }
6665
6666 p = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006667 for (pos = 0; pos < len; pos++) {
6668 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Victor Stinner358af132015-10-12 22:36:57 +02006669
Victor Stinner62ec3312016-09-06 17:04:34 -07006670 /* U+0000-U+00ff range: Copy 8-bit characters as-is */
6671 if (ch < 0x100) {
6672 *p++ = (char) ch;
Tim Petersced69f82003-09-16 20:30:58 +00006673 }
Xiang Zhang2b77a922018-02-13 18:33:32 +08006674 /* U+0100-U+ffff range: Map 16-bit characters to '\uHHHH' */
Victor Stinner62ec3312016-09-06 17:04:34 -07006675 else if (ch < 0x10000) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006676 *p++ = '\\';
6677 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006678 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6679 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6680 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6681 *p++ = Py_hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006682 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006683 /* U+010000-U+10ffff range: Map 32-bit characters to '\U00HHHHHH' */
6684 else {
6685 assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
6686 *p++ = '\\';
6687 *p++ = 'U';
6688 *p++ = '0';
6689 *p++ = '0';
6690 *p++ = Py_hexdigits[(ch >> 20) & 0xf];
6691 *p++ = Py_hexdigits[(ch >> 16) & 0xf];
6692 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6693 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6694 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6695 *p++ = Py_hexdigits[ch & 15];
6696 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006697 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006698
Victor Stinner62ec3312016-09-06 17:04:34 -07006699 assert(p > PyBytes_AS_STRING(repr));
6700 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) {
6701 return NULL;
6702 }
6703 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006704}
6705
Alexander Belopolsky40018472011-02-26 01:02:56 +00006706PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006707PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
6708 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006709{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006710 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02006711 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006712 if (tmp == NULL)
Walter Dörwald711005d2007-05-12 12:03:26 +00006713 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006714 result = PyUnicode_AsRawUnicodeEscapeString(tmp);
6715 Py_DECREF(tmp);
6716 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006717}
6718
/* --- Latin-1 Codec ------------------------------------------------------ */
6720
Alexander Belopolsky40018472011-02-26 01:02:56 +00006721PyObject *
6722PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006723 Py_ssize_t size,
6724 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006725{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006726 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Victor Stinnere57b1c02011-09-28 22:20:48 +02006727 return _PyUnicode_FromUCS1((unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006728}
6729
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006730/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006731static void
6732make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006733 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006734 PyObject *unicode,
6735 Py_ssize_t startpos, Py_ssize_t endpos,
6736 const char *reason)
6737{
6738 if (*exceptionObject == NULL) {
6739 *exceptionObject = PyObject_CallFunction(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006740 PyExc_UnicodeEncodeError, "sOnns",
Martin v. Löwis9e816682011-11-02 12:45:42 +01006741 encoding, unicode, startpos, endpos, reason);
6742 }
6743 else {
6744 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6745 goto onError;
6746 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6747 goto onError;
6748 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6749 goto onError;
6750 return;
6751 onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02006752 Py_CLEAR(*exceptionObject);
Martin v. Löwis9e816682011-11-02 12:45:42 +01006753 }
6754}
6755
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006756/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006757static void
6758raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006759 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006760 PyObject *unicode,
6761 Py_ssize_t startpos, Py_ssize_t endpos,
6762 const char *reason)
6763{
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006764 make_encode_exception(exceptionObject,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006765 encoding, unicode, startpos, endpos, reason);
6766 if (*exceptionObject != NULL)
6767 PyCodec_StrictErrors(*exceptionObject);
6768}
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006769
6770/* error handling callback helper:
6771 build arguments, call the callback and check the arguments,
6772 put the result into newpos and return the replacement string, which
6773 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006774static PyObject *
6775unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006776 PyObject **errorHandler,
6777 const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006778 PyObject *unicode, PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006779 Py_ssize_t startpos, Py_ssize_t endpos,
6780 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006781{
Serhiy Storchaka2d06e842015-12-25 19:53:18 +02006782 static const char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006783 Py_ssize_t len;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006784 PyObject *restuple;
6785 PyObject *resunicode;
6786
6787 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006788 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006789 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006790 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006791 }
6792
Benjamin Petersonbac79492012-01-14 13:34:47 -05006793 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006794 return NULL;
6795 len = PyUnicode_GET_LENGTH(unicode);
6796
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006797 make_encode_exception(exceptionObject,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006798 encoding, unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006799 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006800 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006801
Jeroen Demeyer196a5302019-07-04 12:31:34 +02006802 restuple = _PyObject_CallOneArg(*errorHandler, *exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006803 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006804 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006805 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006806 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00006807 Py_DECREF(restuple);
6808 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006809 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006810 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00006811 &resunicode, newpos)) {
6812 Py_DECREF(restuple);
6813 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006814 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006815 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6816 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6817 Py_DECREF(restuple);
6818 return NULL;
6819 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006820 if (*newpos<0)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006821 *newpos = len + *newpos;
6822 if (*newpos<0 || *newpos>len) {
Victor Stinnera33bce02014-07-04 22:47:46 +02006823 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00006824 Py_DECREF(restuple);
6825 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006826 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006827 Py_INCREF(resunicode);
6828 Py_DECREF(restuple);
6829 return resunicode;
6830}
6831
Alexander Belopolsky40018472011-02-26 01:02:56 +00006832static PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006833unicode_encode_ucs1(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006834 const char *errors,
Victor Stinner0030cd52015-09-24 14:45:00 +02006835 const Py_UCS4 limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006836{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006837 /* input state */
6838 Py_ssize_t pos=0, size;
6839 int kind;
6840 void *data;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006841 /* pointer into the output */
6842 char *str;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006843 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6844 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Victor Stinner50149202015-09-22 00:26:54 +02006845 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006846 PyObject *exc = NULL;
Victor Stinner50149202015-09-22 00:26:54 +02006847 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Victor Stinner6bd525b2015-10-09 13:10:05 +02006848 PyObject *rep = NULL;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006849 /* output object */
6850 _PyBytesWriter writer;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006851
Benjamin Petersonbac79492012-01-14 13:34:47 -05006852 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006853 return NULL;
6854 size = PyUnicode_GET_LENGTH(unicode);
6855 kind = PyUnicode_KIND(unicode);
6856 data = PyUnicode_DATA(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006857 /* allocate enough for a simple encoding without
6858 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00006859 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00006860 return PyBytes_FromStringAndSize(NULL, 0);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006861
6862 _PyBytesWriter_Init(&writer);
6863 str = _PyBytesWriter_Alloc(&writer, size);
6864 if (str == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00006865 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006866
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006867 while (pos < size) {
Victor Stinner0030cd52015-09-24 14:45:00 +02006868 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006869
Benjamin Peterson29060642009-01-31 22:14:21 +00006870 /* can we encode this? */
Victor Stinner0030cd52015-09-24 14:45:00 +02006871 if (ch < limit) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006872 /* no overflow check, because we know that the space is enough */
Victor Stinner0030cd52015-09-24 14:45:00 +02006873 *str++ = (char)ch;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006874 ++pos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006875 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006876 else {
Victor Stinner6bd525b2015-10-09 13:10:05 +02006877 Py_ssize_t newpos, i;
Benjamin Peterson29060642009-01-31 22:14:21 +00006878 /* startpos for collecting unencodable chars */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006879 Py_ssize_t collstart = pos;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006880 Py_ssize_t collend = collstart + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006881 /* find all unecodable characters */
Victor Stinner50149202015-09-22 00:26:54 +02006882
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006883 while ((collend < size) && (PyUnicode_READ(kind, data, collend) >= limit))
Benjamin Peterson29060642009-01-31 22:14:21 +00006884 ++collend;
Victor Stinner50149202015-09-22 00:26:54 +02006885
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006886 /* Only overallocate the buffer if it's not the last write */
6887 writer.overallocate = (collend < size);
6888
Benjamin Peterson29060642009-01-31 22:14:21 +00006889 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
Victor Stinner50149202015-09-22 00:26:54 +02006890 if (error_handler == _Py_ERROR_UNKNOWN)
Victor Stinner3d4226a2018-08-29 22:21:32 +02006891 error_handler = _Py_GetErrorHandler(errors);
Victor Stinner50149202015-09-22 00:26:54 +02006892
6893 switch (error_handler) {
6894 case _Py_ERROR_STRICT:
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006895 raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006896 goto onError;
Victor Stinner50149202015-09-22 00:26:54 +02006897
6898 case _Py_ERROR_REPLACE:
Victor Stinner01ada392015-10-01 21:54:51 +02006899 memset(str, '?', collend - collstart);
6900 str += (collend - collstart);
Stefan Krahf432a322017-08-21 13:09:59 +02006901 /* fall through */
Victor Stinner50149202015-09-22 00:26:54 +02006902 case _Py_ERROR_IGNORE:
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006903 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006904 break;
Victor Stinner50149202015-09-22 00:26:54 +02006905
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006906 case _Py_ERROR_BACKSLASHREPLACE:
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07006907 /* subtract preallocated bytes */
Victor Stinnerad771582015-10-09 12:38:53 +02006908 writer.min_size -= (collend - collstart);
6909 str = backslashreplace(&writer, str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006910 unicode, collstart, collend);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006911 if (str == NULL)
6912 goto onError;
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006913 pos = collend;
6914 break;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006915
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006916 case _Py_ERROR_XMLCHARREFREPLACE:
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07006917 /* subtract preallocated bytes */
Victor Stinnerad771582015-10-09 12:38:53 +02006918 writer.min_size -= (collend - collstart);
6919 str = xmlcharrefreplace(&writer, str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006920 unicode, collstart, collend);
6921 if (str == NULL)
6922 goto onError;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006923 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006924 break;
Victor Stinner50149202015-09-22 00:26:54 +02006925
Victor Stinnerc3713e92015-09-29 12:32:13 +02006926 case _Py_ERROR_SURROGATEESCAPE:
6927 for (i = collstart; i < collend; ++i) {
6928 ch = PyUnicode_READ(kind, data, i);
6929 if (ch < 0xdc80 || 0xdcff < ch) {
6930 /* Not a UTF-8b surrogate */
6931 break;
6932 }
6933 *str++ = (char)(ch - 0xdc00);
6934 ++pos;
6935 }
6936 if (i >= collend)
6937 break;
6938 collstart = pos;
6939 assert(collstart != collend);
Stefan Krahf432a322017-08-21 13:09:59 +02006940 /* fall through */
Victor Stinnerc3713e92015-09-29 12:32:13 +02006941
Benjamin Peterson29060642009-01-31 22:14:21 +00006942 default:
Victor Stinner6bd525b2015-10-09 13:10:05 +02006943 rep = unicode_encode_call_errorhandler(errors, &error_handler_obj,
6944 encoding, reason, unicode, &exc,
6945 collstart, collend, &newpos);
6946 if (rep == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006947 goto onError;
Victor Stinner0030cd52015-09-24 14:45:00 +02006948
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07006949 /* subtract preallocated bytes */
Xiang Zhangd04d8472016-11-23 19:34:01 +08006950 writer.min_size -= newpos - collstart;
Victor Stinnerad771582015-10-09 12:38:53 +02006951
Victor Stinner6bd525b2015-10-09 13:10:05 +02006952 if (PyBytes_Check(rep)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00006953 /* Directly copy bytes result to output. */
Victor Stinnerce179bf2015-10-09 12:57:22 +02006954 str = _PyBytesWriter_WriteBytes(&writer, str,
Victor Stinner6bd525b2015-10-09 13:10:05 +02006955 PyBytes_AS_STRING(rep),
6956 PyBytes_GET_SIZE(rep));
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006957 }
Victor Stinner6bd525b2015-10-09 13:10:05 +02006958 else {
6959 assert(PyUnicode_Check(rep));
Victor Stinner0030cd52015-09-24 14:45:00 +02006960
Victor Stinner6bd525b2015-10-09 13:10:05 +02006961 if (PyUnicode_READY(rep) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006962 goto onError;
Victor Stinner6bd525b2015-10-09 13:10:05 +02006963
Serhiy Storchaka99250d52016-11-23 15:13:00 +02006964 if (limit == 256 ?
6965 PyUnicode_KIND(rep) != PyUnicode_1BYTE_KIND :
6966 !PyUnicode_IS_ASCII(rep))
6967 {
6968 /* Not all characters are smaller than limit */
6969 raise_encode_exception(&exc, encoding, unicode,
6970 collstart, collend, reason);
6971 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00006972 }
Serhiy Storchaka99250d52016-11-23 15:13:00 +02006973 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
6974 str = _PyBytesWriter_WriteBytes(&writer, str,
6975 PyUnicode_DATA(rep),
6976 PyUnicode_GET_LENGTH(rep));
Benjamin Peterson29060642009-01-31 22:14:21 +00006977 }
Alexey Izbyshev74a307d2018-08-19 21:52:04 +03006978 if (str == NULL)
6979 goto onError;
6980
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006981 pos = newpos;
Victor Stinner6bd525b2015-10-09 13:10:05 +02006982 Py_CLEAR(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006983 }
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006984
6985 /* If overallocation was disabled, ensure that it was the last
6986 write. Otherwise, we missed an optimization */
6987 assert(writer.overallocate || pos == size);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006988 }
6989 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006990
Victor Stinner50149202015-09-22 00:26:54 +02006991 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006992 Py_XDECREF(exc);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006993 return _PyBytesWriter_Finish(&writer, str);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006994
6995 onError:
Victor Stinner6bd525b2015-10-09 13:10:05 +02006996 Py_XDECREF(rep);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006997 _PyBytesWriter_Dealloc(&writer);
Victor Stinner50149202015-09-22 00:26:54 +02006998 Py_XDECREF(error_handler_obj);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006999 Py_XDECREF(exc);
7000 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007001}
7002
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007003/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007004PyObject *
7005PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03007006 Py_ssize_t size,
7007 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007008{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007009 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02007010 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007011 if (unicode == NULL)
7012 return NULL;
7013 result = unicode_encode_ucs1(unicode, errors, 256);
7014 Py_DECREF(unicode);
7015 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007016}
7017
Alexander Belopolsky40018472011-02-26 01:02:56 +00007018PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007019_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007020{
7021 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007022 PyErr_BadArgument();
7023 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007024 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007025 if (PyUnicode_READY(unicode) == -1)
7026 return NULL;
7027 /* Fast path: if it is a one-byte string, construct
7028 bytes object directly. */
7029 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
7030 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
7031 PyUnicode_GET_LENGTH(unicode));
7032 /* Non-Latin-1 characters present. Defer to above function to
7033 raise the exception. */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007034 return unicode_encode_ucs1(unicode, errors, 256);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007035}
7036
7037PyObject*
7038PyUnicode_AsLatin1String(PyObject *unicode)
7039{
7040 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007041}
7042
7043/* --- 7-bit ASCII Codec -------------------------------------------------- */
7044
Alexander Belopolsky40018472011-02-26 01:02:56 +00007045PyObject *
7046PyUnicode_DecodeASCII(const char *s,
7047 Py_ssize_t size,
7048 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007049{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007050 const char *starts = s;
Inada Naoki770847a2019-06-24 12:30:24 +09007051 const char *e = s + size;
Victor Stinnerf96418d2015-09-21 23:06:27 +02007052 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007053 PyObject *exc = NULL;
Victor Stinnerf96418d2015-09-21 23:06:27 +02007054 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Tim Petersced69f82003-09-16 20:30:58 +00007055
Guido van Rossumd57fd912000-03-10 22:53:23 +00007056 if (size == 0)
Serhiy Storchaka678db842013-01-26 12:16:36 +02007057 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01007058
Guido van Rossumd57fd912000-03-10 22:53:23 +00007059 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Victor Stinner702c7342011-10-05 13:50:52 +02007060 if (size == 1 && (unsigned char)s[0] < 128)
7061 return get_latin1_char((unsigned char)s[0]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007062
Inada Naoki770847a2019-06-24 12:30:24 +09007063 // Shortcut for simple case
7064 PyObject *u = PyUnicode_New(size, 127);
7065 if (u == NULL) {
Victor Stinner8f674cc2013-04-17 23:02:17 +02007066 return NULL;
Inada Naoki770847a2019-06-24 12:30:24 +09007067 }
7068 Py_ssize_t outpos = ascii_decode(s, e, PyUnicode_DATA(u));
7069 if (outpos == size) {
7070 return u;
7071 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02007072
Inada Naoki770847a2019-06-24 12:30:24 +09007073 _PyUnicodeWriter writer;
7074 _PyUnicodeWriter_InitWithBuffer(&writer, u);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007075 writer.pos = outpos;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02007076
Inada Naoki770847a2019-06-24 12:30:24 +09007077 s += outpos;
7078 int kind = writer.kind;
7079 void *data = writer.data;
7080 Py_ssize_t startinpos, endinpos;
7081
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007082 while (s < e) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02007083 unsigned char c = (unsigned char)*s;
Benjamin Peterson29060642009-01-31 22:14:21 +00007084 if (c < 128) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007085 PyUnicode_WRITE(kind, data, writer.pos, c);
7086 writer.pos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00007087 ++s;
Victor Stinnerf96418d2015-09-21 23:06:27 +02007088 continue;
Benjamin Peterson29060642009-01-31 22:14:21 +00007089 }
Victor Stinnerf96418d2015-09-21 23:06:27 +02007090
7091 /* byte outsize range 0x00..0x7f: call the error handler */
7092
7093 if (error_handler == _Py_ERROR_UNKNOWN)
Victor Stinner3d4226a2018-08-29 22:21:32 +02007094 error_handler = _Py_GetErrorHandler(errors);
Victor Stinnerf96418d2015-09-21 23:06:27 +02007095
7096 switch (error_handler)
7097 {
7098 case _Py_ERROR_REPLACE:
7099 case _Py_ERROR_SURROGATEESCAPE:
7100 /* Fast-path: the error handler only writes one character,
Victor Stinnerca9381e2015-09-22 00:58:32 +02007101 but we may switch to UCS2 at the first write */
7102 if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
7103 goto onError;
7104 kind = writer.kind;
7105 data = writer.data;
Victor Stinnerf96418d2015-09-21 23:06:27 +02007106
7107 if (error_handler == _Py_ERROR_REPLACE)
7108 PyUnicode_WRITE(kind, data, writer.pos, 0xfffd);
7109 else
7110 PyUnicode_WRITE(kind, data, writer.pos, c + 0xdc00);
7111 writer.pos++;
7112 ++s;
7113 break;
7114
7115 case _Py_ERROR_IGNORE:
7116 ++s;
7117 break;
7118
7119 default:
Benjamin Peterson29060642009-01-31 22:14:21 +00007120 startinpos = s-starts;
7121 endinpos = startinpos + 1;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007122 if (unicode_decode_call_errorhandler_writer(
Victor Stinnerf96418d2015-09-21 23:06:27 +02007123 errors, &error_handler_obj,
Benjamin Peterson29060642009-01-31 22:14:21 +00007124 "ascii", "ordinal not in range(128)",
7125 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007126 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00007127 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007128 kind = writer.kind;
7129 data = writer.data;
Benjamin Peterson29060642009-01-31 22:14:21 +00007130 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007131 }
Victor Stinnerf96418d2015-09-21 23:06:27 +02007132 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007133 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007134 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00007135
Benjamin Peterson29060642009-01-31 22:14:21 +00007136 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007137 _PyUnicodeWriter_Dealloc(&writer);
Victor Stinnerf96418d2015-09-21 23:06:27 +02007138 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007139 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007140 return NULL;
7141}
7142
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007143/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007144PyObject *
7145PyUnicode_EncodeASCII(const Py_UNICODE *p,
7146 Py_ssize_t size,
7147 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007148{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007149 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02007150 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007151 if (unicode == NULL)
7152 return NULL;
7153 result = unicode_encode_ucs1(unicode, errors, 128);
7154 Py_DECREF(unicode);
7155 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007156}
7157
Alexander Belopolsky40018472011-02-26 01:02:56 +00007158PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007159_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007160{
7161 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007162 PyErr_BadArgument();
7163 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007164 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007165 if (PyUnicode_READY(unicode) == -1)
7166 return NULL;
7167 /* Fast path: if it is an ASCII-only string, construct bytes object
7168 directly. Else defer to above function to raise the exception. */
Victor Stinneraf037572013-04-14 18:44:10 +02007169 if (PyUnicode_IS_ASCII(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007170 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
7171 PyUnicode_GET_LENGTH(unicode));
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007172 return unicode_encode_ucs1(unicode, errors, 128);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007173}
7174
7175PyObject *
7176PyUnicode_AsASCIIString(PyObject *unicode)
7177{
7178 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007179}
7180
Steve Dowercc16be82016-09-08 10:35:16 -07007181#ifdef MS_WINDOWS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007182
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007183/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007184
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00007185#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007186#define NEED_RETRY
7187#endif
7188
Victor Stinner3a50e702011-10-18 21:21:00 +02007189#ifndef WC_ERR_INVALID_CHARS
7190# define WC_ERR_INVALID_CHARS 0x0080
7191#endif
7192
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007193static const char*
Victor Stinner3a50e702011-10-18 21:21:00 +02007194code_page_name(UINT code_page, PyObject **obj)
7195{
7196 *obj = NULL;
7197 if (code_page == CP_ACP)
7198 return "mbcs";
7199 if (code_page == CP_UTF7)
7200 return "CP_UTF7";
7201 if (code_page == CP_UTF8)
7202 return "CP_UTF8";
7203
7204 *obj = PyBytes_FromFormat("cp%u", code_page);
7205 if (*obj == NULL)
7206 return NULL;
7207 return PyBytes_AS_STRING(*obj);
7208}
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007209
Victor Stinner3a50e702011-10-18 21:21:00 +02007210static DWORD
7211decode_code_page_flags(UINT code_page)
7212{
7213 if (code_page == CP_UTF7) {
7214 /* The CP_UTF7 decoder only supports flags=0 */
7215 return 0;
7216 }
7217 else
7218 return MB_ERR_INVALID_CHARS;
7219}
7220
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007221/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007222 * Decode a byte string from a Windows code page into unicode object in strict
7223 * mode.
7224 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007225 * Returns consumed size if succeed, returns -2 on decode error, or raise an
7226 * OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007227 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007228static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007229decode_code_page_strict(UINT code_page,
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007230 wchar_t **buf,
7231 Py_ssize_t *bufsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02007232 const char *in,
7233 int insize)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007234{
Serhiy Storchakac1e2c282019-03-20 21:45:18 +02007235 DWORD flags = MB_ERR_INVALID_CHARS;
Victor Stinner24729f32011-11-10 20:31:37 +01007236 wchar_t *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007237 DWORD outsize;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007238
7239 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007240 assert(insize > 0);
Serhiy Storchakac1e2c282019-03-20 21:45:18 +02007241 while ((outsize = MultiByteToWideChar(code_page, flags,
7242 in, insize, NULL, 0)) <= 0)
7243 {
7244 if (!flags || GetLastError() != ERROR_INVALID_FLAGS) {
7245 goto error;
7246 }
7247 /* For some code pages (e.g. UTF-7) flags must be set to 0. */
7248 flags = 0;
7249 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007250
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007251 /* Extend a wchar_t* buffer */
7252 Py_ssize_t n = *bufsize; /* Get the current length */
7253 if (widechar_resize(buf, bufsize, n + outsize) < 0) {
7254 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007255 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007256 out = *buf + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007257
7258 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007259 outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
7260 if (outsize <= 0)
7261 goto error;
7262 return insize;
Victor Stinner554f3f02010-06-16 23:33:54 +00007263
Victor Stinner3a50e702011-10-18 21:21:00 +02007264error:
7265 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7266 return -2;
7267 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007268 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007269}
7270
Victor Stinner3a50e702011-10-18 21:21:00 +02007271/*
7272 * Decode a byte string from a code page into unicode object with an error
7273 * handler.
7274 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007275 * Returns consumed size if succeed, or raise an OSError or
Victor Stinner3a50e702011-10-18 21:21:00 +02007276 * UnicodeDecodeError exception and returns -1 on error.
7277 */
7278static int
7279decode_code_page_errors(UINT code_page,
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007280 wchar_t **buf,
7281 Py_ssize_t *bufsize,
Victor Stinner76a31a62011-11-04 00:05:13 +01007282 const char *in, const int size,
Victor Stinner7d00cc12014-03-17 23:08:06 +01007283 const char *errors, int final)
Victor Stinner3a50e702011-10-18 21:21:00 +02007284{
7285 const char *startin = in;
7286 const char *endin = in + size;
Serhiy Storchakac1e2c282019-03-20 21:45:18 +02007287 DWORD flags = MB_ERR_INVALID_CHARS;
Victor Stinner3a50e702011-10-18 21:21:00 +02007288 /* Ideally, we should get reason from FormatMessage. This is the Windows
7289 2000 English version of the message. */
7290 const char *reason = "No mapping for the Unicode character exists "
7291 "in the target code page.";
7292 /* each step cannot decode more than 1 character, but a character can be
7293 represented as a surrogate pair */
Serhiy Storchaka4013c172018-12-03 10:36:45 +02007294 wchar_t buffer[2], *out;
Victor Stinner9f067f42013-06-05 00:21:31 +02007295 int insize;
7296 Py_ssize_t outsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007297 PyObject *errorHandler = NULL;
7298 PyObject *exc = NULL;
7299 PyObject *encoding_obj = NULL;
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007300 const char *encoding;
Victor Stinner3a50e702011-10-18 21:21:00 +02007301 DWORD err;
7302 int ret = -1;
7303
7304 assert(size > 0);
7305
7306 encoding = code_page_name(code_page, &encoding_obj);
7307 if (encoding == NULL)
7308 return -1;
7309
Victor Stinner7d00cc12014-03-17 23:08:06 +01007310 if ((errors == NULL || strcmp(errors, "strict") == 0) && final) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007311 /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
7312 UnicodeDecodeError. */
7313 make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
7314 if (exc != NULL) {
7315 PyCodec_StrictErrors(exc);
7316 Py_CLEAR(exc);
7317 }
7318 goto error;
7319 }
7320
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007321 /* Extend a wchar_t* buffer */
7322 Py_ssize_t n = *bufsize; /* Get the current length */
7323 if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7324 PyErr_NoMemory();
7325 goto error;
Victor Stinner3a50e702011-10-18 21:21:00 +02007326 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007327 if (widechar_resize(buf, bufsize, n + size * Py_ARRAY_LENGTH(buffer)) < 0) {
7328 goto error;
Victor Stinner3a50e702011-10-18 21:21:00 +02007329 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007330 out = *buf + n;
Victor Stinner3a50e702011-10-18 21:21:00 +02007331
7332 /* Decode the byte string character per character */
Victor Stinner3a50e702011-10-18 21:21:00 +02007333 while (in < endin)
7334 {
7335 /* Decode a character */
7336 insize = 1;
7337 do
7338 {
7339 outsize = MultiByteToWideChar(code_page, flags,
7340 in, insize,
7341 buffer, Py_ARRAY_LENGTH(buffer));
7342 if (outsize > 0)
7343 break;
7344 err = GetLastError();
Serhiy Storchakac1e2c282019-03-20 21:45:18 +02007345 if (err == ERROR_INVALID_FLAGS && flags) {
7346 /* For some code pages (e.g. UTF-7) flags must be set to 0. */
7347 flags = 0;
7348 continue;
7349 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007350 if (err != ERROR_NO_UNICODE_TRANSLATION
7351 && err != ERROR_INSUFFICIENT_BUFFER)
7352 {
7353 PyErr_SetFromWindowsErr(0);
7354 goto error;
7355 }
7356 insize++;
7357 }
7358 /* 4=maximum length of a UTF-8 sequence */
7359 while (insize <= 4 && (in + insize) <= endin);
7360
7361 if (outsize <= 0) {
7362 Py_ssize_t startinpos, endinpos, outpos;
7363
Victor Stinner7d00cc12014-03-17 23:08:06 +01007364 /* last character in partial decode? */
7365 if (in + insize >= endin && !final)
7366 break;
7367
Victor Stinner3a50e702011-10-18 21:21:00 +02007368 startinpos = in - startin;
7369 endinpos = startinpos + 1;
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007370 outpos = out - *buf;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007371 if (unicode_decode_call_errorhandler_wchar(
Victor Stinner3a50e702011-10-18 21:21:00 +02007372 errors, &errorHandler,
7373 encoding, reason,
7374 &startin, &endin, &startinpos, &endinpos, &exc, &in,
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007375 buf, bufsize, &outpos))
Victor Stinner3a50e702011-10-18 21:21:00 +02007376 {
7377 goto error;
7378 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007379 out = *buf + outpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007380 }
7381 else {
7382 in += insize;
7383 memcpy(out, buffer, outsize * sizeof(wchar_t));
7384 out += outsize;
7385 }
7386 }
7387
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007388 /* Shrink the buffer */
7389 assert(out - *buf <= *bufsize);
7390 *bufsize = out - *buf;
Victor Stinnere1f17c62014-07-25 14:03:03 +02007391 /* (in - startin) <= size and size is an int */
7392 ret = Py_SAFE_DOWNCAST(in - startin, Py_ssize_t, int);
Victor Stinner3a50e702011-10-18 21:21:00 +02007393
7394error:
7395 Py_XDECREF(encoding_obj);
7396 Py_XDECREF(errorHandler);
7397 Py_XDECREF(exc);
7398 return ret;
7399}
7400
Victor Stinner3a50e702011-10-18 21:21:00 +02007401static PyObject *
7402decode_code_page_stateful(int code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007403 const char *s, Py_ssize_t size,
7404 const char *errors, Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007405{
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007406 wchar_t *buf = NULL;
7407 Py_ssize_t bufsize = 0;
Victor Stinner76a31a62011-11-04 00:05:13 +01007408 int chunk_size, final, converted, done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007409
Victor Stinner3a50e702011-10-18 21:21:00 +02007410 if (code_page < 0) {
7411 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7412 return NULL;
7413 }
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03007414 if (size < 0) {
7415 PyErr_BadInternalCall();
7416 return NULL;
7417 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007418
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007419 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00007420 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007421
Victor Stinner76a31a62011-11-04 00:05:13 +01007422 do
7423 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007424#ifdef NEED_RETRY
Victor Stinner76a31a62011-11-04 00:05:13 +01007425 if (size > INT_MAX) {
7426 chunk_size = INT_MAX;
7427 final = 0;
7428 done = 0;
7429 }
7430 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007431#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007432 {
7433 chunk_size = (int)size;
7434 final = (consumed == NULL);
7435 done = 1;
7436 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007437
Victor Stinner76a31a62011-11-04 00:05:13 +01007438 if (chunk_size == 0 && done) {
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007439 if (buf != NULL)
Victor Stinner76a31a62011-11-04 00:05:13 +01007440 break;
Serhiy Storchaka678db842013-01-26 12:16:36 +02007441 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner76a31a62011-11-04 00:05:13 +01007442 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007443
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007444 converted = decode_code_page_strict(code_page, &buf, &bufsize,
Victor Stinner76a31a62011-11-04 00:05:13 +01007445 s, chunk_size);
7446 if (converted == -2)
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007447 converted = decode_code_page_errors(code_page, &buf, &bufsize,
Victor Stinner76a31a62011-11-04 00:05:13 +01007448 s, chunk_size,
Victor Stinner7d00cc12014-03-17 23:08:06 +01007449 errors, final);
7450 assert(converted != 0 || done);
Victor Stinner76a31a62011-11-04 00:05:13 +01007451
7452 if (converted < 0) {
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007453 PyMem_Free(buf);
Victor Stinner76a31a62011-11-04 00:05:13 +01007454 return NULL;
7455 }
7456
7457 if (consumed)
7458 *consumed += converted;
7459
7460 s += converted;
7461 size -= converted;
7462 } while (!done);
Victor Stinner3a50e702011-10-18 21:21:00 +02007463
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007464 PyObject *v = PyUnicode_FromWideChar(buf, bufsize);
7465 PyMem_Free(buf);
7466 return v;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007467}
7468
Alexander Belopolsky40018472011-02-26 01:02:56 +00007469PyObject *
Victor Stinner3a50e702011-10-18 21:21:00 +02007470PyUnicode_DecodeCodePageStateful(int code_page,
7471 const char *s,
7472 Py_ssize_t size,
7473 const char *errors,
7474 Py_ssize_t *consumed)
7475{
7476 return decode_code_page_stateful(code_page, s, size, errors, consumed);
7477}
7478
7479PyObject *
7480PyUnicode_DecodeMBCSStateful(const char *s,
7481 Py_ssize_t size,
7482 const char *errors,
7483 Py_ssize_t *consumed)
7484{
7485 return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
7486}
7487
7488PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00007489PyUnicode_DecodeMBCS(const char *s,
7490 Py_ssize_t size,
7491 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007492{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007493 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
7494}
7495
Victor Stinner3a50e702011-10-18 21:21:00 +02007496static DWORD
7497encode_code_page_flags(UINT code_page, const char *errors)
7498{
7499 if (code_page == CP_UTF8) {
Steve Dower3e96f322015-03-02 08:01:10 -08007500 return WC_ERR_INVALID_CHARS;
Victor Stinner3a50e702011-10-18 21:21:00 +02007501 }
7502 else if (code_page == CP_UTF7) {
7503 /* CP_UTF7 only supports flags=0 */
7504 return 0;
7505 }
7506 else {
7507 if (errors != NULL && strcmp(errors, "replace") == 0)
7508 return 0;
7509 else
7510 return WC_NO_BEST_FIT_CHARS;
7511 }
7512}
7513
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007514/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007515 * Encode a Unicode string to a Windows code page into a byte string in strict
7516 * mode.
7517 *
7518 * Returns consumed characters if succeed, returns -2 on encode error, or raise
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007519 * an OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007520 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007521static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007522encode_code_page_strict(UINT code_page, PyObject **outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007523 PyObject *unicode, Py_ssize_t offset, int len,
Victor Stinner3a50e702011-10-18 21:21:00 +02007524 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007525{
Victor Stinner554f3f02010-06-16 23:33:54 +00007526 BOOL usedDefaultChar = FALSE;
Victor Stinner3a50e702011-10-18 21:21:00 +02007527 BOOL *pusedDefaultChar = &usedDefaultChar;
7528 int outsize;
Victor Stinner24729f32011-11-10 20:31:37 +01007529 wchar_t *p;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007530 Py_ssize_t size;
Victor Stinner3a50e702011-10-18 21:21:00 +02007531 const DWORD flags = encode_code_page_flags(code_page, NULL);
7532 char *out;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007533 /* Create a substring so that we can get the UTF-16 representation
7534 of just the slice under consideration. */
7535 PyObject *substring;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007536
Martin v. Löwis3d325192011-11-04 18:23:06 +01007537 assert(len > 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007538
Victor Stinner3a50e702011-10-18 21:21:00 +02007539 if (code_page != CP_UTF8 && code_page != CP_UTF7)
Victor Stinner554f3f02010-06-16 23:33:54 +00007540 pusedDefaultChar = &usedDefaultChar;
Victor Stinner3a50e702011-10-18 21:21:00 +02007541 else
Victor Stinner554f3f02010-06-16 23:33:54 +00007542 pusedDefaultChar = NULL;
Victor Stinner554f3f02010-06-16 23:33:54 +00007543
Victor Stinner2fc507f2011-11-04 20:06:39 +01007544 substring = PyUnicode_Substring(unicode, offset, offset+len);
7545 if (substring == NULL)
7546 return -1;
7547 p = PyUnicode_AsUnicodeAndSize(substring, &size);
7548 if (p == NULL) {
7549 Py_DECREF(substring);
7550 return -1;
7551 }
Victor Stinner9f067f42013-06-05 00:21:31 +02007552 assert(size <= INT_MAX);
Martin v. Löwis3d325192011-11-04 18:23:06 +01007553
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007554 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007555 outsize = WideCharToMultiByte(code_page, flags,
Victor Stinner9f067f42013-06-05 00:21:31 +02007556 p, (int)size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007557 NULL, 0,
7558 NULL, pusedDefaultChar);
7559 if (outsize <= 0)
7560 goto error;
7561 /* If we used a default char, then we failed! */
Victor Stinner2fc507f2011-11-04 20:06:39 +01007562 if (pusedDefaultChar && *pusedDefaultChar) {
7563 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007564 return -2;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007565 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007566
Victor Stinner3a50e702011-10-18 21:21:00 +02007567 if (*outbytes == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007568 /* Create string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007569 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007570 if (*outbytes == NULL) {
7571 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007572 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007573 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007574 out = PyBytes_AS_STRING(*outbytes);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007575 }
7576 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007577 /* Extend string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007578 const Py_ssize_t n = PyBytes_Size(*outbytes);
7579 if (outsize > PY_SSIZE_T_MAX - n) {
7580 PyErr_NoMemory();
Victor Stinner2fc507f2011-11-04 20:06:39 +01007581 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007582 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007583 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007584 if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
7585 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007586 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007587 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007588 out = PyBytes_AS_STRING(*outbytes) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007589 }
7590
7591 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007592 outsize = WideCharToMultiByte(code_page, flags,
Victor Stinner9f067f42013-06-05 00:21:31 +02007593 p, (int)size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007594 out, outsize,
7595 NULL, pusedDefaultChar);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007596 Py_CLEAR(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007597 if (outsize <= 0)
7598 goto error;
7599 if (pusedDefaultChar && *pusedDefaultChar)
7600 return -2;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007601 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00007602
Victor Stinner3a50e702011-10-18 21:21:00 +02007603error:
Victor Stinner2fc507f2011-11-04 20:06:39 +01007604 Py_XDECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007605 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7606 return -2;
7607 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007608 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007609}
7610
Victor Stinner3a50e702011-10-18 21:21:00 +02007611/*
Serhiy Storchakad65c9492015-11-02 14:10:23 +02007612 * Encode a Unicode string to a Windows code page into a byte string using an
Victor Stinner3a50e702011-10-18 21:21:00 +02007613 * error handler.
7614 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007615 * Returns consumed characters if succeed, or raise an OSError and returns
Victor Stinner3a50e702011-10-18 21:21:00 +02007616 * -1 on other error.
7617 */
7618static int
7619encode_code_page_errors(UINT code_page, PyObject **outbytes,
Victor Stinner7581cef2011-11-03 22:32:33 +01007620 PyObject *unicode, Py_ssize_t unicode_offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007621 Py_ssize_t insize, const char* errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007622{
Victor Stinner3a50e702011-10-18 21:21:00 +02007623 const DWORD flags = encode_code_page_flags(code_page, errors);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007624 Py_ssize_t pos = unicode_offset;
7625 Py_ssize_t endin = unicode_offset + insize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007626 /* Ideally, we should get reason from FormatMessage. This is the Windows
7627 2000 English version of the message. */
7628 const char *reason = "invalid character";
7629 /* 4=maximum length of a UTF-8 sequence */
7630 char buffer[4];
7631 BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
7632 Py_ssize_t outsize;
7633 char *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007634 PyObject *errorHandler = NULL;
7635 PyObject *exc = NULL;
7636 PyObject *encoding_obj = NULL;
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007637 const char *encoding;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007638 Py_ssize_t newpos, newoutsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007639 PyObject *rep;
7640 int ret = -1;
7641
7642 assert(insize > 0);
7643
7644 encoding = code_page_name(code_page, &encoding_obj);
7645 if (encoding == NULL)
7646 return -1;
7647
7648 if (errors == NULL || strcmp(errors, "strict") == 0) {
7649 /* The last error was ERROR_NO_UNICODE_TRANSLATION,
7650 then we raise a UnicodeEncodeError. */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007651 make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
Victor Stinner3a50e702011-10-18 21:21:00 +02007652 if (exc != NULL) {
7653 PyCodec_StrictErrors(exc);
7654 Py_DECREF(exc);
7655 }
7656 Py_XDECREF(encoding_obj);
7657 return -1;
7658 }
7659
7660 if (code_page != CP_UTF8 && code_page != CP_UTF7)
7661 pusedDefaultChar = &usedDefaultChar;
7662 else
7663 pusedDefaultChar = NULL;
7664
7665 if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
7666 PyErr_NoMemory();
7667 goto error;
7668 }
7669 outsize = insize * Py_ARRAY_LENGTH(buffer);
7670
7671 if (*outbytes == NULL) {
7672 /* Create string object */
7673 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7674 if (*outbytes == NULL)
7675 goto error;
7676 out = PyBytes_AS_STRING(*outbytes);
7677 }
7678 else {
7679 /* Extend string object */
7680 Py_ssize_t n = PyBytes_Size(*outbytes);
7681 if (n > PY_SSIZE_T_MAX - outsize) {
7682 PyErr_NoMemory();
7683 goto error;
7684 }
7685 if (_PyBytes_Resize(outbytes, n + outsize) < 0)
7686 goto error;
7687 out = PyBytes_AS_STRING(*outbytes) + n;
7688 }
7689
7690 /* Encode the string character per character */
Martin v. Löwis3d325192011-11-04 18:23:06 +01007691 while (pos < endin)
Victor Stinner3a50e702011-10-18 21:21:00 +02007692 {
Victor Stinner2fc507f2011-11-04 20:06:39 +01007693 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
7694 wchar_t chars[2];
7695 int charsize;
7696 if (ch < 0x10000) {
7697 chars[0] = (wchar_t)ch;
7698 charsize = 1;
7699 }
7700 else {
Victor Stinner76df43d2012-10-30 01:42:39 +01007701 chars[0] = Py_UNICODE_HIGH_SURROGATE(ch);
7702 chars[1] = Py_UNICODE_LOW_SURROGATE(ch);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007703 charsize = 2;
7704 }
7705
Victor Stinner3a50e702011-10-18 21:21:00 +02007706 outsize = WideCharToMultiByte(code_page, flags,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007707 chars, charsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02007708 buffer, Py_ARRAY_LENGTH(buffer),
7709 NULL, pusedDefaultChar);
7710 if (outsize > 0) {
7711 if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
7712 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007713 pos++;
Victor Stinner3a50e702011-10-18 21:21:00 +02007714 memcpy(out, buffer, outsize);
7715 out += outsize;
7716 continue;
7717 }
7718 }
7719 else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
7720 PyErr_SetFromWindowsErr(0);
7721 goto error;
7722 }
7723
Victor Stinner3a50e702011-10-18 21:21:00 +02007724 rep = unicode_encode_call_errorhandler(
7725 errors, &errorHandler, encoding, reason,
Victor Stinner7581cef2011-11-03 22:32:33 +01007726 unicode, &exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007727 pos, pos + 1, &newpos);
Victor Stinner3a50e702011-10-18 21:21:00 +02007728 if (rep == NULL)
7729 goto error;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007730 pos = newpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007731
7732 if (PyBytes_Check(rep)) {
7733 outsize = PyBytes_GET_SIZE(rep);
7734 if (outsize != 1) {
7735 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7736 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7737 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7738 Py_DECREF(rep);
7739 goto error;
7740 }
7741 out = PyBytes_AS_STRING(*outbytes) + offset;
7742 }
7743 memcpy(out, PyBytes_AS_STRING(rep), outsize);
7744 out += outsize;
7745 }
7746 else {
7747 Py_ssize_t i;
7748 enum PyUnicode_Kind kind;
7749 void *data;
7750
Benjamin Petersonbac79492012-01-14 13:34:47 -05007751 if (PyUnicode_READY(rep) == -1) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007752 Py_DECREF(rep);
7753 goto error;
7754 }
7755
7756 outsize = PyUnicode_GET_LENGTH(rep);
7757 if (outsize != 1) {
7758 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7759 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7760 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7761 Py_DECREF(rep);
7762 goto error;
7763 }
7764 out = PyBytes_AS_STRING(*outbytes) + offset;
7765 }
7766 kind = PyUnicode_KIND(rep);
7767 data = PyUnicode_DATA(rep);
7768 for (i=0; i < outsize; i++) {
7769 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7770 if (ch > 127) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007771 raise_encode_exception(&exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007772 encoding, unicode,
7773 pos, pos + 1,
Victor Stinner3a50e702011-10-18 21:21:00 +02007774 "unable to encode error handler result to ASCII");
7775 Py_DECREF(rep);
7776 goto error;
7777 }
7778 *out = (unsigned char)ch;
7779 out++;
7780 }
7781 }
7782 Py_DECREF(rep);
7783 }
7784 /* write a NUL byte */
7785 *out = 0;
7786 outsize = out - PyBytes_AS_STRING(*outbytes);
7787 assert(outsize <= PyBytes_GET_SIZE(*outbytes));
7788 if (_PyBytes_Resize(outbytes, outsize) < 0)
7789 goto error;
7790 ret = 0;
7791
7792error:
7793 Py_XDECREF(encoding_obj);
7794 Py_XDECREF(errorHandler);
7795 Py_XDECREF(exc);
7796 return ret;
7797}
7798
Victor Stinner3a50e702011-10-18 21:21:00 +02007799static PyObject *
7800encode_code_page(int code_page,
Victor Stinner7581cef2011-11-03 22:32:33 +01007801 PyObject *unicode,
Victor Stinner3a50e702011-10-18 21:21:00 +02007802 const char *errors)
7803{
Martin v. Löwis3d325192011-11-04 18:23:06 +01007804 Py_ssize_t len;
Victor Stinner3a50e702011-10-18 21:21:00 +02007805 PyObject *outbytes = NULL;
Victor Stinner7581cef2011-11-03 22:32:33 +01007806 Py_ssize_t offset;
Victor Stinner76a31a62011-11-04 00:05:13 +01007807 int chunk_len, ret, done;
Victor Stinner7581cef2011-11-03 22:32:33 +01007808
Victor Stinner29dacf22015-01-26 16:41:32 +01007809 if (!PyUnicode_Check(unicode)) {
7810 PyErr_BadArgument();
7811 return NULL;
7812 }
7813
Benjamin Petersonbac79492012-01-14 13:34:47 -05007814 if (PyUnicode_READY(unicode) == -1)
Victor Stinner2fc507f2011-11-04 20:06:39 +01007815 return NULL;
7816 len = PyUnicode_GET_LENGTH(unicode);
Guido van Rossum03e29f12000-05-04 15:52:20 +00007817
Victor Stinner3a50e702011-10-18 21:21:00 +02007818 if (code_page < 0) {
7819 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7820 return NULL;
7821 }
7822
Martin v. Löwis3d325192011-11-04 18:23:06 +01007823 if (len == 0)
Victor Stinner76a31a62011-11-04 00:05:13 +01007824 return PyBytes_FromStringAndSize(NULL, 0);
7825
Victor Stinner7581cef2011-11-03 22:32:33 +01007826 offset = 0;
7827 do
7828 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007829#ifdef NEED_RETRY
Victor Stinner2fc507f2011-11-04 20:06:39 +01007830 /* UTF-16 encoding may double the size, so use only INT_MAX/2
Martin v. Löwis3d325192011-11-04 18:23:06 +01007831 chunks. */
7832 if (len > INT_MAX/2) {
7833 chunk_len = INT_MAX/2;
Victor Stinner76a31a62011-11-04 00:05:13 +01007834 done = 0;
7835 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007836 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007837#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007838 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007839 chunk_len = (int)len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007840 done = 1;
7841 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007842
Victor Stinner76a31a62011-11-04 00:05:13 +01007843 ret = encode_code_page_strict(code_page, &outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007844 unicode, offset, chunk_len,
Victor Stinner76a31a62011-11-04 00:05:13 +01007845 errors);
7846 if (ret == -2)
7847 ret = encode_code_page_errors(code_page, &outbytes,
7848 unicode, offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007849 chunk_len, errors);
Victor Stinner7581cef2011-11-03 22:32:33 +01007850 if (ret < 0) {
7851 Py_XDECREF(outbytes);
7852 return NULL;
7853 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007854
Victor Stinner7581cef2011-11-03 22:32:33 +01007855 offset += chunk_len;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007856 len -= chunk_len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007857 } while (!done);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007858
Victor Stinner3a50e702011-10-18 21:21:00 +02007859 return outbytes;
7860}
7861
7862PyObject *
7863PyUnicode_EncodeMBCS(const Py_UNICODE *p,
7864 Py_ssize_t size,
7865 const char *errors)
7866{
Victor Stinner7581cef2011-11-03 22:32:33 +01007867 PyObject *unicode, *res;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02007868 unicode = PyUnicode_FromWideChar(p, size);
Victor Stinner7581cef2011-11-03 22:32:33 +01007869 if (unicode == NULL)
7870 return NULL;
7871 res = encode_code_page(CP_ACP, unicode, errors);
7872 Py_DECREF(unicode);
7873 return res;
Victor Stinner3a50e702011-10-18 21:21:00 +02007874}
7875
7876PyObject *
7877PyUnicode_EncodeCodePage(int code_page,
7878 PyObject *unicode,
7879 const char *errors)
7880{
Victor Stinner7581cef2011-11-03 22:32:33 +01007881 return encode_code_page(code_page, unicode, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007882}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007883
Alexander Belopolsky40018472011-02-26 01:02:56 +00007884PyObject *
7885PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007886{
Victor Stinner7581cef2011-11-03 22:32:33 +01007887 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007888}
7889
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007890#undef NEED_RETRY
7891
Steve Dowercc16be82016-09-08 10:35:16 -07007892#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007893
Guido van Rossumd57fd912000-03-10 22:53:23 +00007894/* --- Character Mapping Codec -------------------------------------------- */
7895
Victor Stinnerfb161b12013-04-18 01:44:27 +02007896static int
7897charmap_decode_string(const char *s,
7898 Py_ssize_t size,
7899 PyObject *mapping,
7900 const char *errors,
7901 _PyUnicodeWriter *writer)
7902{
7903 const char *starts = s;
7904 const char *e;
7905 Py_ssize_t startinpos, endinpos;
7906 PyObject *errorHandler = NULL, *exc = NULL;
7907 Py_ssize_t maplen;
7908 enum PyUnicode_Kind mapkind;
7909 void *mapdata;
7910 Py_UCS4 x;
7911 unsigned char ch;
7912
7913 if (PyUnicode_READY(mapping) == -1)
7914 return -1;
7915
7916 maplen = PyUnicode_GET_LENGTH(mapping);
7917 mapdata = PyUnicode_DATA(mapping);
7918 mapkind = PyUnicode_KIND(mapping);
7919
7920 e = s + size;
7921
7922 if (mapkind == PyUnicode_1BYTE_KIND && maplen >= 256) {
7923 /* fast-path for cp037, cp500 and iso8859_1 encodings. iso8859_1
7924 * is disabled in encoding aliases, latin1 is preferred because
7925 * its implementation is faster. */
7926 Py_UCS1 *mapdata_ucs1 = (Py_UCS1 *)mapdata;
7927 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
7928 Py_UCS4 maxchar = writer->maxchar;
7929
7930 assert (writer->kind == PyUnicode_1BYTE_KIND);
7931 while (s < e) {
7932 ch = *s;
7933 x = mapdata_ucs1[ch];
7934 if (x > maxchar) {
7935 if (_PyUnicodeWriter_Prepare(writer, 1, 0xff) == -1)
7936 goto onError;
7937 maxchar = writer->maxchar;
7938 outdata = (Py_UCS1 *)writer->data;
7939 }
7940 outdata[writer->pos] = x;
7941 writer->pos++;
7942 ++s;
7943 }
7944 return 0;
7945 }
7946
7947 while (s < e) {
7948 if (mapkind == PyUnicode_2BYTE_KIND && maplen >= 256) {
7949 enum PyUnicode_Kind outkind = writer->kind;
7950 Py_UCS2 *mapdata_ucs2 = (Py_UCS2 *)mapdata;
7951 if (outkind == PyUnicode_1BYTE_KIND) {
7952 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
7953 Py_UCS4 maxchar = writer->maxchar;
7954 while (s < e) {
7955 ch = *s;
7956 x = mapdata_ucs2[ch];
7957 if (x > maxchar)
7958 goto Error;
7959 outdata[writer->pos] = x;
7960 writer->pos++;
7961 ++s;
7962 }
7963 break;
7964 }
7965 else if (outkind == PyUnicode_2BYTE_KIND) {
7966 Py_UCS2 *outdata = (Py_UCS2 *)writer->data;
7967 while (s < e) {
7968 ch = *s;
7969 x = mapdata_ucs2[ch];
7970 if (x == 0xFFFE)
7971 goto Error;
7972 outdata[writer->pos] = x;
7973 writer->pos++;
7974 ++s;
7975 }
7976 break;
7977 }
7978 }
7979 ch = *s;
7980
7981 if (ch < maplen)
7982 x = PyUnicode_READ(mapkind, mapdata, ch);
7983 else
7984 x = 0xfffe; /* invalid value */
7985Error:
7986 if (x == 0xfffe)
7987 {
7988 /* undefined mapping */
7989 startinpos = s-starts;
7990 endinpos = startinpos+1;
7991 if (unicode_decode_call_errorhandler_writer(
7992 errors, &errorHandler,
7993 "charmap", "character maps to <undefined>",
7994 &starts, &e, &startinpos, &endinpos, &exc, &s,
7995 writer)) {
7996 goto onError;
7997 }
7998 continue;
7999 }
8000
8001 if (_PyUnicodeWriter_WriteCharInline(writer, x) < 0)
8002 goto onError;
8003 ++s;
8004 }
8005 Py_XDECREF(errorHandler);
8006 Py_XDECREF(exc);
8007 return 0;
8008
8009onError:
8010 Py_XDECREF(errorHandler);
8011 Py_XDECREF(exc);
8012 return -1;
8013}
8014
8015static int
8016charmap_decode_mapping(const char *s,
8017 Py_ssize_t size,
8018 PyObject *mapping,
8019 const char *errors,
8020 _PyUnicodeWriter *writer)
8021{
8022 const char *starts = s;
8023 const char *e;
8024 Py_ssize_t startinpos, endinpos;
8025 PyObject *errorHandler = NULL, *exc = NULL;
8026 unsigned char ch;
Victor Stinnerf4f24242013-05-07 01:01:31 +02008027 PyObject *key, *item = NULL;
Victor Stinnerfb161b12013-04-18 01:44:27 +02008028
8029 e = s + size;
8030
8031 while (s < e) {
8032 ch = *s;
8033
8034 /* Get mapping (char ordinal -> integer, Unicode char or None) */
8035 key = PyLong_FromLong((long)ch);
8036 if (key == NULL)
8037 goto onError;
8038
8039 item = PyObject_GetItem(mapping, key);
8040 Py_DECREF(key);
8041 if (item == NULL) {
8042 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8043 /* No mapping found means: mapping is undefined. */
8044 PyErr_Clear();
8045 goto Undefined;
8046 } else
8047 goto onError;
8048 }
8049
8050 /* Apply mapping */
8051 if (item == Py_None)
8052 goto Undefined;
8053 if (PyLong_Check(item)) {
8054 long value = PyLong_AS_LONG(item);
8055 if (value == 0xFFFE)
8056 goto Undefined;
8057 if (value < 0 || value > MAX_UNICODE) {
8058 PyErr_Format(PyExc_TypeError,
8059 "character mapping must be in range(0x%lx)",
8060 (unsigned long)MAX_UNICODE + 1);
8061 goto onError;
8062 }
8063
8064 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
8065 goto onError;
8066 }
8067 else if (PyUnicode_Check(item)) {
8068 if (PyUnicode_READY(item) == -1)
8069 goto onError;
8070 if (PyUnicode_GET_LENGTH(item) == 1) {
8071 Py_UCS4 value = PyUnicode_READ_CHAR(item, 0);
8072 if (value == 0xFFFE)
8073 goto Undefined;
8074 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
8075 goto onError;
8076 }
8077 else {
8078 writer->overallocate = 1;
8079 if (_PyUnicodeWriter_WriteStr(writer, item) == -1)
8080 goto onError;
8081 }
8082 }
8083 else {
8084 /* wrong return value */
8085 PyErr_SetString(PyExc_TypeError,
8086 "character mapping must return integer, None or str");
8087 goto onError;
8088 }
8089 Py_CLEAR(item);
8090 ++s;
8091 continue;
8092
8093Undefined:
8094 /* undefined mapping */
8095 Py_CLEAR(item);
8096 startinpos = s-starts;
8097 endinpos = startinpos+1;
8098 if (unicode_decode_call_errorhandler_writer(
8099 errors, &errorHandler,
8100 "charmap", "character maps to <undefined>",
8101 &starts, &e, &startinpos, &endinpos, &exc, &s,
8102 writer)) {
8103 goto onError;
8104 }
8105 }
8106 Py_XDECREF(errorHandler);
8107 Py_XDECREF(exc);
8108 return 0;
8109
8110onError:
8111 Py_XDECREF(item);
8112 Py_XDECREF(errorHandler);
8113 Py_XDECREF(exc);
8114 return -1;
8115}
8116
Alexander Belopolsky40018472011-02-26 01:02:56 +00008117PyObject *
8118PyUnicode_DecodeCharmap(const char *s,
8119 Py_ssize_t size,
8120 PyObject *mapping,
8121 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008122{
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008123 _PyUnicodeWriter writer;
Tim Petersced69f82003-09-16 20:30:58 +00008124
Guido van Rossumd57fd912000-03-10 22:53:23 +00008125 /* Default to Latin-1 */
8126 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008127 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008128
Guido van Rossumd57fd912000-03-10 22:53:23 +00008129 if (size == 0)
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02008130 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner8f674cc2013-04-17 23:02:17 +02008131 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02008132 writer.min_length = size;
8133 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008134 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008135
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00008136 if (PyUnicode_CheckExact(mapping)) {
Victor Stinnerfb161b12013-04-18 01:44:27 +02008137 if (charmap_decode_string(s, size, mapping, errors, &writer) < 0)
8138 goto onError;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00008139 }
8140 else {
Victor Stinnerfb161b12013-04-18 01:44:27 +02008141 if (charmap_decode_mapping(s, size, mapping, errors, &writer) < 0)
8142 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008143 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008144 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00008145
Benjamin Peterson29060642009-01-31 22:14:21 +00008146 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008147 _PyUnicodeWriter_Dealloc(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008148 return NULL;
8149}
8150
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008151/* Charmap encoding: the lookup table */
8152
Alexander Belopolsky40018472011-02-26 01:02:56 +00008153struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00008154 PyObject_HEAD
8155 unsigned char level1[32];
8156 int count2, count3;
8157 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008158};
8159
8160static PyObject*
8161encoding_map_size(PyObject *obj, PyObject* args)
8162{
8163 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008164 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00008165 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008166}
8167
8168static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008169 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00008170 PyDoc_STR("Return the size (in bytes) of this object") },
8171 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008172};
8173
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008174static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008175 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008176 "EncodingMap", /*tp_name*/
8177 sizeof(struct encoding_map), /*tp_basicsize*/
8178 0, /*tp_itemsize*/
8179 /* methods */
Inada Naoki7d408692019-05-29 17:23:27 +09008180 0, /*tp_dealloc*/
Jeroen Demeyer530f5062019-05-31 04:13:39 +02008181 0, /*tp_vectorcall_offset*/
Benjamin Peterson29060642009-01-31 22:14:21 +00008182 0, /*tp_getattr*/
8183 0, /*tp_setattr*/
Jeroen Demeyer530f5062019-05-31 04:13:39 +02008184 0, /*tp_as_async*/
Benjamin Peterson29060642009-01-31 22:14:21 +00008185 0, /*tp_repr*/
8186 0, /*tp_as_number*/
8187 0, /*tp_as_sequence*/
8188 0, /*tp_as_mapping*/
8189 0, /*tp_hash*/
8190 0, /*tp_call*/
8191 0, /*tp_str*/
8192 0, /*tp_getattro*/
8193 0, /*tp_setattro*/
8194 0, /*tp_as_buffer*/
8195 Py_TPFLAGS_DEFAULT, /*tp_flags*/
8196 0, /*tp_doc*/
8197 0, /*tp_traverse*/
8198 0, /*tp_clear*/
8199 0, /*tp_richcompare*/
8200 0, /*tp_weaklistoffset*/
8201 0, /*tp_iter*/
8202 0, /*tp_iternext*/
8203 encoding_map_methods, /*tp_methods*/
8204 0, /*tp_members*/
8205 0, /*tp_getset*/
8206 0, /*tp_base*/
8207 0, /*tp_dict*/
8208 0, /*tp_descr_get*/
8209 0, /*tp_descr_set*/
8210 0, /*tp_dictoffset*/
8211 0, /*tp_init*/
8212 0, /*tp_alloc*/
8213 0, /*tp_new*/
8214 0, /*tp_free*/
8215 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008216};
8217
8218PyObject*
8219PyUnicode_BuildEncodingMap(PyObject* string)
8220{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008221 PyObject *result;
8222 struct encoding_map *mresult;
8223 int i;
8224 int need_dict = 0;
8225 unsigned char level1[32];
8226 unsigned char level2[512];
8227 unsigned char *mlevel1, *mlevel2, *mlevel3;
8228 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008229 int kind;
8230 void *data;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008231 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008232 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008233
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008234 if (!PyUnicode_Check(string) || !PyUnicode_GET_LENGTH(string)) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008235 PyErr_BadArgument();
8236 return NULL;
8237 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008238 kind = PyUnicode_KIND(string);
8239 data = PyUnicode_DATA(string);
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008240 length = PyUnicode_GET_LENGTH(string);
8241 length = Py_MIN(length, 256);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008242 memset(level1, 0xFF, sizeof level1);
8243 memset(level2, 0xFF, sizeof level2);
8244
8245 /* If there isn't a one-to-one mapping of NULL to \0,
8246 or if there are non-BMP characters, we need to use
8247 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008248 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008249 need_dict = 1;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008250 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008251 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008252 ch = PyUnicode_READ(kind, data, i);
8253 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008254 need_dict = 1;
8255 break;
8256 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008257 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008258 /* unmapped character */
8259 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008260 l1 = ch >> 11;
8261 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008262 if (level1[l1] == 0xFF)
8263 level1[l1] = count2++;
8264 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00008265 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008266 }
8267
8268 if (count2 >= 0xFF || count3 >= 0xFF)
8269 need_dict = 1;
8270
8271 if (need_dict) {
8272 PyObject *result = PyDict_New();
8273 PyObject *key, *value;
8274 if (!result)
8275 return NULL;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008276 for (i = 0; i < length; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008277 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00008278 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008279 if (!key || !value)
8280 goto failed1;
8281 if (PyDict_SetItem(result, key, value) == -1)
8282 goto failed1;
8283 Py_DECREF(key);
8284 Py_DECREF(value);
8285 }
8286 return result;
8287 failed1:
8288 Py_XDECREF(key);
8289 Py_XDECREF(value);
8290 Py_DECREF(result);
8291 return NULL;
8292 }
8293
8294 /* Create a three-level trie */
8295 result = PyObject_MALLOC(sizeof(struct encoding_map) +
8296 16*count2 + 128*count3 - 1);
8297 if (!result)
8298 return PyErr_NoMemory();
8299 PyObject_Init(result, &EncodingMapType);
8300 mresult = (struct encoding_map*)result;
8301 mresult->count2 = count2;
8302 mresult->count3 = count3;
8303 mlevel1 = mresult->level1;
8304 mlevel2 = mresult->level23;
8305 mlevel3 = mresult->level23 + 16*count2;
8306 memcpy(mlevel1, level1, 32);
8307 memset(mlevel2, 0xFF, 16*count2);
8308 memset(mlevel3, 0, 128*count3);
8309 count3 = 0;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008310 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008311 int o1, o2, o3, i2, i3;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008312 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8313 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008314 /* unmapped character */
8315 continue;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008316 o1 = ch>>11;
8317 o2 = (ch>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008318 i2 = 16*mlevel1[o1] + o2;
8319 if (mlevel2[i2] == 0xFF)
8320 mlevel2[i2] = count3++;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008321 o3 = ch & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008322 i3 = 128*mlevel2[i2] + o3;
8323 mlevel3[i3] = i;
8324 }
8325 return result;
8326}
8327
8328static int
Victor Stinner22168992011-11-20 17:09:18 +01008329encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008330{
8331 struct encoding_map *map = (struct encoding_map*)mapping;
8332 int l1 = c>>11;
8333 int l2 = (c>>7) & 0xF;
8334 int l3 = c & 0x7F;
8335 int i;
8336
Victor Stinner22168992011-11-20 17:09:18 +01008337 if (c > 0xFFFF)
Benjamin Peterson29060642009-01-31 22:14:21 +00008338 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008339 if (c == 0)
8340 return 0;
8341 /* level 1*/
8342 i = map->level1[l1];
8343 if (i == 0xFF) {
8344 return -1;
8345 }
8346 /* level 2*/
8347 i = map->level23[16*i+l2];
8348 if (i == 0xFF) {
8349 return -1;
8350 }
8351 /* level 3 */
8352 i = map->level23[16*map->count2 + 128*i + l3];
8353 if (i == 0) {
8354 return -1;
8355 }
8356 return i;
8357}
8358
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008359/* Lookup the character ch in the mapping. If the character
8360 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00008361 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008362static PyObject *
Victor Stinner22168992011-11-20 17:09:18 +01008363charmapencode_lookup(Py_UCS4 c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008364{
Christian Heimes217cfd12007-12-02 14:31:20 +00008365 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008366 PyObject *x;
8367
8368 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008369 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008370 x = PyObject_GetItem(mapping, w);
8371 Py_DECREF(w);
8372 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008373 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8374 /* No mapping found means: mapping is undefined. */
8375 PyErr_Clear();
Serhiy Storchaka228b12e2017-01-23 09:47:21 +02008376 Py_RETURN_NONE;
Benjamin Peterson29060642009-01-31 22:14:21 +00008377 } else
8378 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008379 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00008380 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008381 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00008382 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008383 long value = PyLong_AS_LONG(x);
8384 if (value < 0 || value > 255) {
8385 PyErr_SetString(PyExc_TypeError,
8386 "character mapping must be in range(256)");
8387 Py_DECREF(x);
8388 return NULL;
8389 }
8390 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008391 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008392 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008393 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008394 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008395 /* wrong return value */
8396 PyErr_Format(PyExc_TypeError,
8397 "character mapping must return integer, bytes or None, not %.400s",
8398 x->ob_type->tp_name);
8399 Py_DECREF(x);
8400 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008401 }
8402}
8403
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008404static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00008405charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008406{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008407 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
8408 /* exponentially overallocate to minimize reallocations */
8409 if (requiredsize < 2*outsize)
8410 requiredsize = 2*outsize;
8411 if (_PyBytes_Resize(outobj, requiredsize))
8412 return -1;
8413 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008414}
8415
/* Outcome of trying to encode one character with charmapencode_output(). */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was written to the output buffer */
    enc_FAILED,     /* the mapping is undefined for the character */
    enc_EXCEPTION   /* an error occurred */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008419/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00008420 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008421 space is available. Return a new reference to the object that
8422 was put in the output buffer, or Py_None, if the mapping was undefined
8423 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00008424 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008425static charmapencode_result
Victor Stinner22168992011-11-20 17:09:18 +01008426charmapencode_output(Py_UCS4 c, PyObject *mapping,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008427 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008428{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008429 PyObject *rep;
8430 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00008431 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008432
Christian Heimes90aa7642007-12-19 02:45:37 +00008433 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008434 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008435 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008436 if (res == -1)
8437 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00008438 if (outsize<requiredsize)
8439 if (charmapencode_resize(outobj, outpos, requiredsize))
8440 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00008441 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008442 outstart[(*outpos)++] = (char)res;
8443 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008444 }
8445
8446 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008447 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008448 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008449 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008450 Py_DECREF(rep);
8451 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008452 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008453 if (PyLong_Check(rep)) {
8454 Py_ssize_t requiredsize = *outpos+1;
8455 if (outsize<requiredsize)
8456 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8457 Py_DECREF(rep);
8458 return enc_EXCEPTION;
8459 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008460 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008461 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008462 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008463 else {
8464 const char *repchars = PyBytes_AS_STRING(rep);
8465 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
8466 Py_ssize_t requiredsize = *outpos+repsize;
8467 if (outsize<requiredsize)
8468 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8469 Py_DECREF(rep);
8470 return enc_EXCEPTION;
8471 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008472 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008473 memcpy(outstart + *outpos, repchars, repsize);
8474 *outpos += repsize;
8475 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008476 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008477 Py_DECREF(rep);
8478 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008479}
8480
8481/* handle an error in PyUnicode_EncodeCharmap
8482 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008483static int
8484charmap_encoding_error(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008485 PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008486 PyObject **exceptionObject,
Victor Stinner50149202015-09-22 00:26:54 +02008487 _Py_error_handler *error_handler, PyObject **error_handler_obj, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00008488 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008489{
8490 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008491 Py_ssize_t size, repsize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008492 Py_ssize_t newpos;
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008493 enum PyUnicode_Kind kind;
8494 void *data;
8495 Py_ssize_t index;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008496 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008497 Py_ssize_t collstartpos = *inpos;
8498 Py_ssize_t collendpos = *inpos+1;
8499 Py_ssize_t collpos;
Serhiy Storchakae2f92de2017-11-11 13:06:26 +02008500 const char *encoding = "charmap";
8501 const char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008502 charmapencode_result x;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008503 Py_UCS4 ch;
Brian Curtin2787ea42011-11-02 15:09:37 -05008504 int val;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008505
Benjamin Petersonbac79492012-01-14 13:34:47 -05008506 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008507 return -1;
8508 size = PyUnicode_GET_LENGTH(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008509 /* find all unencodable characters */
8510 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008511 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00008512 if (Py_TYPE(mapping) == &EncodingMapType) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008513 ch = PyUnicode_READ_CHAR(unicode, collendpos);
Brian Curtin2787ea42011-11-02 15:09:37 -05008514 val = encoding_map_lookup(ch, mapping);
8515 if (val != -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008516 break;
8517 ++collendpos;
8518 continue;
8519 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008520
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008521 ch = PyUnicode_READ_CHAR(unicode, collendpos);
8522 rep = charmapencode_lookup(ch, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008523 if (rep==NULL)
8524 return -1;
8525 else if (rep!=Py_None) {
8526 Py_DECREF(rep);
8527 break;
8528 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008529 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00008530 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008531 }
8532 /* cache callback name lookup
8533 * (if not done yet, i.e. it's the first error) */
Victor Stinner50149202015-09-22 00:26:54 +02008534 if (*error_handler == _Py_ERROR_UNKNOWN)
Victor Stinner3d4226a2018-08-29 22:21:32 +02008535 *error_handler = _Py_GetErrorHandler(errors);
Victor Stinner50149202015-09-22 00:26:54 +02008536
8537 switch (*error_handler) {
8538 case _Py_ERROR_STRICT:
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008539 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008540 return -1;
Victor Stinner50149202015-09-22 00:26:54 +02008541
8542 case _Py_ERROR_REPLACE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008543 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008544 x = charmapencode_output('?', mapping, res, respos);
8545 if (x==enc_EXCEPTION) {
8546 return -1;
8547 }
8548 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008549 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008550 return -1;
8551 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008552 }
8553 /* fall through */
Victor Stinner50149202015-09-22 00:26:54 +02008554 case _Py_ERROR_IGNORE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008555 *inpos = collendpos;
8556 break;
Victor Stinner50149202015-09-22 00:26:54 +02008557
8558 case _Py_ERROR_XMLCHARREFREPLACE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008559 /* generate replacement (temporarily (mis)uses p) */
8560 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008561 char buffer[2+29+1+1];
8562 char *cp;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008563 sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
Benjamin Peterson29060642009-01-31 22:14:21 +00008564 for (cp = buffer; *cp; ++cp) {
8565 x = charmapencode_output(*cp, mapping, res, respos);
8566 if (x==enc_EXCEPTION)
8567 return -1;
8568 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008569 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008570 return -1;
8571 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008572 }
8573 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008574 *inpos = collendpos;
8575 break;
Victor Stinner50149202015-09-22 00:26:54 +02008576
Benjamin Peterson14339b62009-01-31 16:36:08 +00008577 default:
Victor Stinner50149202015-09-22 00:26:54 +02008578 repunicode = unicode_encode_call_errorhandler(errors, error_handler_obj,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008579 encoding, reason, unicode, exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00008580 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008581 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008582 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00008583 if (PyBytes_Check(repunicode)) {
8584 /* Directly copy bytes result to output. */
8585 Py_ssize_t outsize = PyBytes_Size(*res);
8586 Py_ssize_t requiredsize;
8587 repsize = PyBytes_Size(repunicode);
8588 requiredsize = *respos + repsize;
8589 if (requiredsize > outsize)
8590 /* Make room for all additional bytes. */
8591 if (charmapencode_resize(res, respos, requiredsize)) {
8592 Py_DECREF(repunicode);
8593 return -1;
8594 }
8595 memcpy(PyBytes_AsString(*res) + *respos,
8596 PyBytes_AsString(repunicode), repsize);
8597 *respos += repsize;
8598 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008599 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00008600 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008601 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008602 /* generate replacement */
Benjamin Petersonbac79492012-01-14 13:34:47 -05008603 if (PyUnicode_READY(repunicode) == -1) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008604 Py_DECREF(repunicode);
8605 return -1;
8606 }
Victor Stinner9e30aa52011-11-21 02:49:52 +01008607 repsize = PyUnicode_GET_LENGTH(repunicode);
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008608 data = PyUnicode_DATA(repunicode);
8609 kind = PyUnicode_KIND(repunicode);
8610 for (index = 0; index < repsize; index++) {
8611 Py_UCS4 repch = PyUnicode_READ(kind, data, index);
8612 x = charmapencode_output(repch, mapping, res, respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008613 if (x==enc_EXCEPTION) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008614 Py_DECREF(repunicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00008615 return -1;
8616 }
8617 else if (x==enc_FAILED) {
8618 Py_DECREF(repunicode);
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008619 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008620 return -1;
8621 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008622 }
8623 *inpos = newpos;
8624 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008625 }
8626 return 0;
8627}
8628
Alexander Belopolsky40018472011-02-26 01:02:56 +00008629PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008630_PyUnicode_EncodeCharmap(PyObject *unicode,
8631 PyObject *mapping,
8632 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008633{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008634 /* output object */
8635 PyObject *res = NULL;
8636 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008637 Py_ssize_t inpos = 0;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008638 Py_ssize_t size;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008639 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008640 Py_ssize_t respos = 0;
Victor Stinner50149202015-09-22 00:26:54 +02008641 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008642 PyObject *exc = NULL;
Victor Stinner50149202015-09-22 00:26:54 +02008643 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Victor Stinner69ed0f42013-04-09 21:48:24 +02008644 void *data;
8645 int kind;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008646
Benjamin Petersonbac79492012-01-14 13:34:47 -05008647 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008648 return NULL;
8649 size = PyUnicode_GET_LENGTH(unicode);
Victor Stinner69ed0f42013-04-09 21:48:24 +02008650 data = PyUnicode_DATA(unicode);
8651 kind = PyUnicode_KIND(unicode);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008652
Guido van Rossumd57fd912000-03-10 22:53:23 +00008653 /* Default to Latin-1 */
8654 if (mapping == NULL)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008655 return unicode_encode_ucs1(unicode, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008656
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008657 /* allocate enough for a simple encoding without
8658 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00008659 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008660 if (res == NULL)
8661 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00008662 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008663 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008664
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008665 while (inpos<size) {
Victor Stinner69ed0f42013-04-09 21:48:24 +02008666 Py_UCS4 ch = PyUnicode_READ(kind, data, inpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008667 /* try to encode it */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008668 charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008669 if (x==enc_EXCEPTION) /* error */
8670 goto onError;
8671 if (x==enc_FAILED) { /* unencodable character */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008672 if (charmap_encoding_error(unicode, &inpos, mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00008673 &exc,
Victor Stinner50149202015-09-22 00:26:54 +02008674 &error_handler, &error_handler_obj, errors,
Benjamin Peterson29060642009-01-31 22:14:21 +00008675 &res, &respos)) {
8676 goto onError;
8677 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008678 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008679 else
8680 /* done with this character => adjust input position */
8681 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008682 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008683
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008684 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00008685 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00008686 if (_PyBytes_Resize(&res, respos) < 0)
8687 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00008688
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008689 Py_XDECREF(exc);
Victor Stinner50149202015-09-22 00:26:54 +02008690 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008691 return res;
8692
Benjamin Peterson29060642009-01-31 22:14:21 +00008693 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008694 Py_XDECREF(res);
8695 Py_XDECREF(exc);
Victor Stinner50149202015-09-22 00:26:54 +02008696 Py_XDECREF(error_handler_obj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008697 return NULL;
8698}
8699
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008700/* Deprecated */
8701PyObject *
8702PyUnicode_EncodeCharmap(const Py_UNICODE *p,
8703 Py_ssize_t size,
8704 PyObject *mapping,
8705 const char *errors)
8706{
8707 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02008708 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008709 if (unicode == NULL)
8710 return NULL;
8711 result = _PyUnicode_EncodeCharmap(unicode, mapping, errors);
8712 Py_DECREF(unicode);
Victor Stinnerfc026c92011-11-04 00:24:51 +01008713 return result;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008714}
8715
Alexander Belopolsky40018472011-02-26 01:02:56 +00008716PyObject *
8717PyUnicode_AsCharmapString(PyObject *unicode,
8718 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008719{
8720 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008721 PyErr_BadArgument();
8722 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008723 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008724 return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008725}
8726
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008727/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008728static void
8729make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008730 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008731 Py_ssize_t startpos, Py_ssize_t endpos,
8732 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008733{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008734 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008735 *exceptionObject = _PyUnicodeTranslateError_Create(
8736 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008737 }
8738 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008739 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
8740 goto onError;
8741 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
8742 goto onError;
8743 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
8744 goto onError;
8745 return;
8746 onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02008747 Py_CLEAR(*exceptionObject);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008748 }
8749}
8750
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008751/* error handling callback helper:
8752 build arguments, call the callback and check the arguments,
8753 put the result into newpos and return the replacement string, which
8754 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008755static PyObject *
8756unicode_translate_call_errorhandler(const char *errors,
8757 PyObject **errorHandler,
8758 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008759 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008760 Py_ssize_t startpos, Py_ssize_t endpos,
8761 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008762{
Serhiy Storchakaf8d7d412016-10-23 15:12:25 +03008763 static const char *argparse = "Un;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008764
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008765 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008766 PyObject *restuple;
8767 PyObject *resunicode;
8768
8769 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008770 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008771 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008772 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008773 }
8774
8775 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008776 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008777 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008778 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008779
Jeroen Demeyer196a5302019-07-04 12:31:34 +02008780 restuple = _PyObject_CallOneArg(*errorHandler, *exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008781 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008782 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008783 if (!PyTuple_Check(restuple)) {
Serhiy Storchakaf8d7d412016-10-23 15:12:25 +03008784 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00008785 Py_DECREF(restuple);
8786 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008787 }
Serhiy Storchakaf8d7d412016-10-23 15:12:25 +03008788 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00008789 &resunicode, &i_newpos)) {
8790 Py_DECREF(restuple);
8791 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008792 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00008793 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008794 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008795 else
8796 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008797 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnera33bce02014-07-04 22:47:46 +02008798 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008799 Py_DECREF(restuple);
8800 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00008801 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008802 Py_INCREF(resunicode);
8803 Py_DECREF(restuple);
8804 return resunicode;
8805}
8806
8807/* Lookup the character ch in the mapping and put the result in result,
8808 which must be decrefed by the caller.
8809 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008810static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008811charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008812{
Christian Heimes217cfd12007-12-02 14:31:20 +00008813 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008814 PyObject *x;
8815
8816 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008817 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008818 x = PyObject_GetItem(mapping, w);
8819 Py_DECREF(w);
8820 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008821 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8822 /* No mapping found means: use 1:1 mapping. */
8823 PyErr_Clear();
8824 *result = NULL;
8825 return 0;
8826 } else
8827 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008828 }
8829 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008830 *result = x;
8831 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008832 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008833 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008834 long value = PyLong_AS_LONG(x);
Victor Stinner4ff33af2014-04-05 11:56:37 +02008835 if (value < 0 || value > MAX_UNICODE) {
8836 PyErr_Format(PyExc_ValueError,
8837 "character mapping must be in range(0x%x)",
8838 MAX_UNICODE+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00008839 Py_DECREF(x);
8840 return -1;
8841 }
8842 *result = x;
8843 return 0;
8844 }
8845 else if (PyUnicode_Check(x)) {
8846 *result = x;
8847 return 0;
8848 }
8849 else {
8850 /* wrong return value */
8851 PyErr_SetString(PyExc_TypeError,
8852 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00008853 Py_DECREF(x);
8854 return -1;
8855 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008856}
Victor Stinner1194ea02014-04-04 19:37:40 +02008857
8858/* lookup the character, write the result into the writer.
8859 Return 1 if the result was written into the writer, return 0 if the mapping
8860 was undefined, raise an exception return -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008861static int
Victor Stinner1194ea02014-04-04 19:37:40 +02008862charmaptranslate_output(Py_UCS4 ch, PyObject *mapping,
8863 _PyUnicodeWriter *writer)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008864{
Victor Stinner1194ea02014-04-04 19:37:40 +02008865 PyObject *item;
8866
8867 if (charmaptranslate_lookup(ch, mapping, &item))
Benjamin Peterson29060642009-01-31 22:14:21 +00008868 return -1;
Victor Stinner1194ea02014-04-04 19:37:40 +02008869
8870 if (item == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008871 /* not found => default to 1:1 mapping */
Victor Stinner1194ea02014-04-04 19:37:40 +02008872 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008873 return -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00008874 }
Victor Stinner1194ea02014-04-04 19:37:40 +02008875 return 1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008876 }
Victor Stinner1194ea02014-04-04 19:37:40 +02008877
8878 if (item == Py_None) {
8879 Py_DECREF(item);
8880 return 0;
8881 }
8882
8883 if (PyLong_Check(item)) {
Victor Stinner4ff33af2014-04-05 11:56:37 +02008884 long ch = (Py_UCS4)PyLong_AS_LONG(item);
8885 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
8886 used it */
Victor Stinner1194ea02014-04-04 19:37:40 +02008887 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
8888 Py_DECREF(item);
8889 return -1;
8890 }
8891 Py_DECREF(item);
8892 return 1;
8893 }
8894
8895 if (!PyUnicode_Check(item)) {
8896 Py_DECREF(item);
Benjamin Peterson29060642009-01-31 22:14:21 +00008897 return -1;
Victor Stinner1194ea02014-04-04 19:37:40 +02008898 }
8899
8900 if (_PyUnicodeWriter_WriteStr(writer, item) < 0) {
8901 Py_DECREF(item);
8902 return -1;
8903 }
8904
8905 Py_DECREF(item);
8906 return 1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008907}
8908
Victor Stinner89a76ab2014-04-05 11:44:04 +02008909static int
8910unicode_fast_translate_lookup(PyObject *mapping, Py_UCS1 ch,
8911 Py_UCS1 *translate)
8912{
Benjamin Peterson1365de72014-04-07 20:15:41 -04008913 PyObject *item = NULL;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008914 int ret = 0;
8915
Victor Stinner89a76ab2014-04-05 11:44:04 +02008916 if (charmaptranslate_lookup(ch, mapping, &item)) {
8917 return -1;
8918 }
8919
8920 if (item == Py_None) {
Benjamin Peterson1365de72014-04-07 20:15:41 -04008921 /* deletion */
Victor Stinner872b2912014-04-05 14:27:07 +02008922 translate[ch] = 0xfe;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008923 }
Benjamin Peterson1365de72014-04-07 20:15:41 -04008924 else if (item == NULL) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02008925 /* not found => default to 1:1 mapping */
8926 translate[ch] = ch;
8927 return 1;
8928 }
Benjamin Peterson1365de72014-04-07 20:15:41 -04008929 else if (PyLong_Check(item)) {
Victor Stinner4dd25252014-04-08 09:14:21 +02008930 long replace = PyLong_AS_LONG(item);
Victor Stinner4ff33af2014-04-05 11:56:37 +02008931 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
8932 used it */
8933 if (127 < replace) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02008934 /* invalid character or character outside ASCII:
8935 skip the fast translate */
8936 goto exit;
8937 }
8938 translate[ch] = (Py_UCS1)replace;
8939 }
8940 else if (PyUnicode_Check(item)) {
8941 Py_UCS4 replace;
8942
8943 if (PyUnicode_READY(item) == -1) {
8944 Py_DECREF(item);
8945 return -1;
8946 }
8947 if (PyUnicode_GET_LENGTH(item) != 1)
8948 goto exit;
8949
8950 replace = PyUnicode_READ_CHAR(item, 0);
8951 if (replace > 127)
8952 goto exit;
8953 translate[ch] = (Py_UCS1)replace;
8954 }
8955 else {
Benjamin Peterson1365de72014-04-07 20:15:41 -04008956 /* not None, NULL, long or unicode */
Victor Stinner89a76ab2014-04-05 11:44:04 +02008957 goto exit;
8958 }
Victor Stinner89a76ab2014-04-05 11:44:04 +02008959 ret = 1;
8960
Benjamin Peterson1365de72014-04-07 20:15:41 -04008961 exit:
8962 Py_DECREF(item);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008963 return ret;
8964}
8965
8966/* Fast path for ascii => ascii translation. Return 1 if the whole string
8967 was translated into writer, return 0 if the input string was partially
8968 translated into writer, raise an exception and return -1 on error. */
8969static int
8970unicode_fast_translate(PyObject *input, PyObject *mapping,
Victor Stinner6c9aa8f2016-03-01 21:30:30 +01008971 _PyUnicodeWriter *writer, int ignore,
8972 Py_ssize_t *input_pos)
Victor Stinner89a76ab2014-04-05 11:44:04 +02008973{
Victor Stinner872b2912014-04-05 14:27:07 +02008974 Py_UCS1 ascii_table[128], ch, ch2;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008975 Py_ssize_t len;
8976 Py_UCS1 *in, *end, *out;
Victor Stinner872b2912014-04-05 14:27:07 +02008977 int res = 0;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008978
Victor Stinner89a76ab2014-04-05 11:44:04 +02008979 len = PyUnicode_GET_LENGTH(input);
8980
Victor Stinner872b2912014-04-05 14:27:07 +02008981 memset(ascii_table, 0xff, 128);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008982
8983 in = PyUnicode_1BYTE_DATA(input);
8984 end = in + len;
8985
8986 assert(PyUnicode_IS_ASCII(writer->buffer));
8987 assert(PyUnicode_GET_LENGTH(writer->buffer) == len);
8988 out = PyUnicode_1BYTE_DATA(writer->buffer);
8989
Victor Stinner872b2912014-04-05 14:27:07 +02008990 for (; in < end; in++) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02008991 ch = *in;
Victor Stinner872b2912014-04-05 14:27:07 +02008992 ch2 = ascii_table[ch];
Victor Stinner89a76ab2014-04-05 11:44:04 +02008993 if (ch2 == 0xff) {
Victor Stinner872b2912014-04-05 14:27:07 +02008994 int translate = unicode_fast_translate_lookup(mapping, ch,
8995 ascii_table);
8996 if (translate < 0)
Victor Stinner89a76ab2014-04-05 11:44:04 +02008997 return -1;
Victor Stinner872b2912014-04-05 14:27:07 +02008998 if (translate == 0)
8999 goto exit;
9000 ch2 = ascii_table[ch];
Victor Stinner89a76ab2014-04-05 11:44:04 +02009001 }
Victor Stinner872b2912014-04-05 14:27:07 +02009002 if (ch2 == 0xfe) {
9003 if (ignore)
9004 continue;
9005 goto exit;
9006 }
9007 assert(ch2 < 128);
Victor Stinner89a76ab2014-04-05 11:44:04 +02009008 *out = ch2;
Victor Stinner872b2912014-04-05 14:27:07 +02009009 out++;
Victor Stinner89a76ab2014-04-05 11:44:04 +02009010 }
Victor Stinner872b2912014-04-05 14:27:07 +02009011 res = 1;
9012
9013exit:
9014 writer->pos = out - PyUnicode_1BYTE_DATA(writer->buffer);
Victor Stinner6c9aa8f2016-03-01 21:30:30 +01009015 *input_pos = in - PyUnicode_1BYTE_DATA(input);
Victor Stinner872b2912014-04-05 14:27:07 +02009016 return res;
Victor Stinner89a76ab2014-04-05 11:44:04 +02009017}
9018
Victor Stinner3222da22015-10-01 22:07:32 +02009019static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009020_PyUnicode_TranslateCharmap(PyObject *input,
9021 PyObject *mapping,
9022 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009023{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009024 /* input object */
Victor Stinner1194ea02014-04-04 19:37:40 +02009025 char *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009026 Py_ssize_t size, i;
9027 int kind;
9028 /* output buffer */
Victor Stinner1194ea02014-04-04 19:37:40 +02009029 _PyUnicodeWriter writer;
9030 /* error handler */
Serhiy Storchakae2f92de2017-11-11 13:06:26 +02009031 const char *reason = "character maps to <undefined>";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009032 PyObject *errorHandler = NULL;
9033 PyObject *exc = NULL;
Victor Stinner1194ea02014-04-04 19:37:40 +02009034 int ignore;
Victor Stinner89a76ab2014-04-05 11:44:04 +02009035 int res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009036
Guido van Rossumd57fd912000-03-10 22:53:23 +00009037 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009038 PyErr_BadArgument();
9039 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009040 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009041
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009042 if (PyUnicode_READY(input) == -1)
9043 return NULL;
Victor Stinner1194ea02014-04-04 19:37:40 +02009044 data = (char*)PyUnicode_DATA(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009045 kind = PyUnicode_KIND(input);
9046 size = PyUnicode_GET_LENGTH(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009047
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009048 if (size == 0)
9049 return PyUnicode_FromObject(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009050
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009051 /* allocate enough for a simple 1:1 translation without
9052 replacements, if we need more, we'll resize */
Victor Stinner1194ea02014-04-04 19:37:40 +02009053 _PyUnicodeWriter_Init(&writer);
9054 if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00009055 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009056
Victor Stinner872b2912014-04-05 14:27:07 +02009057 ignore = (errors != NULL && strcmp(errors, "ignore") == 0);
9058
Victor Stinner33798672016-03-01 21:59:58 +01009059 if (PyUnicode_READY(input) == -1)
Victor Stinner89a76ab2014-04-05 11:44:04 +02009060 return NULL;
Victor Stinner33798672016-03-01 21:59:58 +01009061 if (PyUnicode_IS_ASCII(input)) {
9062 res = unicode_fast_translate(input, mapping, &writer, ignore, &i);
9063 if (res < 0) {
9064 _PyUnicodeWriter_Dealloc(&writer);
9065 return NULL;
9066 }
9067 if (res == 1)
9068 return _PyUnicodeWriter_Finish(&writer);
Victor Stinner89a76ab2014-04-05 11:44:04 +02009069 }
Victor Stinner33798672016-03-01 21:59:58 +01009070 else {
9071 i = 0;
9072 }
Victor Stinner89a76ab2014-04-05 11:44:04 +02009073
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009074 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009075 /* try to encode it */
Victor Stinner1194ea02014-04-04 19:37:40 +02009076 int translate;
9077 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
9078 Py_ssize_t newpos;
9079 /* startpos for collecting untranslatable chars */
9080 Py_ssize_t collstart;
9081 Py_ssize_t collend;
Victor Stinner1194ea02014-04-04 19:37:40 +02009082 Py_UCS4 ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009083
Victor Stinner1194ea02014-04-04 19:37:40 +02009084 ch = PyUnicode_READ(kind, data, i);
9085 translate = charmaptranslate_output(ch, mapping, &writer);
9086 if (translate < 0)
9087 goto onError;
9088
9089 if (translate != 0) {
9090 /* it worked => adjust input pointer */
9091 ++i;
9092 continue;
9093 }
9094
9095 /* untranslatable character */
9096 collstart = i;
9097 collend = i+1;
9098
9099 /* find all untranslatable characters */
9100 while (collend < size) {
9101 PyObject *x;
9102 ch = PyUnicode_READ(kind, data, collend);
9103 if (charmaptranslate_lookup(ch, mapping, &x))
Benjamin Peterson14339b62009-01-31 16:36:08 +00009104 goto onError;
Victor Stinner1194ea02014-04-04 19:37:40 +02009105 Py_XDECREF(x);
9106 if (x != Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00009107 break;
Victor Stinner1194ea02014-04-04 19:37:40 +02009108 ++collend;
9109 }
9110
9111 if (ignore) {
9112 i = collend;
9113 }
9114 else {
9115 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
9116 reason, input, &exc,
9117 collstart, collend, &newpos);
9118 if (repunicode == NULL)
9119 goto onError;
9120 if (_PyUnicodeWriter_WriteStr(&writer, repunicode) < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009121 Py_DECREF(repunicode);
Victor Stinner1194ea02014-04-04 19:37:40 +02009122 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009123 }
Victor Stinner1194ea02014-04-04 19:37:40 +02009124 Py_DECREF(repunicode);
9125 i = newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009126 }
9127 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009128 Py_XDECREF(exc);
9129 Py_XDECREF(errorHandler);
Victor Stinner1194ea02014-04-04 19:37:40 +02009130 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009131
Benjamin Peterson29060642009-01-31 22:14:21 +00009132 onError:
Victor Stinner1194ea02014-04-04 19:37:40 +02009133 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009134 Py_XDECREF(exc);
9135 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009136 return NULL;
9137}
9138
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009139/* Deprecated. Use PyUnicode_Translate instead. */
9140PyObject *
9141PyUnicode_TranslateCharmap(const Py_UNICODE *p,
9142 Py_ssize_t size,
9143 PyObject *mapping,
9144 const char *errors)
9145{
Christian Heimes5f520f42012-09-11 14:03:25 +02009146 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02009147 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009148 if (!unicode)
9149 return NULL;
Christian Heimes5f520f42012-09-11 14:03:25 +02009150 result = _PyUnicode_TranslateCharmap(unicode, mapping, errors);
9151 Py_DECREF(unicode);
9152 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009153}
9154
Alexander Belopolsky40018472011-02-26 01:02:56 +00009155PyObject *
9156PyUnicode_Translate(PyObject *str,
9157 PyObject *mapping,
9158 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009159{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009160 if (ensure_unicode(str) < 0)
Christian Heimes5f520f42012-09-11 14:03:25 +02009161 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009162 return _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009163}
Tim Petersced69f82003-09-16 20:30:58 +00009164
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009165PyObject *
9166_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
9167{
9168 if (!PyUnicode_Check(unicode)) {
9169 PyErr_BadInternalCall();
9170 return NULL;
9171 }
9172 if (PyUnicode_READY(unicode) == -1)
9173 return NULL;
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009174 if (PyUnicode_IS_ASCII(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009175 /* If the string is already ASCII, just return the same string */
9176 Py_INCREF(unicode);
9177 return unicode;
9178 }
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009179
9180 Py_ssize_t len = PyUnicode_GET_LENGTH(unicode);
9181 PyObject *result = PyUnicode_New(len, 127);
9182 if (result == NULL) {
9183 return NULL;
9184 }
9185
9186 Py_UCS1 *out = PyUnicode_1BYTE_DATA(result);
9187 int kind = PyUnicode_KIND(unicode);
9188 const void *data = PyUnicode_DATA(unicode);
9189 Py_ssize_t i;
9190 for (i = 0; i < len; ++i) {
9191 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9192 if (ch < 127) {
9193 out[i] = ch;
9194 }
9195 else if (Py_UNICODE_ISSPACE(ch)) {
9196 out[i] = ' ';
9197 }
9198 else {
9199 int decimal = Py_UNICODE_TODECIMAL(ch);
9200 if (decimal < 0) {
9201 out[i] = '?';
INADA Naoki16dfca42018-07-14 12:06:43 +09009202 out[i+1] = '\0';
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009203 _PyUnicode_LENGTH(result) = i + 1;
9204 break;
9205 }
9206 out[i] = '0' + decimal;
9207 }
9208 }
9209
INADA Naoki16dfca42018-07-14 12:06:43 +09009210 assert(_PyUnicode_CheckConsistency(result, 1));
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009211 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009212}
9213
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009214PyObject *
9215PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
9216 Py_ssize_t length)
9217{
Victor Stinnerf0124502011-11-21 23:12:56 +01009218 PyObject *decimal;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009219 Py_ssize_t i;
Victor Stinnerf0124502011-11-21 23:12:56 +01009220 Py_UCS4 maxchar;
9221 enum PyUnicode_Kind kind;
9222 void *data;
9223
Victor Stinner99d7ad02012-02-22 13:37:39 +01009224 maxchar = 127;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009225 for (i = 0; i < length; i++) {
Victor Stinner12174a52014-08-15 23:17:38 +02009226 Py_UCS4 ch = s[i];
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009227 if (ch > 127) {
9228 int decimal = Py_UNICODE_TODECIMAL(ch);
9229 if (decimal >= 0)
Victor Stinnerf0124502011-11-21 23:12:56 +01009230 ch = '0' + decimal;
Benjamin Peterson7e303732013-06-10 09:19:46 -07009231 maxchar = Py_MAX(maxchar, ch);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009232 }
9233 }
Victor Stinnerf0124502011-11-21 23:12:56 +01009234
9235 /* Copy to a new string */
9236 decimal = PyUnicode_New(length, maxchar);
9237 if (decimal == NULL)
9238 return decimal;
9239 kind = PyUnicode_KIND(decimal);
9240 data = PyUnicode_DATA(decimal);
9241 /* Iterate over code points */
9242 for (i = 0; i < length; i++) {
Victor Stinner12174a52014-08-15 23:17:38 +02009243 Py_UCS4 ch = s[i];
Victor Stinnerf0124502011-11-21 23:12:56 +01009244 if (ch > 127) {
9245 int decimal = Py_UNICODE_TODECIMAL(ch);
9246 if (decimal >= 0)
9247 ch = '0' + decimal;
9248 }
9249 PyUnicode_WRITE(kind, data, i, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009250 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01009251 return unicode_result(decimal);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009252}
Guido van Rossum9e896b32000-04-05 20:11:21 +00009253/* --- Decimal Encoder ---------------------------------------------------- */
9254
Alexander Belopolsky40018472011-02-26 01:02:56 +00009255int
9256PyUnicode_EncodeDecimal(Py_UNICODE *s,
9257 Py_ssize_t length,
9258 char *output,
9259 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00009260{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01009261 PyObject *unicode;
Victor Stinner6345be92011-11-25 20:09:01 +01009262 Py_ssize_t i;
Victor Stinner42bf7752011-11-21 22:52:58 +01009263 enum PyUnicode_Kind kind;
9264 void *data;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009265
9266 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009267 PyErr_BadArgument();
9268 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009269 }
9270
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02009271 unicode = PyUnicode_FromWideChar(s, length);
Victor Stinner42bf7752011-11-21 22:52:58 +01009272 if (unicode == NULL)
9273 return -1;
9274
Victor Stinner42bf7752011-11-21 22:52:58 +01009275 kind = PyUnicode_KIND(unicode);
9276 data = PyUnicode_DATA(unicode);
9277
Victor Stinnerb84d7232011-11-22 01:50:07 +01009278 for (i=0; i < length; ) {
Victor Stinner6345be92011-11-25 20:09:01 +01009279 PyObject *exc;
9280 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00009281 int decimal;
Victor Stinner6345be92011-11-25 20:09:01 +01009282 Py_ssize_t startpos;
9283
9284 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +00009285
Benjamin Peterson29060642009-01-31 22:14:21 +00009286 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009287 *output++ = ' ';
Victor Stinnerb84d7232011-11-22 01:50:07 +01009288 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009289 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009290 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009291 decimal = Py_UNICODE_TODECIMAL(ch);
9292 if (decimal >= 0) {
9293 *output++ = '0' + decimal;
Victor Stinnerb84d7232011-11-22 01:50:07 +01009294 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009295 continue;
9296 }
9297 if (0 < ch && ch < 256) {
9298 *output++ = (char)ch;
Victor Stinnerb84d7232011-11-22 01:50:07 +01009299 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009300 continue;
9301 }
Victor Stinner6345be92011-11-25 20:09:01 +01009302
Victor Stinner42bf7752011-11-21 22:52:58 +01009303 startpos = i;
Victor Stinner6345be92011-11-25 20:09:01 +01009304 exc = NULL;
9305 raise_encode_exception(&exc, "decimal", unicode,
9306 startpos, startpos+1,
9307 "invalid decimal Unicode string");
9308 Py_XDECREF(exc);
9309 Py_DECREF(unicode);
9310 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009311 }
9312 /* 0-terminate the output string */
9313 *output++ = '\0';
Victor Stinner42bf7752011-11-21 22:52:58 +01009314 Py_DECREF(unicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00009315 return 0;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009316}
9317
Guido van Rossumd57fd912000-03-10 22:53:23 +00009318/* --- Helpers ------------------------------------------------------------ */
9319
/* helper macro to fixup start/end slice values

   Clamps `end` to `len`, and resolves negative `start`/`end` relative to
   `len` (flooring the result at 0).  `start` and `end` must be modifiable
   lvalues; all three arguments are evaluated more than once, so they must
   be free of side effects.

   The body is wrapped in do { ... } while (0) so that the macro expands
   to a single statement and can be used safely in an unbraced if/else;
   invoke it with a trailing semicolon. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len)) {                    \
            (end) = (len);                      \
        }                                       \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0) {                    \
                (end) = 0;                      \
            }                                   \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0) {                  \
                (start) = 0;                    \
            }                                   \
        }                                       \
    } while (0)
9334
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009335static Py_ssize_t
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009336any_find_slice(PyObject* s1, PyObject* s2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009337 Py_ssize_t start,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009338 Py_ssize_t end,
9339 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009340{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009341 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009342 void *buf1, *buf2;
9343 Py_ssize_t len1, len2, result;
9344
9345 kind1 = PyUnicode_KIND(s1);
9346 kind2 = PyUnicode_KIND(s2);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009347 if (kind1 < kind2)
9348 return -1;
9349
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009350 len1 = PyUnicode_GET_LENGTH(s1);
9351 len2 = PyUnicode_GET_LENGTH(s2);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009352 ADJUST_INDICES(start, end, len1);
9353 if (end - start < len2)
9354 return -1;
9355
9356 buf1 = PyUnicode_DATA(s1);
9357 buf2 = PyUnicode_DATA(s2);
9358 if (len2 == 1) {
9359 Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
9360 result = findchar((const char *)buf1 + kind1*start,
9361 kind1, end - start, ch, direction);
9362 if (result == -1)
9363 return -1;
9364 else
9365 return start + result;
9366 }
9367
9368 if (kind2 != kind1) {
9369 buf2 = _PyUnicode_AsKind(s2, kind1);
9370 if (!buf2)
9371 return -2;
9372 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009373
Victor Stinner794d5672011-10-10 03:21:36 +02009374 if (direction > 0) {
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009375 switch (kind1) {
Victor Stinner794d5672011-10-10 03:21:36 +02009376 case PyUnicode_1BYTE_KIND:
9377 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9378 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
9379 else
9380 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
9381 break;
9382 case PyUnicode_2BYTE_KIND:
9383 result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
9384 break;
9385 case PyUnicode_4BYTE_KIND:
9386 result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
9387 break;
9388 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07009389 Py_UNREACHABLE();
Victor Stinner794d5672011-10-10 03:21:36 +02009390 }
9391 }
9392 else {
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009393 switch (kind1) {
Victor Stinner794d5672011-10-10 03:21:36 +02009394 case PyUnicode_1BYTE_KIND:
9395 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9396 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
9397 else
9398 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9399 break;
9400 case PyUnicode_2BYTE_KIND:
9401 result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9402 break;
9403 case PyUnicode_4BYTE_KIND:
9404 result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9405 break;
9406 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07009407 Py_UNREACHABLE();
Victor Stinner794d5672011-10-10 03:21:36 +02009408 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009409 }
9410
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009411 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009412 PyMem_Free(buf2);
9413
9414 return result;
9415}
9416
Victor Stinner59423e32018-11-26 13:40:01 +01009417/* _PyUnicode_InsertThousandsGrouping() helper functions */
9418#include "stringlib/localeutil.h"
9419
9420/**
9421 * InsertThousandsGrouping:
9422 * @writer: Unicode writer.
9423 * @n_buffer: Number of characters in @buffer.
9424 * @digits: Digits we're reading from. If count is non-NULL, this is unused.
9425 * @d_pos: Start of digits string.
9426 * @n_digits: The number of digits in the string, in which we want
9427 * to put the grouping chars.
9428 * @min_width: The minimum width of the digits in the output string.
9429 * Output will be zero-padded on the left to fill.
9430 * @grouping: see definition in localeconv().
9431 * @thousands_sep: see definition in localeconv().
9432 *
9433 * There are 2 modes: counting and filling. If @writer is NULL,
9434 * we are in counting mode, else filling mode.
9435 * If counting, the required buffer size is returned.
9436 * If filling, we know the buffer will be large enough, so we don't
9437 * need to pass in the buffer size.
9438 * Inserts thousand grouping characters (as defined by grouping and
9439 * thousands_sep) into @writer.
9440 *
9441 * Return value: -1 on error, number of characters otherwise.
9442 **/
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009443Py_ssize_t
Victor Stinner41a863c2012-02-24 00:37:51 +01009444_PyUnicode_InsertThousandsGrouping(
Victor Stinner59423e32018-11-26 13:40:01 +01009445 _PyUnicodeWriter *writer,
Victor Stinner41a863c2012-02-24 00:37:51 +01009446 Py_ssize_t n_buffer,
Victor Stinner59423e32018-11-26 13:40:01 +01009447 PyObject *digits,
9448 Py_ssize_t d_pos,
9449 Py_ssize_t n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009450 Py_ssize_t min_width,
Victor Stinner59423e32018-11-26 13:40:01 +01009451 const char *grouping,
9452 PyObject *thousands_sep,
Victor Stinner41a863c2012-02-24 00:37:51 +01009453 Py_UCS4 *maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009454{
Xtreak3f7983a2019-01-07 20:39:14 +05309455 min_width = Py_MAX(0, min_width);
Victor Stinner59423e32018-11-26 13:40:01 +01009456 if (writer) {
9457 assert(digits != NULL);
9458 assert(maxchar == NULL);
Victor Stinner41a863c2012-02-24 00:37:51 +01009459 }
9460 else {
Victor Stinner59423e32018-11-26 13:40:01 +01009461 assert(digits == NULL);
9462 assert(maxchar != NULL);
Victor Stinner41a863c2012-02-24 00:37:51 +01009463 }
Victor Stinner59423e32018-11-26 13:40:01 +01009464 assert(0 <= d_pos);
9465 assert(0 <= n_digits);
Victor Stinner59423e32018-11-26 13:40:01 +01009466 assert(grouping != NULL);
9467
9468 if (digits != NULL) {
9469 if (PyUnicode_READY(digits) == -1) {
9470 return -1;
Victor Stinner90f50d42012-02-24 01:44:47 +01009471 }
Victor Stinner59423e32018-11-26 13:40:01 +01009472 }
9473 if (PyUnicode_READY(thousands_sep) == -1) {
9474 return -1;
Victor Stinner41a863c2012-02-24 00:37:51 +01009475 }
9476
Victor Stinner59423e32018-11-26 13:40:01 +01009477 Py_ssize_t count = 0;
9478 Py_ssize_t n_zeros;
9479 int loop_broken = 0;
9480 int use_separator = 0; /* First time through, don't append the
9481 separator. They only go between
9482 groups. */
9483 Py_ssize_t buffer_pos;
9484 Py_ssize_t digits_pos;
9485 Py_ssize_t len;
9486 Py_ssize_t n_chars;
9487 Py_ssize_t remaining = n_digits; /* Number of chars remaining to
9488 be looked at */
9489 /* A generator that returns all of the grouping widths, until it
9490 returns 0. */
9491 GroupGenerator groupgen;
9492 GroupGenerator_init(&groupgen, grouping);
9493 const Py_ssize_t thousands_sep_len = PyUnicode_GET_LENGTH(thousands_sep);
9494
9495 /* if digits are not grouped, thousands separator
9496 should be an empty string */
9497 assert(!(grouping[0] == CHAR_MAX && thousands_sep_len != 0));
9498
9499 digits_pos = d_pos + n_digits;
9500 if (writer) {
9501 buffer_pos = writer->pos + n_buffer;
9502 assert(buffer_pos <= PyUnicode_GET_LENGTH(writer->buffer));
9503 assert(digits_pos <= PyUnicode_GET_LENGTH(digits));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009504 }
Victor Stinner59423e32018-11-26 13:40:01 +01009505 else {
9506 buffer_pos = n_buffer;
Victor Stinner90f50d42012-02-24 01:44:47 +01009507 }
Victor Stinner59423e32018-11-26 13:40:01 +01009508
9509 if (!writer) {
Victor Stinner41a863c2012-02-24 00:37:51 +01009510 *maxchar = 127;
Victor Stinner41a863c2012-02-24 00:37:51 +01009511 }
Victor Stinner59423e32018-11-26 13:40:01 +01009512
9513 while ((len = GroupGenerator_next(&groupgen)) > 0) {
9514 len = Py_MIN(len, Py_MAX(Py_MAX(remaining, min_width), 1));
9515 n_zeros = Py_MAX(0, len - remaining);
9516 n_chars = Py_MAX(0, Py_MIN(remaining, len));
9517
9518 /* Use n_zero zero's and n_chars chars */
9519
9520 /* Count only, don't do anything. */
9521 count += (use_separator ? thousands_sep_len : 0) + n_zeros + n_chars;
9522
9523 /* Copy into the writer. */
9524 InsertThousandsGrouping_fill(writer, &buffer_pos,
9525 digits, &digits_pos,
9526 n_chars, n_zeros,
9527 use_separator ? thousands_sep : NULL,
9528 thousands_sep_len, maxchar);
9529
9530 /* Use a separator next time. */
9531 use_separator = 1;
9532
9533 remaining -= n_chars;
9534 min_width -= len;
9535
9536 if (remaining <= 0 && min_width <= 0) {
9537 loop_broken = 1;
9538 break;
9539 }
9540 min_width -= thousands_sep_len;
9541 }
9542 if (!loop_broken) {
9543 /* We left the loop without using a break statement. */
9544
9545 len = Py_MAX(Py_MAX(remaining, min_width), 1);
9546 n_zeros = Py_MAX(0, len - remaining);
9547 n_chars = Py_MAX(0, Py_MIN(remaining, len));
9548
9549 /* Use n_zero zero's and n_chars chars */
9550 count += (use_separator ? thousands_sep_len : 0) + n_zeros + n_chars;
9551
9552 /* Copy into the writer. */
9553 InsertThousandsGrouping_fill(writer, &buffer_pos,
9554 digits, &digits_pos,
9555 n_chars, n_zeros,
9556 use_separator ? thousands_sep : NULL,
9557 thousands_sep_len, maxchar);
9558 }
9559 return count;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009560}
9561
9562
Alexander Belopolsky40018472011-02-26 01:02:56 +00009563Py_ssize_t
9564PyUnicode_Count(PyObject *str,
9565 PyObject *substr,
9566 Py_ssize_t start,
9567 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009568{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009569 Py_ssize_t result;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009570 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009571 void *buf1 = NULL, *buf2 = NULL;
9572 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00009573
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009574 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009575 return -1;
Tim Petersced69f82003-09-16 20:30:58 +00009576
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009577 kind1 = PyUnicode_KIND(str);
9578 kind2 = PyUnicode_KIND(substr);
9579 if (kind1 < kind2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009580 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009581
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009582 len1 = PyUnicode_GET_LENGTH(str);
9583 len2 = PyUnicode_GET_LENGTH(substr);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009584 ADJUST_INDICES(start, end, len1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009585 if (end - start < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009586 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009587
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009588 buf1 = PyUnicode_DATA(str);
9589 buf2 = PyUnicode_DATA(substr);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009590 if (kind2 != kind1) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009591 buf2 = _PyUnicode_AsKind(substr, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009592 if (!buf2)
9593 goto onError;
9594 }
9595
9596 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009597 case PyUnicode_1BYTE_KIND:
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009598 if (PyUnicode_IS_ASCII(str) && PyUnicode_IS_ASCII(substr))
Victor Stinnerc3cec782011-10-05 21:24:08 +02009599 result = asciilib_count(
9600 ((Py_UCS1*)buf1) + start, end - start,
9601 buf2, len2, PY_SSIZE_T_MAX
9602 );
9603 else
9604 result = ucs1lib_count(
9605 ((Py_UCS1*)buf1) + start, end - start,
9606 buf2, len2, PY_SSIZE_T_MAX
9607 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009608 break;
9609 case PyUnicode_2BYTE_KIND:
9610 result = ucs2lib_count(
9611 ((Py_UCS2*)buf1) + start, end - start,
9612 buf2, len2, PY_SSIZE_T_MAX
9613 );
9614 break;
9615 case PyUnicode_4BYTE_KIND:
9616 result = ucs4lib_count(
9617 ((Py_UCS4*)buf1) + start, end - start,
9618 buf2, len2, PY_SSIZE_T_MAX
9619 );
9620 break;
9621 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07009622 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009623 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009624
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009625 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009626 PyMem_Free(buf2);
9627
Guido van Rossumd57fd912000-03-10 22:53:23 +00009628 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009629 onError:
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009630 if (kind2 != kind1 && buf2)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009631 PyMem_Free(buf2);
9632 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009633}
9634
Alexander Belopolsky40018472011-02-26 01:02:56 +00009635Py_ssize_t
9636PyUnicode_Find(PyObject *str,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009637 PyObject *substr,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009638 Py_ssize_t start,
9639 Py_ssize_t end,
9640 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009641{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009642 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009643 return -2;
Tim Petersced69f82003-09-16 20:30:58 +00009644
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009645 return any_find_slice(str, substr, start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009646}
9647
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009648Py_ssize_t
9649PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
9650 Py_ssize_t start, Py_ssize_t end,
9651 int direction)
9652{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009653 int kind;
Xiang Zhangb2110682016-12-20 22:52:33 +08009654 Py_ssize_t len, result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009655 if (PyUnicode_READY(str) == -1)
9656 return -2;
Xiang Zhangb2110682016-12-20 22:52:33 +08009657 len = PyUnicode_GET_LENGTH(str);
9658 ADJUST_INDICES(start, end, len);
9659 if (end - start < 1)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009660 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009661 kind = PyUnicode_KIND(str);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009662 result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
9663 kind, end-start, ch, direction);
9664 if (result == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009665 return -1;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009666 else
9667 return start + result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009668}
9669
Alexander Belopolsky40018472011-02-26 01:02:56 +00009670static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009671tailmatch(PyObject *self,
9672 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009673 Py_ssize_t start,
9674 Py_ssize_t end,
9675 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009676{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009677 int kind_self;
9678 int kind_sub;
9679 void *data_self;
9680 void *data_sub;
9681 Py_ssize_t offset;
9682 Py_ssize_t i;
9683 Py_ssize_t end_sub;
9684
9685 if (PyUnicode_READY(self) == -1 ||
9686 PyUnicode_READY(substring) == -1)
Victor Stinner18aa4472013-01-03 03:18:09 +01009687 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009688
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009689 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
9690 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009691 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00009692 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009693
Serhiy Storchakad4ea03c2015-05-31 09:15:51 +03009694 if (PyUnicode_GET_LENGTH(substring) == 0)
9695 return 1;
9696
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009697 kind_self = PyUnicode_KIND(self);
9698 data_self = PyUnicode_DATA(self);
9699 kind_sub = PyUnicode_KIND(substring);
9700 data_sub = PyUnicode_DATA(substring);
9701 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
9702
9703 if (direction > 0)
9704 offset = end;
9705 else
9706 offset = start;
9707
9708 if (PyUnicode_READ(kind_self, data_self, offset) ==
9709 PyUnicode_READ(kind_sub, data_sub, 0) &&
9710 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
9711 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
9712 /* If both are of the same kind, memcmp is sufficient */
9713 if (kind_self == kind_sub) {
9714 return ! memcmp((char *)data_self +
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009715 (offset * PyUnicode_KIND(substring)),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009716 data_sub,
9717 PyUnicode_GET_LENGTH(substring) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009718 PyUnicode_KIND(substring));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009719 }
Martin Pantere26da7c2016-06-02 10:07:09 +00009720 /* otherwise we have to compare each character by first accessing it */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009721 else {
9722 /* We do not need to compare 0 and len(substring)-1 because
9723 the if statement above ensured already that they are equal
9724 when we end up here. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009725 for (i = 1; i < end_sub; ++i) {
9726 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
9727 PyUnicode_READ(kind_sub, data_sub, i))
9728 return 0;
9729 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009730 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009731 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009732 }
9733
9734 return 0;
9735}
9736
Alexander Belopolsky40018472011-02-26 01:02:56 +00009737Py_ssize_t
9738PyUnicode_Tailmatch(PyObject *str,
9739 PyObject *substr,
9740 Py_ssize_t start,
9741 Py_ssize_t end,
9742 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009743{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009744 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009745 return -1;
Tim Petersced69f82003-09-16 20:30:58 +00009746
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009747 return tailmatch(str, substr, start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009748}
9749
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009750static PyObject *
9751ascii_upper_or_lower(PyObject *self, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009752{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009753 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9754 char *resdata, *data = PyUnicode_DATA(self);
9755 PyObject *res;
Tim Petersced69f82003-09-16 20:30:58 +00009756
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009757 res = PyUnicode_New(len, 127);
9758 if (res == NULL)
9759 return NULL;
9760 resdata = PyUnicode_DATA(res);
9761 if (lower)
9762 _Py_bytes_lower(resdata, data, len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009763 else
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009764 _Py_bytes_upper(resdata, data, len);
9765 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009766}
9767
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009768static Py_UCS4
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009769handle_capital_sigma(int kind, void *data, Py_ssize_t length, Py_ssize_t i)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009770{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009771 Py_ssize_t j;
9772 int final_sigma;
Victor Stinner0c39b1b2015-03-18 15:02:06 +01009773 Py_UCS4 c = 0; /* initialize to prevent gcc warning */
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009774 /* U+03A3 is in the Final_Sigma context when, it is found like this:
Tim Petersced69f82003-09-16 20:30:58 +00009775
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009776 \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased})
9777
9778 where ! is a negation and \p{xxx} is a character with property xxx.
9779 */
9780 for (j = i - 1; j >= 0; j--) {
9781 c = PyUnicode_READ(kind, data, j);
9782 if (!_PyUnicode_IsCaseIgnorable(c))
9783 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009784 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009785 final_sigma = j >= 0 && _PyUnicode_IsCased(c);
9786 if (final_sigma) {
9787 for (j = i + 1; j < length; j++) {
9788 c = PyUnicode_READ(kind, data, j);
9789 if (!_PyUnicode_IsCaseIgnorable(c))
9790 break;
9791 }
9792 final_sigma = j == length || !_PyUnicode_IsCased(c);
9793 }
9794 return (final_sigma) ? 0x3C2 : 0x3C3;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009795}
9796
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009797static int
9798lower_ucs4(int kind, void *data, Py_ssize_t length, Py_ssize_t i,
9799 Py_UCS4 c, Py_UCS4 *mapped)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009800{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009801 /* Obscure special case. */
9802 if (c == 0x3A3) {
9803 mapped[0] = handle_capital_sigma(kind, data, length, i);
9804 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009805 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009806 return _PyUnicode_ToLowerFull(c, mapped);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009807}
9808
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009809static Py_ssize_t
9810do_capitalize(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009811{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009812 Py_ssize_t i, k = 0;
9813 int n_res, j;
9814 Py_UCS4 c, mapped[3];
Tim Petersced69f82003-09-16 20:30:58 +00009815
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009816 c = PyUnicode_READ(kind, data, 0);
Kingsley Mb015fc82019-04-12 16:35:39 +01009817 n_res = _PyUnicode_ToTitleFull(c, mapped);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009818 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009819 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009820 res[k++] = mapped[j];
Guido van Rossumd57fd912000-03-10 22:53:23 +00009821 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009822 for (i = 1; i < length; i++) {
9823 c = PyUnicode_READ(kind, data, i);
9824 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9825 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009826 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009827 res[k++] = mapped[j];
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009828 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009829 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009830 return k;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009831}
9832
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009833static Py_ssize_t
9834do_swapcase(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) {
9835 Py_ssize_t i, k = 0;
9836
9837 for (i = 0; i < length; i++) {
9838 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9839 int n_res, j;
9840 if (Py_UNICODE_ISUPPER(c)) {
9841 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9842 }
9843 else if (Py_UNICODE_ISLOWER(c)) {
9844 n_res = _PyUnicode_ToUpperFull(c, mapped);
9845 }
9846 else {
9847 n_res = 1;
9848 mapped[0] = c;
9849 }
9850 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009851 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009852 res[k++] = mapped[j];
9853 }
9854 }
9855 return k;
9856}
9857
9858static Py_ssize_t
9859do_upper_or_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res,
9860 Py_UCS4 *maxchar, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009861{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009862 Py_ssize_t i, k = 0;
9863
9864 for (i = 0; i < length; i++) {
9865 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9866 int n_res, j;
9867 if (lower)
9868 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9869 else
9870 n_res = _PyUnicode_ToUpperFull(c, mapped);
9871 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009872 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009873 res[k++] = mapped[j];
9874 }
9875 }
9876 return k;
9877}
9878
9879static Py_ssize_t
9880do_upper(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9881{
9882 return do_upper_or_lower(kind, data, length, res, maxchar, 0);
9883}
9884
9885static Py_ssize_t
9886do_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9887{
9888 return do_upper_or_lower(kind, data, length, res, maxchar, 1);
9889}
9890
Benjamin Petersone51757f2012-01-12 21:10:29 -05009891static Py_ssize_t
Benjamin Petersond5890c82012-01-14 13:23:30 -05009892do_casefold(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9893{
9894 Py_ssize_t i, k = 0;
9895
9896 for (i = 0; i < length; i++) {
9897 Py_UCS4 c = PyUnicode_READ(kind, data, i);
9898 Py_UCS4 mapped[3];
9899 int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
9900 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009901 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersond5890c82012-01-14 13:23:30 -05009902 res[k++] = mapped[j];
9903 }
9904 }
9905 return k;
9906}
9907
9908static Py_ssize_t
Benjamin Petersone51757f2012-01-12 21:10:29 -05009909do_title(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9910{
9911 Py_ssize_t i, k = 0;
9912 int previous_is_cased;
9913
9914 previous_is_cased = 0;
9915 for (i = 0; i < length; i++) {
9916 const Py_UCS4 c = PyUnicode_READ(kind, data, i);
9917 Py_UCS4 mapped[3];
9918 int n_res, j;
9919
9920 if (previous_is_cased)
9921 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9922 else
9923 n_res = _PyUnicode_ToTitleFull(c, mapped);
9924
9925 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009926 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersone51757f2012-01-12 21:10:29 -05009927 res[k++] = mapped[j];
9928 }
9929
9930 previous_is_cased = _PyUnicode_IsCased(c);
9931 }
9932 return k;
9933}
9934
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009935static PyObject *
9936case_operation(PyObject *self,
9937 Py_ssize_t (*perform)(int, void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *))
9938{
9939 PyObject *res = NULL;
9940 Py_ssize_t length, newlength = 0;
9941 int kind, outkind;
9942 void *data, *outdata;
9943 Py_UCS4 maxchar = 0, *tmp, *tmpend;
9944
Benjamin Petersoneea48462012-01-16 14:28:50 -05009945 assert(PyUnicode_IS_READY(self));
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009946
9947 kind = PyUnicode_KIND(self);
9948 data = PyUnicode_DATA(self);
9949 length = PyUnicode_GET_LENGTH(self);
Antoine Pitrou4e334242014-10-15 23:14:53 +02009950 if ((size_t) length > PY_SSIZE_T_MAX / (3 * sizeof(Py_UCS4))) {
Benjamin Petersone1bd38c2014-10-15 11:47:36 -04009951 PyErr_SetString(PyExc_OverflowError, "string is too long");
9952 return NULL;
9953 }
Benjamin Peterson1e211ff2014-10-15 12:17:21 -04009954 tmp = PyMem_MALLOC(sizeof(Py_UCS4) * 3 * length);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009955 if (tmp == NULL)
9956 return PyErr_NoMemory();
9957 newlength = perform(kind, data, length, tmp, &maxchar);
9958 res = PyUnicode_New(newlength, maxchar);
9959 if (res == NULL)
9960 goto leave;
9961 tmpend = tmp + newlength;
9962 outdata = PyUnicode_DATA(res);
9963 outkind = PyUnicode_KIND(res);
9964 switch (outkind) {
9965 case PyUnicode_1BYTE_KIND:
9966 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata);
9967 break;
9968 case PyUnicode_2BYTE_KIND:
9969 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata);
9970 break;
9971 case PyUnicode_4BYTE_KIND:
9972 memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength);
9973 break;
9974 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07009975 Py_UNREACHABLE();
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009976 }
9977 leave:
9978 PyMem_FREE(tmp);
9979 return res;
9980}
9981
Tim Peters8ce9f162004-08-27 01:49:32 +00009982PyObject *
9983PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009984{
Serhiy Storchakaea525a22016-09-06 22:07:53 +03009985 PyObject *res;
9986 PyObject *fseq;
9987 Py_ssize_t seqlen;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009988 PyObject **items;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009989
Benjamin Peterson9743b2c2014-02-15 13:02:52 -05009990 fseq = PySequence_Fast(seq, "can only join an iterable");
Tim Peters05eba1f2004-08-27 21:32:02 +00009991 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009992 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00009993 }
9994
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009995 /* NOTE: the following code can't call back into Python code,
9996 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00009997 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009998
Serhiy Storchakaea525a22016-09-06 22:07:53 +03009999 items = PySequence_Fast_ITEMS(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +000010000 seqlen = PySequence_Fast_GET_SIZE(fseq);
Serhiy Storchakaea525a22016-09-06 22:07:53 +030010001 res = _PyUnicode_JoinArray(separator, items, seqlen);
10002 Py_DECREF(fseq);
10003 return res;
10004}
10005
10006PyObject *
Serhiy Storchakaa5552f02017-12-15 13:11:11 +020010007_PyUnicode_JoinArray(PyObject *separator, PyObject *const *items, Py_ssize_t seqlen)
Serhiy Storchakaea525a22016-09-06 22:07:53 +030010008{
10009 PyObject *res = NULL; /* the result */
10010 PyObject *sep = NULL;
10011 Py_ssize_t seplen;
10012 PyObject *item;
10013 Py_ssize_t sz, i, res_offset;
10014 Py_UCS4 maxchar;
10015 Py_UCS4 item_maxchar;
10016 int use_memcpy;
10017 unsigned char *res_data = NULL, *sep_data = NULL;
10018 PyObject *last_obj;
10019 unsigned int kind = 0;
10020
Tim Peters05eba1f2004-08-27 21:32:02 +000010021 /* If empty sequence, return u"". */
10022 if (seqlen == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020010023 _Py_RETURN_UNICODE_EMPTY();
Tim Peters05eba1f2004-08-27 21:32:02 +000010024 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020010025
Tim Peters05eba1f2004-08-27 21:32:02 +000010026 /* If singleton sequence with an exact Unicode, return that. */
Victor Stinnerdd077322011-10-07 17:02:31 +020010027 last_obj = NULL;
Victor Stinneracf47b82011-10-06 12:32:37 +020010028 if (seqlen == 1) {
10029 if (PyUnicode_CheckExact(items[0])) {
10030 res = items[0];
10031 Py_INCREF(res);
Victor Stinneracf47b82011-10-06 12:32:37 +020010032 return res;
10033 }
Victor Stinnerdd077322011-10-07 17:02:31 +020010034 seplen = 0;
Victor Stinnerc6f0df72011-10-06 15:58:54 +020010035 maxchar = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +000010036 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010037 else {
Victor Stinneracf47b82011-10-06 12:32:37 +020010038 /* Set up sep and seplen */
10039 if (separator == NULL) {
10040 /* fall back to a blank space separator */
10041 sep = PyUnicode_FromOrdinal(' ');
10042 if (!sep)
10043 goto onError;
Victor Stinnerdd077322011-10-07 17:02:31 +020010044 seplen = 1;
Victor Stinneracf47b82011-10-06 12:32:37 +020010045 maxchar = 32;
Tim Peters05eba1f2004-08-27 21:32:02 +000010046 }
Victor Stinneracf47b82011-10-06 12:32:37 +020010047 else {
10048 if (!PyUnicode_Check(separator)) {
10049 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020010050 "separator: expected str instance,"
10051 " %.80s found",
10052 Py_TYPE(separator)->tp_name);
Victor Stinneracf47b82011-10-06 12:32:37 +020010053 goto onError;
10054 }
10055 if (PyUnicode_READY(separator))
10056 goto onError;
10057 sep = separator;
10058 seplen = PyUnicode_GET_LENGTH(separator);
10059 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
10060 /* inc refcount to keep this code path symmetric with the
10061 above case of a blank separator */
10062 Py_INCREF(sep);
10063 }
Victor Stinnerdd077322011-10-07 17:02:31 +020010064 last_obj = sep;
Tim Peters05eba1f2004-08-27 21:32:02 +000010065 }
10066
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010067 /* There are at least two things to join, or else we have a subclass
10068 * of str in the sequence.
10069 * Do a pre-pass to figure out the total amount of space we'll
10070 * need (sz), and see whether all argument are strings.
10071 */
10072 sz = 0;
Victor Stinnerdd077322011-10-07 17:02:31 +020010073#ifdef Py_DEBUG
10074 use_memcpy = 0;
10075#else
10076 use_memcpy = 1;
10077#endif
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010078 for (i = 0; i < seqlen; i++) {
Xiang Zhangb0541f42017-01-10 10:52:00 +080010079 size_t add_sz;
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010080 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +000010081 if (!PyUnicode_Check(item)) {
10082 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020010083 "sequence item %zd: expected str instance,"
10084 " %.80s found",
10085 i, Py_TYPE(item)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000010086 goto onError;
10087 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010088 if (PyUnicode_READY(item) == -1)
10089 goto onError;
Xiang Zhangb0541f42017-01-10 10:52:00 +080010090 add_sz = PyUnicode_GET_LENGTH(item);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010091 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010092 maxchar = Py_MAX(maxchar, item_maxchar);
Xiang Zhangb0541f42017-01-10 10:52:00 +080010093 if (i != 0) {
10094 add_sz += seplen;
10095 }
10096 if (add_sz > (size_t)(PY_SSIZE_T_MAX - sz)) {
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010097 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010098 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010099 goto onError;
10100 }
Xiang Zhangb0541f42017-01-10 10:52:00 +080010101 sz += add_sz;
Victor Stinnerdd077322011-10-07 17:02:31 +020010102 if (use_memcpy && last_obj != NULL) {
10103 if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
10104 use_memcpy = 0;
10105 }
10106 last_obj = item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010107 }
Tim Petersced69f82003-09-16 20:30:58 +000010108
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010109 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010110 if (res == NULL)
10111 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +000010112
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010113 /* Catenate everything. */
Victor Stinnerdd077322011-10-07 17:02:31 +020010114#ifdef Py_DEBUG
10115 use_memcpy = 0;
10116#else
10117 if (use_memcpy) {
10118 res_data = PyUnicode_1BYTE_DATA(res);
10119 kind = PyUnicode_KIND(res);
10120 if (seplen != 0)
10121 sep_data = PyUnicode_1BYTE_DATA(sep);
10122 }
10123#endif
Victor Stinner4560f9c2013-04-14 18:56:46 +020010124 if (use_memcpy) {
10125 for (i = 0; i < seqlen; ++i) {
10126 Py_ssize_t itemlen;
10127 item = items[i];
10128
10129 /* Copy item, and maybe the separator. */
10130 if (i && seplen != 0) {
Christian Heimesf051e432016-09-13 20:22:02 +020010131 memcpy(res_data,
Victor Stinnerdd077322011-10-07 17:02:31 +020010132 sep_data,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010133 kind * seplen);
10134 res_data += kind * seplen;
Victor Stinnerdd077322011-10-07 17:02:31 +020010135 }
Victor Stinner4560f9c2013-04-14 18:56:46 +020010136
10137 itemlen = PyUnicode_GET_LENGTH(item);
10138 if (itemlen != 0) {
Christian Heimesf051e432016-09-13 20:22:02 +020010139 memcpy(res_data,
Victor Stinnerdd077322011-10-07 17:02:31 +020010140 PyUnicode_DATA(item),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010141 kind * itemlen);
10142 res_data += kind * itemlen;
Victor Stinnerdd077322011-10-07 17:02:31 +020010143 }
Victor Stinner4560f9c2013-04-14 18:56:46 +020010144 }
10145 assert(res_data == PyUnicode_1BYTE_DATA(res)
10146 + kind * PyUnicode_GET_LENGTH(res));
10147 }
10148 else {
10149 for (i = 0, res_offset = 0; i < seqlen; ++i) {
10150 Py_ssize_t itemlen;
10151 item = items[i];
10152
10153 /* Copy item, and maybe the separator. */
10154 if (i && seplen != 0) {
10155 _PyUnicode_FastCopyCharacters(res, res_offset, sep, 0, seplen);
10156 res_offset += seplen;
10157 }
10158
10159 itemlen = PyUnicode_GET_LENGTH(item);
10160 if (itemlen != 0) {
Victor Stinnerd3f08822012-05-29 12:57:52 +020010161 _PyUnicode_FastCopyCharacters(res, res_offset, item, 0, itemlen);
Victor Stinnerdd077322011-10-07 17:02:31 +020010162 res_offset += itemlen;
10163 }
Victor Stinner9ce5a832011-10-03 23:36:02 +020010164 }
Victor Stinnerdd077322011-10-07 17:02:31 +020010165 assert(res_offset == PyUnicode_GET_LENGTH(res));
Victor Stinner4560f9c2013-04-14 18:56:46 +020010166 }
Tim Peters8ce9f162004-08-27 01:49:32 +000010167
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010168 Py_XDECREF(sep);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010169 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010170 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010171
Benjamin Peterson29060642009-01-31 22:14:21 +000010172 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010173 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +000010174 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010175 return NULL;
10176}
10177
Victor Stinnerd3f08822012-05-29 12:57:52 +020010178void
10179_PyUnicode_FastFill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10180 Py_UCS4 fill_char)
10181{
10182 const enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
Victor Stinner163403a2018-11-27 12:41:17 +010010183 void *data = PyUnicode_DATA(unicode);
Victor Stinnerd3f08822012-05-29 12:57:52 +020010184 assert(PyUnicode_IS_READY(unicode));
10185 assert(unicode_modifiable(unicode));
10186 assert(fill_char <= PyUnicode_MAX_CHAR_VALUE(unicode));
10187 assert(start >= 0);
10188 assert(start + length <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner59423e32018-11-26 13:40:01 +010010189 unicode_fill(kind, data, fill_char, start, length);
Victor Stinnerd3f08822012-05-29 12:57:52 +020010190}
10191
Victor Stinner3fe55312012-01-04 00:33:50 +010010192Py_ssize_t
10193PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10194 Py_UCS4 fill_char)
10195{
10196 Py_ssize_t maxlen;
Victor Stinner3fe55312012-01-04 00:33:50 +010010197
10198 if (!PyUnicode_Check(unicode)) {
10199 PyErr_BadInternalCall();
10200 return -1;
10201 }
10202 if (PyUnicode_READY(unicode) == -1)
10203 return -1;
10204 if (unicode_check_modifiable(unicode))
10205 return -1;
10206
Victor Stinnerd3f08822012-05-29 12:57:52 +020010207 if (start < 0) {
10208 PyErr_SetString(PyExc_IndexError, "string index out of range");
10209 return -1;
10210 }
Victor Stinner3fe55312012-01-04 00:33:50 +010010211 if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) {
10212 PyErr_SetString(PyExc_ValueError,
10213 "fill character is bigger than "
10214 "the string maximum character");
10215 return -1;
10216 }
10217
10218 maxlen = PyUnicode_GET_LENGTH(unicode) - start;
10219 length = Py_MIN(maxlen, length);
10220 if (length <= 0)
10221 return 0;
10222
Victor Stinnerd3f08822012-05-29 12:57:52 +020010223 _PyUnicode_FastFill(unicode, start, length, fill_char);
Victor Stinner3fe55312012-01-04 00:33:50 +010010224 return length;
10225}
10226
Victor Stinner9310abb2011-10-05 00:59:23 +020010227static PyObject *
10228pad(PyObject *self,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010229 Py_ssize_t left,
10230 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010231 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010232{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010233 PyObject *u;
10234 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010235 int kind;
10236 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010237
10238 if (left < 0)
10239 left = 0;
10240 if (right < 0)
10241 right = 0;
10242
Victor Stinnerc4b49542011-12-11 22:44:26 +010010243 if (left == 0 && right == 0)
10244 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010245
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010246 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
10247 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +000010248 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
10249 return NULL;
10250 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010251 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010252 maxchar = Py_MAX(maxchar, fill);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010253 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010254 if (!u)
10255 return NULL;
10256
10257 kind = PyUnicode_KIND(u);
10258 data = PyUnicode_DATA(u);
10259 if (left)
Victor Stinner59423e32018-11-26 13:40:01 +010010260 unicode_fill(kind, data, fill, 0, left);
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010261 if (right)
Victor Stinner59423e32018-11-26 13:40:01 +010010262 unicode_fill(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinnerd3f08822012-05-29 12:57:52 +020010263 _PyUnicode_FastCopyCharacters(u, left, self, 0, _PyUnicode_LENGTH(self));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010264 assert(_PyUnicode_CheckConsistency(u, 1));
10265 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010266}
10267
Alexander Belopolsky40018472011-02-26 01:02:56 +000010268PyObject *
10269PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010270{
Guido van Rossumd57fd912000-03-10 22:53:23 +000010271 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010272
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010273 if (ensure_unicode(string) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010274 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010275
Benjamin Petersonead6b532011-12-20 17:23:42 -060010276 switch (PyUnicode_KIND(string)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010277 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010278 if (PyUnicode_IS_ASCII(string))
10279 list = asciilib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010280 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010281 PyUnicode_GET_LENGTH(string), keepends);
10282 else
10283 list = ucs1lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010284 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010285 PyUnicode_GET_LENGTH(string), keepends);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010286 break;
10287 case PyUnicode_2BYTE_KIND:
10288 list = ucs2lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010289 string, PyUnicode_2BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010290 PyUnicode_GET_LENGTH(string), keepends);
10291 break;
10292 case PyUnicode_4BYTE_KIND:
10293 list = ucs4lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010294 string, PyUnicode_4BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010295 PyUnicode_GET_LENGTH(string), keepends);
10296 break;
10297 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010298 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010299 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010300 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010301}
10302
Alexander Belopolsky40018472011-02-26 01:02:56 +000010303static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010304split(PyObject *self,
10305 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010306 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010307{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010308 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010309 void *buf1, *buf2;
10310 Py_ssize_t len1, len2;
10311 PyObject* out;
10312
Guido van Rossumd57fd912000-03-10 22:53:23 +000010313 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010314 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010315
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010316 if (PyUnicode_READY(self) == -1)
10317 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010318
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010319 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010320 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010321 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010322 if (PyUnicode_IS_ASCII(self))
10323 return asciilib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010324 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010325 PyUnicode_GET_LENGTH(self), maxcount
10326 );
10327 else
10328 return ucs1lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010329 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010330 PyUnicode_GET_LENGTH(self), maxcount
10331 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010332 case PyUnicode_2BYTE_KIND:
10333 return ucs2lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010334 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010335 PyUnicode_GET_LENGTH(self), maxcount
10336 );
10337 case PyUnicode_4BYTE_KIND:
10338 return ucs4lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010339 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010340 PyUnicode_GET_LENGTH(self), maxcount
10341 );
10342 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010343 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010344 }
10345
10346 if (PyUnicode_READY(substring) == -1)
10347 return NULL;
10348
10349 kind1 = PyUnicode_KIND(self);
10350 kind2 = PyUnicode_KIND(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010351 len1 = PyUnicode_GET_LENGTH(self);
10352 len2 = PyUnicode_GET_LENGTH(substring);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010353 if (kind1 < kind2 || len1 < len2) {
10354 out = PyList_New(1);
10355 if (out == NULL)
10356 return NULL;
10357 Py_INCREF(self);
10358 PyList_SET_ITEM(out, 0, self);
10359 return out;
10360 }
10361 buf1 = PyUnicode_DATA(self);
10362 buf2 = PyUnicode_DATA(substring);
10363 if (kind2 != kind1) {
10364 buf2 = _PyUnicode_AsKind(substring, kind1);
10365 if (!buf2)
10366 return NULL;
10367 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010368
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010369 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010370 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010371 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10372 out = asciilib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010373 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010374 else
10375 out = ucs1lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010376 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010377 break;
10378 case PyUnicode_2BYTE_KIND:
10379 out = ucs2lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010380 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010381 break;
10382 case PyUnicode_4BYTE_KIND:
10383 out = ucs4lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010384 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010385 break;
10386 default:
10387 out = NULL;
10388 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010389 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010390 PyMem_Free(buf2);
10391 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010392}
10393
Alexander Belopolsky40018472011-02-26 01:02:56 +000010394static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010395rsplit(PyObject *self,
10396 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010397 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010398{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010399 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010400 void *buf1, *buf2;
10401 Py_ssize_t len1, len2;
10402 PyObject* out;
10403
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010404 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010405 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010406
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010407 if (PyUnicode_READY(self) == -1)
10408 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010409
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010410 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010411 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010412 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010413 if (PyUnicode_IS_ASCII(self))
10414 return asciilib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010415 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010416 PyUnicode_GET_LENGTH(self), maxcount
10417 );
10418 else
10419 return ucs1lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010420 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010421 PyUnicode_GET_LENGTH(self), maxcount
10422 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010423 case PyUnicode_2BYTE_KIND:
10424 return ucs2lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010425 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010426 PyUnicode_GET_LENGTH(self), maxcount
10427 );
10428 case PyUnicode_4BYTE_KIND:
10429 return ucs4lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010430 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010431 PyUnicode_GET_LENGTH(self), maxcount
10432 );
10433 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010434 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010435 }
10436
10437 if (PyUnicode_READY(substring) == -1)
10438 return NULL;
10439
10440 kind1 = PyUnicode_KIND(self);
10441 kind2 = PyUnicode_KIND(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010442 len1 = PyUnicode_GET_LENGTH(self);
10443 len2 = PyUnicode_GET_LENGTH(substring);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010444 if (kind1 < kind2 || len1 < len2) {
10445 out = PyList_New(1);
10446 if (out == NULL)
10447 return NULL;
10448 Py_INCREF(self);
10449 PyList_SET_ITEM(out, 0, self);
10450 return out;
10451 }
10452 buf1 = PyUnicode_DATA(self);
10453 buf2 = PyUnicode_DATA(substring);
10454 if (kind2 != kind1) {
10455 buf2 = _PyUnicode_AsKind(substring, kind1);
10456 if (!buf2)
10457 return NULL;
10458 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010459
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010460 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010461 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010462 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10463 out = asciilib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010464 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010465 else
10466 out = ucs1lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010467 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010468 break;
10469 case PyUnicode_2BYTE_KIND:
10470 out = ucs2lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010471 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010472 break;
10473 case PyUnicode_4BYTE_KIND:
10474 out = ucs4lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010475 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010476 break;
10477 default:
10478 out = NULL;
10479 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010480 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010481 PyMem_Free(buf2);
10482 return out;
10483}
10484
10485static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010486anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1,
10487 PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010488{
Benjamin Petersonead6b532011-12-20 17:23:42 -060010489 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010490 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010491 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
10492 return asciilib_find(buf1, len1, buf2, len2, offset);
10493 else
10494 return ucs1lib_find(buf1, len1, buf2, len2, offset);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010495 case PyUnicode_2BYTE_KIND:
10496 return ucs2lib_find(buf1, len1, buf2, len2, offset);
10497 case PyUnicode_4BYTE_KIND:
10498 return ucs4lib_find(buf1, len1, buf2, len2, offset);
10499 }
Barry Warsawb2e57942017-09-14 18:13:16 -070010500 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010501}
10502
10503static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010504anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen,
10505 PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010506{
Benjamin Petersonc0b95d12011-12-20 17:24:05 -060010507 switch (kind) {
10508 case PyUnicode_1BYTE_KIND:
10509 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
10510 return asciilib_count(sbuf, slen, buf1, len1, maxcount);
10511 else
10512 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
10513 case PyUnicode_2BYTE_KIND:
10514 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
10515 case PyUnicode_4BYTE_KIND:
10516 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
10517 }
Barry Warsawb2e57942017-09-14 18:13:16 -070010518 Py_UNREACHABLE();
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010519}
10520
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010521static void
10522replace_1char_inplace(PyObject *u, Py_ssize_t pos,
10523 Py_UCS4 u1, Py_UCS4 u2, Py_ssize_t maxcount)
10524{
10525 int kind = PyUnicode_KIND(u);
10526 void *data = PyUnicode_DATA(u);
10527 Py_ssize_t len = PyUnicode_GET_LENGTH(u);
10528 if (kind == PyUnicode_1BYTE_KIND) {
10529 ucs1lib_replace_1char_inplace((Py_UCS1 *)data + pos,
10530 (Py_UCS1 *)data + len,
10531 u1, u2, maxcount);
10532 }
10533 else if (kind == PyUnicode_2BYTE_KIND) {
10534 ucs2lib_replace_1char_inplace((Py_UCS2 *)data + pos,
10535 (Py_UCS2 *)data + len,
10536 u1, u2, maxcount);
10537 }
10538 else {
10539 assert(kind == PyUnicode_4BYTE_KIND);
10540 ucs4lib_replace_1char_inplace((Py_UCS4 *)data + pos,
10541 (Py_UCS4 *)data + len,
10542 u1, u2, maxcount);
10543 }
10544}
10545
Alexander Belopolsky40018472011-02-26 01:02:56 +000010546static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010547replace(PyObject *self, PyObject *str1,
10548 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010549{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010550 PyObject *u;
10551 char *sbuf = PyUnicode_DATA(self);
10552 char *buf1 = PyUnicode_DATA(str1);
10553 char *buf2 = PyUnicode_DATA(str2);
10554 int srelease = 0, release1 = 0, release2 = 0;
10555 int skind = PyUnicode_KIND(self);
10556 int kind1 = PyUnicode_KIND(str1);
10557 int kind2 = PyUnicode_KIND(str2);
10558 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
10559 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
10560 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010561 int mayshrink;
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010562 Py_UCS4 maxchar, maxchar_str1, maxchar_str2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010563
10564 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010565 maxcount = PY_SSIZE_T_MAX;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010566 else if (maxcount == 0 || slen == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010567 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010568
Victor Stinner59de0ee2011-10-07 10:01:28 +020010569 if (str1 == str2)
10570 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010571
Victor Stinner49a0a212011-10-12 23:46:10 +020010572 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010573 maxchar_str1 = PyUnicode_MAX_CHAR_VALUE(str1);
10574 if (maxchar < maxchar_str1)
10575 /* substring too wide to be present */
10576 goto nothing;
Victor Stinner49a0a212011-10-12 23:46:10 +020010577 maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
10578 /* Replacing str1 with str2 may cause a maxchar reduction in the
10579 result string. */
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010580 mayshrink = (maxchar_str2 < maxchar_str1) && (maxchar == maxchar_str1);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010581 maxchar = Py_MAX(maxchar, maxchar_str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010582
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010583 if (len1 == len2) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010584 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010585 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010586 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010587 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010588 /* replace characters */
Victor Stinner49a0a212011-10-12 23:46:10 +020010589 Py_UCS4 u1, u2;
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010590 Py_ssize_t pos;
Victor Stinnerf6441102011-12-18 02:43:08 +010010591
Victor Stinner69ed0f42013-04-09 21:48:24 +020010592 u1 = PyUnicode_READ(kind1, buf1, 0);
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010593 pos = findchar(sbuf, skind, slen, u1, 1);
Victor Stinnerf6441102011-12-18 02:43:08 +010010594 if (pos < 0)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010595 goto nothing;
Victor Stinner69ed0f42013-04-09 21:48:24 +020010596 u2 = PyUnicode_READ(kind2, buf2, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010597 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010598 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010599 goto error;
Victor Stinnerf6441102011-12-18 02:43:08 +010010600
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010601 _PyUnicode_FastCopyCharacters(u, 0, self, 0, slen);
10602 replace_1char_inplace(u, pos, u1, u2, maxcount);
Victor Stinner49a0a212011-10-12 23:46:10 +020010603 }
10604 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010605 int rkind = skind;
10606 char *res;
Victor Stinnerf6441102011-12-18 02:43:08 +010010607 Py_ssize_t i;
Victor Stinner25a4b292011-10-06 12:31:55 +020010608
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010609 if (kind1 < rkind) {
10610 /* widen substring */
10611 buf1 = _PyUnicode_AsKind(str1, rkind);
10612 if (!buf1) goto error;
10613 release1 = 1;
10614 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010615 i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010616 if (i < 0)
10617 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010618 if (rkind > kind2) {
10619 /* widen replacement */
10620 buf2 = _PyUnicode_AsKind(str2, rkind);
10621 if (!buf2) goto error;
10622 release2 = 1;
10623 }
10624 else if (rkind < kind2) {
10625 /* widen self and buf1 */
10626 rkind = kind2;
10627 if (release1) PyMem_Free(buf1);
Antoine Pitrou6d5ad222012-11-17 23:28:17 +010010628 release1 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010629 sbuf = _PyUnicode_AsKind(self, rkind);
10630 if (!sbuf) goto error;
10631 srelease = 1;
10632 buf1 = _PyUnicode_AsKind(str1, rkind);
10633 if (!buf1) goto error;
10634 release1 = 1;
10635 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010636 u = PyUnicode_New(slen, maxchar);
10637 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010638 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010639 assert(PyUnicode_KIND(u) == rkind);
10640 res = PyUnicode_DATA(u);
Victor Stinner25a4b292011-10-06 12:31:55 +020010641
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010642 memcpy(res, sbuf, rkind * slen);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010643 /* change everything in-place, starting with this one */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010644 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010645 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010646 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010647 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010648
10649 while ( --maxcount > 0) {
Victor Stinnerc3cec782011-10-05 21:24:08 +020010650 i = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010651 sbuf+rkind*i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010652 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010653 if (i == -1)
10654 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010655 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010656 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010657 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010658 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010659 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010660 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010661 }
10662 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010663 Py_ssize_t n, i, j, ires;
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010664 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010665 int rkind = skind;
10666 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010667
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010668 if (kind1 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010669 /* widen substring */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010670 buf1 = _PyUnicode_AsKind(str1, rkind);
10671 if (!buf1) goto error;
10672 release1 = 1;
10673 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010674 n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010675 if (n == 0)
10676 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010677 if (kind2 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010678 /* widen replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010679 buf2 = _PyUnicode_AsKind(str2, rkind);
10680 if (!buf2) goto error;
10681 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010682 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010683 else if (kind2 > rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010684 /* widen self and buf1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010685 rkind = kind2;
10686 sbuf = _PyUnicode_AsKind(self, rkind);
10687 if (!sbuf) goto error;
10688 srelease = 1;
10689 if (release1) PyMem_Free(buf1);
Antoine Pitrou6d5ad222012-11-17 23:28:17 +010010690 release1 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010691 buf1 = _PyUnicode_AsKind(str1, rkind);
10692 if (!buf1) goto error;
10693 release1 = 1;
10694 }
10695 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10696 PyUnicode_GET_LENGTH(str1))); */
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010697 if (len1 < len2 && len2 - len1 > (PY_SSIZE_T_MAX - slen) / n) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010698 PyErr_SetString(PyExc_OverflowError,
10699 "replace string is too long");
10700 goto error;
10701 }
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010702 new_size = slen + n * (len2 - len1);
Victor Stinner49a0a212011-10-12 23:46:10 +020010703 if (new_size == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020010704 _Py_INCREF_UNICODE_EMPTY();
10705 if (!unicode_empty)
10706 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010707 u = unicode_empty;
10708 goto done;
10709 }
Xiang Zhangb0541f42017-01-10 10:52:00 +080010710 if (new_size > (PY_SSIZE_T_MAX / rkind)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010711 PyErr_SetString(PyExc_OverflowError,
10712 "replace string is too long");
10713 goto error;
10714 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010715 u = PyUnicode_New(new_size, maxchar);
10716 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010717 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010718 assert(PyUnicode_KIND(u) == rkind);
10719 res = PyUnicode_DATA(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010720 ires = i = 0;
10721 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010722 while (n-- > 0) {
10723 /* look for next match */
Victor Stinnerc3cec782011-10-05 21:24:08 +020010724 j = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010725 sbuf + rkind * i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010726 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010727 if (j == -1)
10728 break;
10729 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010730 /* copy unchanged part [i:j] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010731 memcpy(res + rkind * ires,
10732 sbuf + rkind * i,
10733 rkind * (j-i));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010734 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010735 }
10736 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010737 if (len2 > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010738 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010739 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010740 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010741 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010742 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010743 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010744 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010745 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010746 /* copy tail [i:] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010747 memcpy(res + rkind * ires,
10748 sbuf + rkind * i,
10749 rkind * (slen-i));
Victor Stinner49a0a212011-10-12 23:46:10 +020010750 }
10751 else {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010752 /* interleave */
10753 while (n > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010754 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010755 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010756 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010757 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010758 if (--n <= 0)
10759 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010760 memcpy(res + rkind * ires,
10761 sbuf + rkind * i,
10762 rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010763 ires++;
10764 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010765 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010766 memcpy(res + rkind * ires,
10767 sbuf + rkind * i,
10768 rkind * (slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +000010769 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010770 }
10771
10772 if (mayshrink) {
Victor Stinner25a4b292011-10-06 12:31:55 +020010773 unicode_adjust_maxchar(&u);
10774 if (u == NULL)
10775 goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010776 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010777
10778 done:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010779 if (srelease)
10780 PyMem_FREE(sbuf);
10781 if (release1)
10782 PyMem_FREE(buf1);
10783 if (release2)
10784 PyMem_FREE(buf2);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010785 assert(_PyUnicode_CheckConsistency(u, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010786 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010787
Benjamin Peterson29060642009-01-31 22:14:21 +000010788 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +000010789 /* nothing to replace; return original string (when possible) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010790 if (srelease)
10791 PyMem_FREE(sbuf);
10792 if (release1)
10793 PyMem_FREE(buf1);
10794 if (release2)
10795 PyMem_FREE(buf2);
Victor Stinnerc4b49542011-12-11 22:44:26 +010010796 return unicode_result_unchanged(self);
10797
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010798 error:
10799 if (srelease && sbuf)
10800 PyMem_FREE(sbuf);
10801 if (release1 && buf1)
10802 PyMem_FREE(buf1);
10803 if (release2 && buf2)
10804 PyMem_FREE(buf2);
10805 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010806}
10807
10808/* --- Unicode Object Methods --------------------------------------------- */
10809
INADA Naoki3ae20562017-01-16 20:41:20 +090010810/*[clinic input]
10811str.title as unicode_title
Guido van Rossumd57fd912000-03-10 22:53:23 +000010812
INADA Naoki3ae20562017-01-16 20:41:20 +090010813Return a version of the string where each word is titlecased.
10814
10815More specifically, words start with uppercased characters and all remaining
10816cased characters have lower case.
10817[clinic start generated code]*/
10818
10819static PyObject *
10820unicode_title_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090010821/*[clinic end generated code: output=c75ae03809574902 input=fa945d669b26e683]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000010822{
Benjamin Petersoneea48462012-01-16 14:28:50 -050010823 if (PyUnicode_READY(self) == -1)
10824 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010825 return case_operation(self, do_title);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010826}
10827
INADA Naoki3ae20562017-01-16 20:41:20 +090010828/*[clinic input]
10829str.capitalize as unicode_capitalize
Guido van Rossumd57fd912000-03-10 22:53:23 +000010830
INADA Naoki3ae20562017-01-16 20:41:20 +090010831Return a capitalized version of the string.
10832
10833More specifically, make the first character have upper case and the rest lower
10834case.
10835[clinic start generated code]*/
10836
10837static PyObject *
10838unicode_capitalize_impl(PyObject *self)
10839/*[clinic end generated code: output=e49a4c333cdb7667 input=f4cbf1016938da6d]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000010840{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010841 if (PyUnicode_READY(self) == -1)
10842 return NULL;
10843 if (PyUnicode_GET_LENGTH(self) == 0)
10844 return unicode_result_unchanged(self);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010845 return case_operation(self, do_capitalize);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010846}
10847
INADA Naoki3ae20562017-01-16 20:41:20 +090010848/*[clinic input]
10849str.casefold as unicode_casefold
10850
10851Return a version of the string suitable for caseless comparisons.
10852[clinic start generated code]*/
Benjamin Petersond5890c82012-01-14 13:23:30 -050010853
10854static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090010855unicode_casefold_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090010856/*[clinic end generated code: output=0120daf657ca40af input=384d66cc2ae30daf]*/
Benjamin Petersond5890c82012-01-14 13:23:30 -050010857{
10858 if (PyUnicode_READY(self) == -1)
10859 return NULL;
10860 if (PyUnicode_IS_ASCII(self))
10861 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010862 return case_operation(self, do_casefold);
Benjamin Petersond5890c82012-01-14 13:23:30 -050010863}
10864
10865
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010866/* Argument converter. Accepts a single Unicode character. */
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010867
10868static int
10869convert_uc(PyObject *obj, void *addr)
10870{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010871 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010872
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010873 if (!PyUnicode_Check(obj)) {
10874 PyErr_Format(PyExc_TypeError,
10875 "The fill character must be a unicode character, "
Victor Stinner998b8062018-09-12 00:23:25 +020010876 "not %.100s", Py_TYPE(obj)->tp_name);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010877 return 0;
10878 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010879 if (PyUnicode_READY(obj) < 0)
10880 return 0;
10881 if (PyUnicode_GET_LENGTH(obj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010882 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010883 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010884 return 0;
10885 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010886 *fillcharloc = PyUnicode_READ_CHAR(obj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010887 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010888}
10889
INADA Naoki3ae20562017-01-16 20:41:20 +090010890/*[clinic input]
10891str.center as unicode_center
10892
10893 width: Py_ssize_t
10894 fillchar: Py_UCS4 = ' '
10895 /
10896
10897Return a centered string of length width.
10898
10899Padding is done using the specified fill character (default is a space).
10900[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000010901
10902static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090010903unicode_center_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
10904/*[clinic end generated code: output=420c8859effc7c0c input=b42b247eb26e6519]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000010905{
Martin v. Löwis18e16552006-02-15 17:27:45 +000010906 Py_ssize_t marg, left;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010907
Benjamin Petersonbac79492012-01-14 13:34:47 -050010908 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010909 return NULL;
10910
Victor Stinnerc4b49542011-12-11 22:44:26 +010010911 if (PyUnicode_GET_LENGTH(self) >= width)
10912 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010913
Victor Stinnerc4b49542011-12-11 22:44:26 +010010914 marg = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010915 left = marg / 2 + (marg & width & 1);
10916
Victor Stinner9310abb2011-10-05 00:59:23 +020010917 return pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010918}
10919
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010920/* This function assumes that str1 and str2 are readied by the caller. */
10921
Marc-André Lemburge5034372000-08-08 08:04:29 +000010922static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010923unicode_compare(PyObject *str1, PyObject *str2)
Marc-André Lemburge5034372000-08-08 08:04:29 +000010924{
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010925#define COMPARE(TYPE1, TYPE2) \
10926 do { \
10927 TYPE1* p1 = (TYPE1 *)data1; \
10928 TYPE2* p2 = (TYPE2 *)data2; \
10929 TYPE1* end = p1 + len; \
10930 Py_UCS4 c1, c2; \
10931 for (; p1 != end; p1++, p2++) { \
10932 c1 = *p1; \
10933 c2 = *p2; \
10934 if (c1 != c2) \
10935 return (c1 < c2) ? -1 : 1; \
10936 } \
10937 } \
10938 while (0)
10939
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010940 int kind1, kind2;
10941 void *data1, *data2;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010942 Py_ssize_t len1, len2, len;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010943
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010944 kind1 = PyUnicode_KIND(str1);
10945 kind2 = PyUnicode_KIND(str2);
10946 data1 = PyUnicode_DATA(str1);
10947 data2 = PyUnicode_DATA(str2);
10948 len1 = PyUnicode_GET_LENGTH(str1);
10949 len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner770e19e2012-10-04 22:59:45 +020010950 len = Py_MIN(len1, len2);
Marc-André Lemburge5034372000-08-08 08:04:29 +000010951
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010952 switch(kind1) {
10953 case PyUnicode_1BYTE_KIND:
10954 {
10955 switch(kind2) {
10956 case PyUnicode_1BYTE_KIND:
10957 {
10958 int cmp = memcmp(data1, data2, len);
10959 /* normalize result of memcmp() into the range [-1; 1] */
10960 if (cmp < 0)
10961 return -1;
10962 if (cmp > 0)
10963 return 1;
10964 break;
Victor Stinner770e19e2012-10-04 22:59:45 +020010965 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010966 case PyUnicode_2BYTE_KIND:
10967 COMPARE(Py_UCS1, Py_UCS2);
10968 break;
10969 case PyUnicode_4BYTE_KIND:
10970 COMPARE(Py_UCS1, Py_UCS4);
10971 break;
10972 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010973 Py_UNREACHABLE();
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010974 }
10975 break;
10976 }
10977 case PyUnicode_2BYTE_KIND:
10978 {
10979 switch(kind2) {
10980 case PyUnicode_1BYTE_KIND:
10981 COMPARE(Py_UCS2, Py_UCS1);
10982 break;
10983 case PyUnicode_2BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020010984 {
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010985 COMPARE(Py_UCS2, Py_UCS2);
10986 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020010987 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010988 case PyUnicode_4BYTE_KIND:
10989 COMPARE(Py_UCS2, Py_UCS4);
10990 break;
10991 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010992 Py_UNREACHABLE();
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010993 }
10994 break;
10995 }
10996 case PyUnicode_4BYTE_KIND:
10997 {
10998 switch(kind2) {
10999 case PyUnicode_1BYTE_KIND:
11000 COMPARE(Py_UCS4, Py_UCS1);
11001 break;
11002 case PyUnicode_2BYTE_KIND:
11003 COMPARE(Py_UCS4, Py_UCS2);
11004 break;
11005 case PyUnicode_4BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020011006 {
11007#if defined(HAVE_WMEMCMP) && SIZEOF_WCHAR_T == 4
11008 int cmp = wmemcmp((wchar_t *)data1, (wchar_t *)data2, len);
11009 /* normalize result of wmemcmp() into the range [-1; 1] */
11010 if (cmp < 0)
11011 return -1;
11012 if (cmp > 0)
11013 return 1;
11014#else
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011015 COMPARE(Py_UCS4, Py_UCS4);
Victor Stinnercd777ea2013-04-08 22:43:44 +020011016#endif
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011017 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020011018 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011019 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011020 Py_UNREACHABLE();
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011021 }
11022 break;
11023 }
11024 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011025 Py_UNREACHABLE();
Marc-André Lemburge5034372000-08-08 08:04:29 +000011026 }
11027
Victor Stinner770e19e2012-10-04 22:59:45 +020011028 if (len1 == len2)
11029 return 0;
11030 if (len1 < len2)
11031 return -1;
11032 else
11033 return 1;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011034
11035#undef COMPARE
Marc-André Lemburge5034372000-08-08 08:04:29 +000011036}
11037
Benjamin Peterson621b4302016-09-09 13:54:34 -070011038static int
Victor Stinnere5567ad2012-10-23 02:48:49 +020011039unicode_compare_eq(PyObject *str1, PyObject *str2)
11040{
11041 int kind;
11042 void *data1, *data2;
11043 Py_ssize_t len;
11044 int cmp;
11045
Victor Stinnere5567ad2012-10-23 02:48:49 +020011046 len = PyUnicode_GET_LENGTH(str1);
11047 if (PyUnicode_GET_LENGTH(str2) != len)
11048 return 0;
11049 kind = PyUnicode_KIND(str1);
11050 if (PyUnicode_KIND(str2) != kind)
11051 return 0;
11052 data1 = PyUnicode_DATA(str1);
11053 data2 = PyUnicode_DATA(str2);
11054
11055 cmp = memcmp(data1, data2, len * kind);
11056 return (cmp == 0);
11057}
11058
11059
Alexander Belopolsky40018472011-02-26 01:02:56 +000011060int
11061PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011062{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011063 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
11064 if (PyUnicode_READY(left) == -1 ||
11065 PyUnicode_READY(right) == -1)
11066 return -1;
Victor Stinnerf0c7b2a2013-11-04 11:27:14 +010011067
11068 /* a string is equal to itself */
11069 if (left == right)
11070 return 0;
11071
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011072 return unicode_compare(left, right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011073 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +000011074 PyErr_Format(PyExc_TypeError,
11075 "Can't compare %.100s and %.100s",
11076 left->ob_type->tp_name,
11077 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011078 return -1;
11079}
11080
Martin v. Löwis5b222132007-06-10 09:51:05 +000011081int
11082PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
11083{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011084 Py_ssize_t i;
11085 int kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011086 Py_UCS4 chr;
Serhiy Storchaka419967b2016-12-06 00:13:34 +020011087 const unsigned char *ustr = (const unsigned char *)str;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011088
Victor Stinner910337b2011-10-03 03:20:16 +020011089 assert(_PyUnicode_CHECK(uni));
Serhiy Storchaka419967b2016-12-06 00:13:34 +020011090 if (!PyUnicode_IS_READY(uni)) {
11091 const wchar_t *ws = _PyUnicode_WSTR(uni);
11092 /* Compare Unicode string and source character set string */
11093 for (i = 0; (chr = ws[i]) && ustr[i]; i++) {
11094 if (chr != ustr[i])
11095 return (chr < ustr[i]) ? -1 : 1;
11096 }
11097 /* This check keeps Python strings that end in '\0' from comparing equal
11098 to C strings identical up to that point. */
11099 if (_PyUnicode_WSTR_LENGTH(uni) != i || chr)
11100 return 1; /* uni is longer */
11101 if (ustr[i])
11102 return -1; /* str is longer */
11103 return 0;
11104 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011105 kind = PyUnicode_KIND(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010011106 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnera6b9b072013-10-30 18:27:13 +010011107 const void *data = PyUnicode_1BYTE_DATA(uni);
Victor Stinnere1b15922013-11-03 13:53:12 +010011108 size_t len1 = (size_t)PyUnicode_GET_LENGTH(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010011109 size_t len, len2 = strlen(str);
11110 int cmp;
11111
11112 len = Py_MIN(len1, len2);
11113 cmp = memcmp(data, str, len);
Victor Stinner21ea21e2013-11-04 11:28:26 +010011114 if (cmp != 0) {
11115 if (cmp < 0)
11116 return -1;
11117 else
11118 return 1;
11119 }
Victor Stinner602f7cf2013-10-29 23:31:50 +010011120 if (len1 > len2)
11121 return 1; /* uni is longer */
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011122 if (len1 < len2)
Victor Stinner602f7cf2013-10-29 23:31:50 +010011123 return -1; /* str is longer */
11124 return 0;
11125 }
11126 else {
11127 void *data = PyUnicode_DATA(uni);
11128 /* Compare Unicode string and source character set string */
11129 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
Victor Stinner12174a52014-08-15 23:17:38 +020011130 if (chr != (unsigned char)str[i])
Victor Stinner602f7cf2013-10-29 23:31:50 +010011131 return (chr < (unsigned char)(str[i])) ? -1 : 1;
11132 /* This check keeps Python strings that end in '\0' from comparing equal
11133 to C strings identical up to that point. */
11134 if (PyUnicode_GET_LENGTH(uni) != i || chr)
11135 return 1; /* uni is longer */
11136 if (str[i])
11137 return -1; /* str is longer */
11138 return 0;
11139 }
Martin v. Löwis5b222132007-06-10 09:51:05 +000011140}
11141
Serhiy Storchakaf4934ea2016-11-16 10:17:58 +020011142static int
11143non_ready_unicode_equal_to_ascii_string(PyObject *unicode, const char *str)
11144{
11145 size_t i, len;
11146 const wchar_t *p;
11147 len = (size_t)_PyUnicode_WSTR_LENGTH(unicode);
11148 if (strlen(str) != len)
11149 return 0;
11150 p = _PyUnicode_WSTR(unicode);
11151 assert(p);
11152 for (i = 0; i < len; i++) {
11153 unsigned char c = (unsigned char)str[i];
Serhiy Storchaka292dd1b2016-11-16 16:12:34 +020011154 if (c >= 128 || p[i] != (wchar_t)c)
Serhiy Storchakaf4934ea2016-11-16 10:17:58 +020011155 return 0;
11156 }
11157 return 1;
11158}
11159
11160int
11161_PyUnicode_EqualToASCIIString(PyObject *unicode, const char *str)
11162{
11163 size_t len;
11164 assert(_PyUnicode_CHECK(unicode));
Serhiy Storchakaa83a6a32016-11-16 20:02:44 +020011165 assert(str);
11166#ifndef NDEBUG
11167 for (const char *p = str; *p; p++) {
11168 assert((unsigned char)*p < 128);
11169 }
11170#endif
Serhiy Storchakaf4934ea2016-11-16 10:17:58 +020011171 if (PyUnicode_READY(unicode) == -1) {
11172 /* Memory error or bad data */
11173 PyErr_Clear();
11174 return non_ready_unicode_equal_to_ascii_string(unicode, str);
11175 }
11176 if (!PyUnicode_IS_ASCII(unicode))
11177 return 0;
11178 len = (size_t)PyUnicode_GET_LENGTH(unicode);
11179 return strlen(str) == len &&
11180 memcmp(PyUnicode_1BYTE_DATA(unicode), str, len) == 0;
11181}
11182
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011183int
11184_PyUnicode_EqualToASCIIId(PyObject *left, _Py_Identifier *right)
11185{
11186 PyObject *right_uni;
11187 Py_hash_t hash;
11188
11189 assert(_PyUnicode_CHECK(left));
11190 assert(right->string);
Serhiy Storchakaa83a6a32016-11-16 20:02:44 +020011191#ifndef NDEBUG
11192 for (const char *p = right->string; *p; p++) {
11193 assert((unsigned char)*p < 128);
11194 }
11195#endif
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011196
11197 if (PyUnicode_READY(left) == -1) {
11198 /* memory error or bad data */
11199 PyErr_Clear();
11200 return non_ready_unicode_equal_to_ascii_string(left, right->string);
11201 }
11202
11203 if (!PyUnicode_IS_ASCII(left))
11204 return 0;
11205
11206 right_uni = _PyUnicode_FromId(right); /* borrowed */
11207 if (right_uni == NULL) {
11208 /* memory error or bad data */
11209 PyErr_Clear();
11210 return _PyUnicode_EqualToASCIIString(left, right->string);
11211 }
11212
11213 if (left == right_uni)
11214 return 1;
11215
11216 if (PyUnicode_CHECK_INTERNED(left))
11217 return 0;
11218
INADA Naoki7cc95f52018-01-28 02:07:09 +090011219 assert(_PyUnicode_HASH(right_uni) != -1);
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011220 hash = _PyUnicode_HASH(left);
11221 if (hash != -1 && hash != _PyUnicode_HASH(right_uni))
11222 return 0;
11223
11224 return unicode_compare_eq(left, right_uni);
11225}
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000011226
Alexander Belopolsky40018472011-02-26 01:02:56 +000011227PyObject *
11228PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011229{
11230 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011231
Victor Stinnere5567ad2012-10-23 02:48:49 +020011232 if (!PyUnicode_Check(left) || !PyUnicode_Check(right))
11233 Py_RETURN_NOTIMPLEMENTED;
11234
11235 if (PyUnicode_READY(left) == -1 ||
11236 PyUnicode_READY(right) == -1)
11237 return NULL;
11238
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010011239 if (left == right) {
11240 switch (op) {
11241 case Py_EQ:
11242 case Py_LE:
11243 case Py_GE:
11244 /* a string is equal to itself */
stratakise8b19652017-11-02 11:32:54 +010011245 Py_RETURN_TRUE;
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010011246 case Py_NE:
11247 case Py_LT:
11248 case Py_GT:
stratakise8b19652017-11-02 11:32:54 +010011249 Py_RETURN_FALSE;
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010011250 default:
11251 PyErr_BadArgument();
11252 return NULL;
11253 }
11254 }
11255 else if (op == Py_EQ || op == Py_NE) {
Victor Stinnere5567ad2012-10-23 02:48:49 +020011256 result = unicode_compare_eq(left, right);
Victor Stinnerc8bc5372013-11-04 11:08:10 +010011257 result ^= (op == Py_NE);
stratakise8b19652017-11-02 11:32:54 +010011258 return PyBool_FromLong(result);
Victor Stinnere5567ad2012-10-23 02:48:49 +020011259 }
11260 else {
Victor Stinner90db9c42012-10-04 21:53:50 +020011261 result = unicode_compare(left, right);
stratakise8b19652017-11-02 11:32:54 +010011262 Py_RETURN_RICHCOMPARE(result, 0, op);
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011263 }
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011264}
11265
Alexander Belopolsky40018472011-02-26 01:02:56 +000011266int
Raymond Hettingerac2ef652015-07-04 16:04:44 -070011267_PyUnicode_EQ(PyObject *aa, PyObject *bb)
11268{
11269 return unicode_eq(aa, bb);
11270}
11271
11272int
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011273PyUnicode_Contains(PyObject *str, PyObject *substr)
Guido van Rossum403d68b2000-03-13 15:55:09 +000011274{
Victor Stinner77282cb2013-04-14 19:22:47 +020011275 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011276 void *buf1, *buf2;
11277 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011278 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011279
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011280 if (!PyUnicode_Check(substr)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011281 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020011282 "'in <string>' requires string as left operand, not %.100s",
11283 Py_TYPE(substr)->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011284 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011285 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011286 if (PyUnicode_READY(substr) == -1)
Thomas Wouters477c8d52006-05-27 19:21:47 +000011287 return -1;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011288 if (ensure_unicode(str) < 0)
11289 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011290
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011291 kind1 = PyUnicode_KIND(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011292 kind2 = PyUnicode_KIND(substr);
11293 if (kind1 < kind2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011294 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011295 len1 = PyUnicode_GET_LENGTH(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011296 len2 = PyUnicode_GET_LENGTH(substr);
11297 if (len1 < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011298 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011299 buf1 = PyUnicode_DATA(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011300 buf2 = PyUnicode_DATA(substr);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011301 if (len2 == 1) {
11302 Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
11303 result = findchar((const char *)buf1, kind1, len1, ch, 1) != -1;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011304 return result;
11305 }
11306 if (kind2 != kind1) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011307 buf2 = _PyUnicode_AsKind(substr, kind1);
11308 if (!buf2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011309 return -1;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011310 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011311
Victor Stinner77282cb2013-04-14 19:22:47 +020011312 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011313 case PyUnicode_1BYTE_KIND:
11314 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
11315 break;
11316 case PyUnicode_2BYTE_KIND:
11317 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
11318 break;
11319 case PyUnicode_4BYTE_KIND:
11320 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
11321 break;
11322 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011323 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011324 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011325
Victor Stinner77282cb2013-04-14 19:22:47 +020011326 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011327 PyMem_Free(buf2);
11328
Guido van Rossum403d68b2000-03-13 15:55:09 +000011329 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011330}
11331
Guido van Rossumd57fd912000-03-10 22:53:23 +000011332/* Concat to string or Unicode object giving a new Unicode object. */
11333
Alexander Belopolsky40018472011-02-26 01:02:56 +000011334PyObject *
11335PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011336{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011337 PyObject *result;
Victor Stinner127226b2011-10-13 01:12:34 +020011338 Py_UCS4 maxchar, maxchar2;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011339 Py_ssize_t left_len, right_len, new_len;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011340
Serhiy Storchaka004e03f2017-03-19 19:38:42 +020011341 if (ensure_unicode(left) < 0)
11342 return NULL;
11343
11344 if (!PyUnicode_Check(right)) {
11345 PyErr_Format(PyExc_TypeError,
11346 "can only concatenate str (not \"%.200s\") to str",
11347 right->ob_type->tp_name);
11348 return NULL;
11349 }
11350 if (PyUnicode_READY(right) < 0)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011351 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011352
11353 /* Shortcuts */
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011354 if (left == unicode_empty)
11355 return PyUnicode_FromObject(right);
11356 if (right == unicode_empty)
11357 return PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011358
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011359 left_len = PyUnicode_GET_LENGTH(left);
11360 right_len = PyUnicode_GET_LENGTH(right);
11361 if (left_len > PY_SSIZE_T_MAX - right_len) {
Victor Stinner488fa492011-12-12 00:01:39 +010011362 PyErr_SetString(PyExc_OverflowError,
11363 "strings are too large to concat");
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011364 return NULL;
Victor Stinner488fa492011-12-12 00:01:39 +010011365 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011366 new_len = left_len + right_len;
Victor Stinner488fa492011-12-12 00:01:39 +010011367
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011368 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11369 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Benjamin Peterson7e303732013-06-10 09:19:46 -070011370 maxchar = Py_MAX(maxchar, maxchar2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011371
Guido van Rossumd57fd912000-03-10 22:53:23 +000011372 /* Concat the two Unicode strings */
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011373 result = PyUnicode_New(new_len, maxchar);
11374 if (result == NULL)
11375 return NULL;
11376 _PyUnicode_FastCopyCharacters(result, 0, left, 0, left_len);
11377 _PyUnicode_FastCopyCharacters(result, left_len, right, 0, right_len);
11378 assert(_PyUnicode_CheckConsistency(result, 1));
11379 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011380}
11381
Walter Dörwald1ab83302007-05-18 17:15:44 +000011382void
Victor Stinner23e56682011-10-03 03:54:37 +020011383PyUnicode_Append(PyObject **p_left, PyObject *right)
Walter Dörwald1ab83302007-05-18 17:15:44 +000011384{
Victor Stinner23e56682011-10-03 03:54:37 +020011385 PyObject *left, *res;
Victor Stinner488fa492011-12-12 00:01:39 +010011386 Py_UCS4 maxchar, maxchar2;
11387 Py_ssize_t left_len, right_len, new_len;
Victor Stinner23e56682011-10-03 03:54:37 +020011388
11389 if (p_left == NULL) {
11390 if (!PyErr_Occurred())
11391 PyErr_BadInternalCall();
Benjamin Peterson14339b62009-01-31 16:36:08 +000011392 return;
11393 }
Victor Stinner23e56682011-10-03 03:54:37 +020011394 left = *p_left;
Victor Stinnerf0335102013-04-14 19:13:03 +020011395 if (right == NULL || left == NULL
11396 || !PyUnicode_Check(left) || !PyUnicode_Check(right)) {
Victor Stinner23e56682011-10-03 03:54:37 +020011397 if (!PyErr_Occurred())
11398 PyErr_BadInternalCall();
11399 goto error;
11400 }
11401
Benjamin Petersonbac79492012-01-14 13:34:47 -050011402 if (PyUnicode_READY(left) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011403 goto error;
Benjamin Petersonbac79492012-01-14 13:34:47 -050011404 if (PyUnicode_READY(right) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011405 goto error;
11406
Victor Stinner488fa492011-12-12 00:01:39 +010011407 /* Shortcuts */
11408 if (left == unicode_empty) {
11409 Py_DECREF(left);
11410 Py_INCREF(right);
11411 *p_left = right;
11412 return;
11413 }
11414 if (right == unicode_empty)
11415 return;
11416
11417 left_len = PyUnicode_GET_LENGTH(left);
11418 right_len = PyUnicode_GET_LENGTH(right);
11419 if (left_len > PY_SSIZE_T_MAX - right_len) {
11420 PyErr_SetString(PyExc_OverflowError,
11421 "strings are too large to concat");
11422 goto error;
11423 }
11424 new_len = left_len + right_len;
11425
11426 if (unicode_modifiable(left)
11427 && PyUnicode_CheckExact(right)
11428 && PyUnicode_KIND(right) <= PyUnicode_KIND(left)
Victor Stinnerb0923652011-10-04 01:17:31 +020011429 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
11430 to change the structure size, but characters are stored just after
Georg Brandl7597add2011-10-05 16:36:47 +020011431 the structure, and so it requires to move all characters which is
Victor Stinnerb0923652011-10-04 01:17:31 +020011432 not so different than duplicating the string. */
Victor Stinner488fa492011-12-12 00:01:39 +010011433 && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
11434 {
11435 /* append inplace */
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011436 if (unicode_resize(p_left, new_len) != 0)
Victor Stinner488fa492011-12-12 00:01:39 +010011437 goto error;
Victor Stinnerf0335102013-04-14 19:13:03 +020011438
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011439 /* copy 'right' into the newly allocated area of 'left' */
11440 _PyUnicode_FastCopyCharacters(*p_left, left_len, right, 0, right_len);
Victor Stinner23e56682011-10-03 03:54:37 +020011441 }
Victor Stinner488fa492011-12-12 00:01:39 +010011442 else {
11443 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11444 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Benjamin Peterson7e303732013-06-10 09:19:46 -070011445 maxchar = Py_MAX(maxchar, maxchar2);
Victor Stinner23e56682011-10-03 03:54:37 +020011446
Victor Stinner488fa492011-12-12 00:01:39 +010011447 /* Concat the two Unicode strings */
11448 res = PyUnicode_New(new_len, maxchar);
11449 if (res == NULL)
11450 goto error;
Victor Stinnerd3f08822012-05-29 12:57:52 +020011451 _PyUnicode_FastCopyCharacters(res, 0, left, 0, left_len);
11452 _PyUnicode_FastCopyCharacters(res, left_len, right, 0, right_len);
Victor Stinner488fa492011-12-12 00:01:39 +010011453 Py_DECREF(left);
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011454 *p_left = res;
Victor Stinner488fa492011-12-12 00:01:39 +010011455 }
11456 assert(_PyUnicode_CheckConsistency(*p_left, 1));
Victor Stinner23e56682011-10-03 03:54:37 +020011457 return;
11458
11459error:
Victor Stinner488fa492011-12-12 00:01:39 +010011460 Py_CLEAR(*p_left);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011461}
11462
11463void
11464PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
11465{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011466 PyUnicode_Append(pleft, right);
11467 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011468}
11469
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011470/*
11471Wraps stringlib_parse_args_finds() and additionally ensures that the
11472first argument is a unicode object.
11473*/
11474
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070011475static inline int
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011476parse_args_finds_unicode(const char * function_name, PyObject *args,
11477 PyObject **substring,
11478 Py_ssize_t *start, Py_ssize_t *end)
11479{
11480 if(stringlib_parse_args_finds(function_name, args, substring,
11481 start, end)) {
11482 if (ensure_unicode(*substring) < 0)
11483 return 0;
11484 return 1;
11485 }
11486 return 0;
11487}
11488
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011489PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011490 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011491\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011492Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011493string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011494interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011495
11496static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011497unicode_count(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011498{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011499 PyObject *substring = NULL; /* initialize to fix a compiler warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +000011500 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011501 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011502 PyObject *result;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011503 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011504 void *buf1, *buf2;
11505 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011506
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011507 if (!parse_args_finds_unicode("count", args, &substring, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000011508 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000011509
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011510 kind1 = PyUnicode_KIND(self);
11511 kind2 = PyUnicode_KIND(substring);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011512 if (kind1 < kind2)
Benjamin Petersonb63f49f2012-05-03 18:31:07 -040011513 return PyLong_FromLong(0);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011514
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011515 len1 = PyUnicode_GET_LENGTH(self);
11516 len2 = PyUnicode_GET_LENGTH(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011517 ADJUST_INDICES(start, end, len1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011518 if (end - start < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011519 return PyLong_FromLong(0);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011520
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011521 buf1 = PyUnicode_DATA(self);
11522 buf2 = PyUnicode_DATA(substring);
11523 if (kind2 != kind1) {
11524 buf2 = _PyUnicode_AsKind(substring, kind1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011525 if (!buf2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011526 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011527 }
11528 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011529 case PyUnicode_1BYTE_KIND:
11530 iresult = ucs1lib_count(
11531 ((Py_UCS1*)buf1) + start, end - start,
11532 buf2, len2, PY_SSIZE_T_MAX
11533 );
11534 break;
11535 case PyUnicode_2BYTE_KIND:
11536 iresult = ucs2lib_count(
11537 ((Py_UCS2*)buf1) + start, end - start,
11538 buf2, len2, PY_SSIZE_T_MAX
11539 );
11540 break;
11541 case PyUnicode_4BYTE_KIND:
11542 iresult = ucs4lib_count(
11543 ((Py_UCS4*)buf1) + start, end - start,
11544 buf2, len2, PY_SSIZE_T_MAX
11545 );
11546 break;
11547 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011548 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011549 }
11550
11551 result = PyLong_FromSsize_t(iresult);
11552
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011553 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011554 PyMem_Free(buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011555
Guido van Rossumd57fd912000-03-10 22:53:23 +000011556 return result;
11557}
11558
INADA Naoki3ae20562017-01-16 20:41:20 +090011559/*[clinic input]
11560str.encode as unicode_encode
11561
11562 encoding: str(c_default="NULL") = 'utf-8'
11563 The encoding in which to encode the string.
11564 errors: str(c_default="NULL") = 'strict'
11565 The error handling scheme to use for encoding errors.
11566 The default is 'strict' meaning that encoding errors raise a
11567 UnicodeEncodeError. Other possible values are 'ignore', 'replace' and
11568 'xmlcharrefreplace' as well as any other name registered with
11569 codecs.register_error that can handle UnicodeEncodeErrors.
11570
11571Encode the string using the codec registered for encoding.
11572[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011573
11574static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090011575unicode_encode_impl(PyObject *self, const char *encoding, const char *errors)
INADA Naoki15f94592017-01-16 21:49:13 +090011576/*[clinic end generated code: output=bf78b6e2a9470e3c input=f0a9eb293d08fe02]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011577{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011578 return PyUnicode_AsEncodedString(self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +000011579}
11580
INADA Naoki3ae20562017-01-16 20:41:20 +090011581/*[clinic input]
11582str.expandtabs as unicode_expandtabs
Guido van Rossumd57fd912000-03-10 22:53:23 +000011583
INADA Naoki3ae20562017-01-16 20:41:20 +090011584 tabsize: int = 8
11585
11586Return a copy where all tab characters are expanded using spaces.
11587
11588If tabsize is not given, a tab size of 8 characters is assumed.
11589[clinic start generated code]*/
11590
11591static PyObject *
11592unicode_expandtabs_impl(PyObject *self, int tabsize)
11593/*[clinic end generated code: output=3457c5dcee26928f input=8a01914034af4c85]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011594{
Antoine Pitroue71d5742011-10-04 15:55:09 +020011595 Py_ssize_t i, j, line_pos, src_len, incr;
11596 Py_UCS4 ch;
11597 PyObject *u;
11598 void *src_data, *dest_data;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011599 int kind;
Antoine Pitroue19aa382011-10-04 16:04:01 +020011600 int found;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011601
Antoine Pitrou22425222011-10-04 19:10:51 +020011602 if (PyUnicode_READY(self) == -1)
11603 return NULL;
11604
Thomas Wouters7e474022000-07-16 12:04:32 +000011605 /* First pass: determine size of output string */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011606 src_len = PyUnicode_GET_LENGTH(self);
11607 i = j = line_pos = 0;
11608 kind = PyUnicode_KIND(self);
11609 src_data = PyUnicode_DATA(self);
Antoine Pitroue19aa382011-10-04 16:04:01 +020011610 found = 0;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011611 for (; i < src_len; i++) {
11612 ch = PyUnicode_READ(kind, src_data, i);
11613 if (ch == '\t') {
Antoine Pitroue19aa382011-10-04 16:04:01 +020011614 found = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000011615 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011616 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
Benjamin Peterson29060642009-01-31 22:14:21 +000011617 if (j > PY_SSIZE_T_MAX - incr)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011618 goto overflow;
11619 line_pos += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011620 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011621 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011622 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011623 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000011624 if (j > PY_SSIZE_T_MAX - 1)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011625 goto overflow;
11626 line_pos++;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011627 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011628 if (ch == '\n' || ch == '\r')
11629 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011630 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011631 }
Victor Stinnerc4b49542011-12-11 22:44:26 +010011632 if (!found)
11633 return unicode_result_unchanged(self);
Guido van Rossumcd16bf62007-06-13 18:07:49 +000011634
Guido van Rossumd57fd912000-03-10 22:53:23 +000011635 /* Second pass: create output string and fill it */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011636 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011637 if (!u)
11638 return NULL;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011639 dest_data = PyUnicode_DATA(u);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011640
Antoine Pitroue71d5742011-10-04 15:55:09 +020011641 i = j = line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011642
Antoine Pitroue71d5742011-10-04 15:55:09 +020011643 for (; i < src_len; i++) {
11644 ch = PyUnicode_READ(kind, src_data, i);
11645 if (ch == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000011646 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011647 incr = tabsize - (line_pos % tabsize);
11648 line_pos += incr;
Victor Stinner59423e32018-11-26 13:40:01 +010011649 unicode_fill(kind, dest_data, ' ', j, incr);
Victor Stinnerda79e632012-02-22 13:37:04 +010011650 j += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011651 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011652 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011653 else {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011654 line_pos++;
11655 PyUnicode_WRITE(kind, dest_data, j, ch);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011656 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011657 if (ch == '\n' || ch == '\r')
11658 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011659 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011660 }
11661 assert (j == PyUnicode_GET_LENGTH(u));
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010011662 return unicode_result(u);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011663
Antoine Pitroue71d5742011-10-04 15:55:09 +020011664 overflow:
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011665 PyErr_SetString(PyExc_OverflowError, "new string is too long");
11666 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011667}
11668
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011669PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011670 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011671\n\
11672Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080011673such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011674arguments start and end are interpreted as in slice notation.\n\
11675\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011676Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011677
11678static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011679unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011680{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011681 /* initialize variables to prevent gcc warning */
11682 PyObject *substring = NULL;
11683 Py_ssize_t start = 0;
11684 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011685 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011686
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011687 if (!parse_args_finds_unicode("find", args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011688 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011689
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011690 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011691 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011692
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011693 result = any_find_slice(self, substring, start, end, 1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011694
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011695 if (result == -2)
11696 return NULL;
11697
Christian Heimes217cfd12007-12-02 14:31:20 +000011698 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011699}
11700
11701static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011702unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011703{
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011704 void *data;
11705 enum PyUnicode_Kind kind;
11706 Py_UCS4 ch;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011707
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +030011708 if (!PyUnicode_Check(self)) {
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011709 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011710 return NULL;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011711 }
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +030011712 if (PyUnicode_READY(self) == -1) {
11713 return NULL;
11714 }
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011715 if (index < 0 || index >= PyUnicode_GET_LENGTH(self)) {
11716 PyErr_SetString(PyExc_IndexError, "string index out of range");
11717 return NULL;
11718 }
11719 kind = PyUnicode_KIND(self);
11720 data = PyUnicode_DATA(self);
11721 ch = PyUnicode_READ(kind, data, index);
Victor Stinner985a82a2014-01-03 12:53:47 +010011722 return unicode_char(ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011723}
11724
Guido van Rossumc2504932007-09-18 19:42:40 +000011725/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +010011726 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +000011727static Py_hash_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011728unicode_hash(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011729{
Gregory P. Smith27cbcd62012-12-10 18:15:46 -080011730 Py_uhash_t x; /* Unsigned for defined overflow behavior. */
Guido van Rossumc2504932007-09-18 19:42:40 +000011731
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011732#ifdef Py_DEBUG
Benjamin Peterson69e97272012-02-21 11:08:50 -050011733 assert(_Py_HashSecret_Initialized);
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011734#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011735 if (_PyUnicode_HASH(self) != -1)
11736 return _PyUnicode_HASH(self);
11737 if (PyUnicode_READY(self) == -1)
11738 return -1;
animalizea1d14252019-01-02 20:16:06 +080011739
Christian Heimes985ecdc2013-11-20 11:46:18 +010011740 x = _Py_HashBytes(PyUnicode_DATA(self),
11741 PyUnicode_GET_LENGTH(self) * PyUnicode_KIND(self));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011742 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +000011743 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011744}
11745
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011746PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011747 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011748\n\
oldkaa0735f2018-02-02 16:52:55 +080011749Return the lowest index in S where substring sub is found,\n\
Lisa Roach43ba8862017-04-04 22:36:22 -070011750such that sub is contained within S[start:end]. Optional\n\
11751arguments start and end are interpreted as in slice notation.\n\
11752\n\
11753Raises ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011754
11755static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011756unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011757{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011758 /* initialize variables to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +000011759 Py_ssize_t result;
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011760 PyObject *substring = NULL;
11761 Py_ssize_t start = 0;
11762 Py_ssize_t end = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011763
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011764 if (!parse_args_finds_unicode("index", args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011765 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011766
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011767 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011768 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011769
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011770 result = any_find_slice(self, substring, start, end, 1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011771
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011772 if (result == -2)
11773 return NULL;
11774
Guido van Rossumd57fd912000-03-10 22:53:23 +000011775 if (result < 0) {
11776 PyErr_SetString(PyExc_ValueError, "substring not found");
11777 return NULL;
11778 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011779
Christian Heimes217cfd12007-12-02 14:31:20 +000011780 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011781}
11782
INADA Naoki3ae20562017-01-16 20:41:20 +090011783/*[clinic input]
INADA Naokia49ac992018-01-27 14:06:21 +090011784str.isascii as unicode_isascii
11785
11786Return True if all characters in the string are ASCII, False otherwise.
11787
11788ASCII characters have code points in the range U+0000-U+007F.
11789Empty string is ASCII too.
11790[clinic start generated code]*/
11791
11792static PyObject *
11793unicode_isascii_impl(PyObject *self)
11794/*[clinic end generated code: output=c5910d64b5a8003f input=5a43cbc6399621d5]*/
11795{
11796 if (PyUnicode_READY(self) == -1) {
11797 return NULL;
11798 }
11799 return PyBool_FromLong(PyUnicode_IS_ASCII(self));
11800}
11801
11802/*[clinic input]
INADA Naoki3ae20562017-01-16 20:41:20 +090011803str.islower as unicode_islower
Guido van Rossumd57fd912000-03-10 22:53:23 +000011804
INADA Naoki3ae20562017-01-16 20:41:20 +090011805Return True if the string is a lowercase string, False otherwise.
11806
11807A string is lowercase if all cased characters in the string are lowercase and
11808there is at least one cased character in the string.
11809[clinic start generated code]*/
11810
11811static PyObject *
11812unicode_islower_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011813/*[clinic end generated code: output=dbd41995bd005b81 input=acec65ac6821ae47]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011814{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011815 Py_ssize_t i, length;
11816 int kind;
11817 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011818 int cased;
11819
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011820 if (PyUnicode_READY(self) == -1)
11821 return NULL;
11822 length = PyUnicode_GET_LENGTH(self);
11823 kind = PyUnicode_KIND(self);
11824 data = PyUnicode_DATA(self);
11825
Guido van Rossumd57fd912000-03-10 22:53:23 +000011826 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011827 if (length == 1)
11828 return PyBool_FromLong(
11829 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011830
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011831 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011832 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011833 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011834
Guido van Rossumd57fd912000-03-10 22:53:23 +000011835 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011836 for (i = 0; i < length; i++) {
11837 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011838
Benjamin Peterson29060642009-01-31 22:14:21 +000011839 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011840 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000011841 else if (!cased && Py_UNICODE_ISLOWER(ch))
11842 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011843 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011844 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011845}
11846
INADA Naoki3ae20562017-01-16 20:41:20 +090011847/*[clinic input]
11848str.isupper as unicode_isupper
Guido van Rossumd57fd912000-03-10 22:53:23 +000011849
INADA Naoki3ae20562017-01-16 20:41:20 +090011850Return True if the string is an uppercase string, False otherwise.
11851
11852A string is uppercase if all cased characters in the string are uppercase and
11853there is at least one cased character in the string.
11854[clinic start generated code]*/
11855
11856static PyObject *
11857unicode_isupper_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011858/*[clinic end generated code: output=049209c8e7f15f59 input=e9b1feda5d17f2d3]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011859{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011860 Py_ssize_t i, length;
11861 int kind;
11862 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011863 int cased;
11864
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011865 if (PyUnicode_READY(self) == -1)
11866 return NULL;
11867 length = PyUnicode_GET_LENGTH(self);
11868 kind = PyUnicode_KIND(self);
11869 data = PyUnicode_DATA(self);
11870
Guido van Rossumd57fd912000-03-10 22:53:23 +000011871 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011872 if (length == 1)
11873 return PyBool_FromLong(
11874 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011875
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011876 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011877 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011878 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011879
Guido van Rossumd57fd912000-03-10 22:53:23 +000011880 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011881 for (i = 0; i < length; i++) {
11882 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011883
Benjamin Peterson29060642009-01-31 22:14:21 +000011884 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011885 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000011886 else if (!cased && Py_UNICODE_ISUPPER(ch))
11887 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011888 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011889 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011890}
11891
INADA Naoki3ae20562017-01-16 20:41:20 +090011892/*[clinic input]
11893str.istitle as unicode_istitle
Guido van Rossumd57fd912000-03-10 22:53:23 +000011894
INADA Naoki3ae20562017-01-16 20:41:20 +090011895Return True if the string is a title-cased string, False otherwise.
11896
11897In a title-cased string, upper- and title-case characters may only
11898follow uncased characters and lowercase characters only cased ones.
11899[clinic start generated code]*/
11900
11901static PyObject *
11902unicode_istitle_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011903/*[clinic end generated code: output=e9bf6eb91f5d3f0e input=98d32bd2e1f06f8c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011904{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011905 Py_ssize_t i, length;
11906 int kind;
11907 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011908 int cased, previous_is_cased;
11909
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011910 if (PyUnicode_READY(self) == -1)
11911 return NULL;
11912 length = PyUnicode_GET_LENGTH(self);
11913 kind = PyUnicode_KIND(self);
11914 data = PyUnicode_DATA(self);
11915
Guido van Rossumd57fd912000-03-10 22:53:23 +000011916 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011917 if (length == 1) {
11918 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11919 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
11920 (Py_UNICODE_ISUPPER(ch) != 0));
11921 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011922
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011923 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011924 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011925 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011926
Guido van Rossumd57fd912000-03-10 22:53:23 +000011927 cased = 0;
11928 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011929 for (i = 0; i < length; i++) {
11930 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011931
Benjamin Peterson29060642009-01-31 22:14:21 +000011932 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
11933 if (previous_is_cased)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011934 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000011935 previous_is_cased = 1;
11936 cased = 1;
11937 }
11938 else if (Py_UNICODE_ISLOWER(ch)) {
11939 if (!previous_is_cased)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011940 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000011941 previous_is_cased = 1;
11942 cased = 1;
11943 }
11944 else
11945 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011946 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011947 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011948}
11949
INADA Naoki3ae20562017-01-16 20:41:20 +090011950/*[clinic input]
11951str.isspace as unicode_isspace
Guido van Rossumd57fd912000-03-10 22:53:23 +000011952
INADA Naoki3ae20562017-01-16 20:41:20 +090011953Return True if the string is a whitespace string, False otherwise.
11954
11955A string is whitespace if all characters in the string are whitespace and there
11956is at least one character in the string.
11957[clinic start generated code]*/
11958
11959static PyObject *
11960unicode_isspace_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011961/*[clinic end generated code: output=163a63bfa08ac2b9 input=fe462cb74f8437d8]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011962{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011963 Py_ssize_t i, length;
11964 int kind;
11965 void *data;
11966
11967 if (PyUnicode_READY(self) == -1)
11968 return NULL;
11969 length = PyUnicode_GET_LENGTH(self);
11970 kind = PyUnicode_KIND(self);
11971 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011972
Guido van Rossumd57fd912000-03-10 22:53:23 +000011973 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011974 if (length == 1)
11975 return PyBool_FromLong(
11976 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011977
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011978 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011979 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011980 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011981
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011982 for (i = 0; i < length; i++) {
11983 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011984 if (!Py_UNICODE_ISSPACE(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011985 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011986 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011987 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011988}
11989
INADA Naoki3ae20562017-01-16 20:41:20 +090011990/*[clinic input]
11991str.isalpha as unicode_isalpha
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011992
INADA Naoki3ae20562017-01-16 20:41:20 +090011993Return True if the string is an alphabetic string, False otherwise.
11994
11995A string is alphabetic if all characters in the string are alphabetic and there
11996is at least one character in the string.
11997[clinic start generated code]*/
11998
11999static PyObject *
12000unicode_isalpha_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012001/*[clinic end generated code: output=cc81b9ac3883ec4f input=d0fd18a96cbca5eb]*/
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012002{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012003 Py_ssize_t i, length;
12004 int kind;
12005 void *data;
12006
12007 if (PyUnicode_READY(self) == -1)
12008 return NULL;
12009 length = PyUnicode_GET_LENGTH(self);
12010 kind = PyUnicode_KIND(self);
12011 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012012
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012013 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012014 if (length == 1)
12015 return PyBool_FromLong(
12016 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012017
12018 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012019 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012020 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012021
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012022 for (i = 0; i < length; i++) {
12023 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012024 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012025 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012026 Py_RETURN_TRUE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012027}
12028
INADA Naoki3ae20562017-01-16 20:41:20 +090012029/*[clinic input]
12030str.isalnum as unicode_isalnum
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012031
INADA Naoki3ae20562017-01-16 20:41:20 +090012032Return True if the string is an alpha-numeric string, False otherwise.
12033
12034A string is alpha-numeric if all characters in the string are alpha-numeric and
12035there is at least one character in the string.
12036[clinic start generated code]*/
12037
12038static PyObject *
12039unicode_isalnum_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012040/*[clinic end generated code: output=a5a23490ffc3660c input=5c6579bf2e04758c]*/
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012041{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012042 int kind;
12043 void *data;
12044 Py_ssize_t len, i;
12045
12046 if (PyUnicode_READY(self) == -1)
12047 return NULL;
12048
12049 kind = PyUnicode_KIND(self);
12050 data = PyUnicode_DATA(self);
12051 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012052
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012053 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012054 if (len == 1) {
12055 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
12056 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
12057 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012058
12059 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012060 if (len == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012061 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012062
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012063 for (i = 0; i < len; i++) {
12064 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030012065 if (!Py_UNICODE_ISALNUM(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012066 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012067 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012068 Py_RETURN_TRUE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012069}
12070
INADA Naoki3ae20562017-01-16 20:41:20 +090012071/*[clinic input]
12072str.isdecimal as unicode_isdecimal
Guido van Rossumd57fd912000-03-10 22:53:23 +000012073
INADA Naoki3ae20562017-01-16 20:41:20 +090012074Return True if the string is a decimal string, False otherwise.
12075
12076A string is a decimal string if all characters in the string are decimal and
12077there is at least one character in the string.
12078[clinic start generated code]*/
12079
12080static PyObject *
12081unicode_isdecimal_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012082/*[clinic end generated code: output=fb2dcdb62d3fc548 input=336bc97ab4c8268f]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012083{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012084 Py_ssize_t i, length;
12085 int kind;
12086 void *data;
12087
12088 if (PyUnicode_READY(self) == -1)
12089 return NULL;
12090 length = PyUnicode_GET_LENGTH(self);
12091 kind = PyUnicode_KIND(self);
12092 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012093
Guido van Rossumd57fd912000-03-10 22:53:23 +000012094 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012095 if (length == 1)
12096 return PyBool_FromLong(
12097 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012098
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012099 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012100 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012101 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012102
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012103 for (i = 0; i < length; i++) {
12104 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012105 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012106 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012107 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012108}
12109
INADA Naoki3ae20562017-01-16 20:41:20 +090012110/*[clinic input]
12111str.isdigit as unicode_isdigit
Guido van Rossumd57fd912000-03-10 22:53:23 +000012112
INADA Naoki3ae20562017-01-16 20:41:20 +090012113Return True if the string is a digit string, False otherwise.
12114
12115A string is a digit string if all characters in the string are digits and there
12116is at least one character in the string.
12117[clinic start generated code]*/
12118
12119static PyObject *
12120unicode_isdigit_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012121/*[clinic end generated code: output=10a6985311da6858 input=901116c31deeea4c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012122{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012123 Py_ssize_t i, length;
12124 int kind;
12125 void *data;
12126
12127 if (PyUnicode_READY(self) == -1)
12128 return NULL;
12129 length = PyUnicode_GET_LENGTH(self);
12130 kind = PyUnicode_KIND(self);
12131 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012132
Guido van Rossumd57fd912000-03-10 22:53:23 +000012133 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012134 if (length == 1) {
12135 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
12136 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
12137 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012138
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012139 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012140 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012141 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012142
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012143 for (i = 0; i < length; i++) {
12144 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012145 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012146 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012147 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012148}
12149
INADA Naoki3ae20562017-01-16 20:41:20 +090012150/*[clinic input]
12151str.isnumeric as unicode_isnumeric
Guido van Rossumd57fd912000-03-10 22:53:23 +000012152
INADA Naoki3ae20562017-01-16 20:41:20 +090012153Return True if the string is a numeric string, False otherwise.
12154
12155A string is numeric if all characters in the string are numeric and there is at
12156least one character in the string.
12157[clinic start generated code]*/
12158
12159static PyObject *
12160unicode_isnumeric_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012161/*[clinic end generated code: output=9172a32d9013051a input=722507db976f826c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012162{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012163 Py_ssize_t i, length;
12164 int kind;
12165 void *data;
12166
12167 if (PyUnicode_READY(self) == -1)
12168 return NULL;
12169 length = PyUnicode_GET_LENGTH(self);
12170 kind = PyUnicode_KIND(self);
12171 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012172
Guido van Rossumd57fd912000-03-10 22:53:23 +000012173 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012174 if (length == 1)
12175 return PyBool_FromLong(
12176 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012177
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012178 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012179 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012180 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012181
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012182 for (i = 0; i < length; i++) {
12183 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012184 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012185 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012186 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012187}
12188
Martin v. Löwis47383402007-08-15 07:32:56 +000012189int
12190PyUnicode_IsIdentifier(PyObject *self)
12191{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012192 int kind;
12193 void *data;
12194 Py_ssize_t i;
Ezio Melotti93e7afc2011-08-22 14:08:38 +030012195 Py_UCS4 first;
Martin v. Löwis47383402007-08-15 07:32:56 +000012196
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012197 if (PyUnicode_READY(self) == -1) {
12198 Py_FatalError("identifier not ready");
Benjamin Peterson29060642009-01-31 22:14:21 +000012199 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012200 }
12201
12202 /* Special case for empty strings */
12203 if (PyUnicode_GET_LENGTH(self) == 0)
12204 return 0;
12205 kind = PyUnicode_KIND(self);
12206 data = PyUnicode_DATA(self);
Martin v. Löwis47383402007-08-15 07:32:56 +000012207
12208 /* PEP 3131 says that the first character must be in
12209 XID_Start and subsequent characters in XID_Continue,
12210 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000012211 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000012212 letters, digits, underscore). However, given the current
12213 definition of XID_Start and XID_Continue, it is sufficient
12214 to check just for these, except that _ must be allowed
12215 as starting an identifier. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012216 first = PyUnicode_READ(kind, data, 0);
Benjamin Petersonf413b802011-08-12 22:17:18 -050012217 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
Martin v. Löwis47383402007-08-15 07:32:56 +000012218 return 0;
12219
Benjamin Peterson9c6e6a02011-09-28 08:09:05 -040012220 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012221 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000012222 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +000012223 return 1;
12224}
12225
INADA Naoki3ae20562017-01-16 20:41:20 +090012226/*[clinic input]
12227str.isidentifier as unicode_isidentifier
Martin v. Löwis47383402007-08-15 07:32:56 +000012228
INADA Naoki3ae20562017-01-16 20:41:20 +090012229Return True if the string is a valid Python identifier, False otherwise.
12230
Sanyam Khuranaffc5a142018-10-08 12:23:32 +053012231Call keyword.iskeyword(s) to test whether string s is a reserved identifier,
Emanuele Gaifasfc8205c2018-10-08 12:44:47 +020012232such as "def" or "class".
INADA Naoki3ae20562017-01-16 20:41:20 +090012233[clinic start generated code]*/
12234
12235static PyObject *
12236unicode_isidentifier_impl(PyObject *self)
Emanuele Gaifasfc8205c2018-10-08 12:44:47 +020012237/*[clinic end generated code: output=fe585a9666572905 input=2d807a104f21c0c5]*/
Martin v. Löwis47383402007-08-15 07:32:56 +000012238{
12239 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
12240}
12241
INADA Naoki3ae20562017-01-16 20:41:20 +090012242/*[clinic input]
12243str.isprintable as unicode_isprintable
Georg Brandl559e5d72008-06-11 18:37:52 +000012244
INADA Naoki3ae20562017-01-16 20:41:20 +090012245Return True if the string is printable, False otherwise.
12246
12247A string is printable if all of its characters are considered printable in
12248repr() or if it is empty.
12249[clinic start generated code]*/
12250
12251static PyObject *
12252unicode_isprintable_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012253/*[clinic end generated code: output=3ab9626cd32dd1a0 input=98a0e1c2c1813209]*/
Georg Brandl559e5d72008-06-11 18:37:52 +000012254{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012255 Py_ssize_t i, length;
12256 int kind;
12257 void *data;
12258
12259 if (PyUnicode_READY(self) == -1)
12260 return NULL;
12261 length = PyUnicode_GET_LENGTH(self);
12262 kind = PyUnicode_KIND(self);
12263 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000012264
12265 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012266 if (length == 1)
12267 return PyBool_FromLong(
12268 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000012269
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012270 for (i = 0; i < length; i++) {
12271 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000012272 Py_RETURN_FALSE;
12273 }
12274 }
12275 Py_RETURN_TRUE;
12276}
12277
INADA Naoki3ae20562017-01-16 20:41:20 +090012278/*[clinic input]
12279str.join as unicode_join
Guido van Rossumd57fd912000-03-10 22:53:23 +000012280
INADA Naoki3ae20562017-01-16 20:41:20 +090012281 iterable: object
12282 /
12283
12284Concatenate any number of strings.
12285
Martin Panter91a88662017-01-24 00:30:06 +000012286The string whose method is called is inserted in between each given string.
INADA Naoki3ae20562017-01-16 20:41:20 +090012287The result is returned as a new string.
12288
12289Example: '.'.join(['ab', 'pq', 'rs']) -> 'ab.pq.rs'
12290[clinic start generated code]*/
12291
12292static PyObject *
12293unicode_join(PyObject *self, PyObject *iterable)
Martin Panter91a88662017-01-24 00:30:06 +000012294/*[clinic end generated code: output=6857e7cecfe7bf98 input=2f70422bfb8fa189]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012295{
INADA Naoki3ae20562017-01-16 20:41:20 +090012296 return PyUnicode_Join(self, iterable);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012297}
12298
Martin v. Löwis18e16552006-02-15 17:27:45 +000012299static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012300unicode_length(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012301{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012302 if (PyUnicode_READY(self) == -1)
12303 return -1;
12304 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012305}
12306
INADA Naoki3ae20562017-01-16 20:41:20 +090012307/*[clinic input]
12308str.ljust as unicode_ljust
12309
12310 width: Py_ssize_t
12311 fillchar: Py_UCS4 = ' '
12312 /
12313
12314Return a left-justified string of length width.
12315
12316Padding is done using the specified fill character (default is a space).
12317[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012318
12319static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012320unicode_ljust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
12321/*[clinic end generated code: output=1cce0e0e0a0b84b3 input=3ab599e335e60a32]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012322{
Benjamin Petersonbac79492012-01-14 13:34:47 -050012323 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012324 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012325
Victor Stinnerc4b49542011-12-11 22:44:26 +010012326 if (PyUnicode_GET_LENGTH(self) >= width)
12327 return unicode_result_unchanged(self);
12328
12329 return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012330}
12331
INADA Naoki3ae20562017-01-16 20:41:20 +090012332/*[clinic input]
12333str.lower as unicode_lower
Guido van Rossumd57fd912000-03-10 22:53:23 +000012334
INADA Naoki3ae20562017-01-16 20:41:20 +090012335Return a copy of the string converted to lowercase.
12336[clinic start generated code]*/
12337
12338static PyObject *
12339unicode_lower_impl(PyObject *self)
12340/*[clinic end generated code: output=84ef9ed42efad663 input=60a2984b8beff23a]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012341{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050012342 if (PyUnicode_READY(self) == -1)
12343 return NULL;
12344 if (PyUnicode_IS_ASCII(self))
12345 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010012346 return case_operation(self, do_lower);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012347}
12348
/* Selectors for which end(s) of the string a strip operation trims. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Method names, indexed by the selectors above.  Both the strings and the
   table slots are const so the table cannot be modified. */
static const char * const stripfuncnames[] = {"lstrip", "rstrip", "strip"};

/* Method name for strip selector i (one of the three selectors above). */
#define STRIPNAME(i) (stripfuncnames[i])
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012357
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012358/* externally visible for str.strip(unicode) */
12359PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012360_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012361{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012362 void *data;
12363 int kind;
12364 Py_ssize_t i, j, len;
12365 BLOOM_MASK sepmask;
Victor Stinnerb3a60142013-04-09 22:19:21 +020012366 Py_ssize_t seplen;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012367
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012368 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
12369 return NULL;
12370
12371 kind = PyUnicode_KIND(self);
12372 data = PyUnicode_DATA(self);
12373 len = PyUnicode_GET_LENGTH(self);
Victor Stinnerb3a60142013-04-09 22:19:21 +020012374 seplen = PyUnicode_GET_LENGTH(sepobj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012375 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
12376 PyUnicode_DATA(sepobj),
Victor Stinnerb3a60142013-04-09 22:19:21 +020012377 seplen);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012378
Benjamin Peterson14339b62009-01-31 16:36:08 +000012379 i = 0;
12380 if (striptype != RIGHTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020012381 while (i < len) {
12382 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12383 if (!BLOOM(sepmask, ch))
12384 break;
12385 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12386 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000012387 i++;
12388 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012389 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012390
Benjamin Peterson14339b62009-01-31 16:36:08 +000012391 j = len;
12392 if (striptype != LEFTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020012393 j--;
12394 while (j >= i) {
12395 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12396 if (!BLOOM(sepmask, ch))
12397 break;
12398 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12399 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000012400 j--;
Victor Stinnerb3a60142013-04-09 22:19:21 +020012401 }
12402
Benjamin Peterson29060642009-01-31 22:14:21 +000012403 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012404 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012405
Victor Stinner7931d9a2011-11-04 00:22:48 +010012406 return PyUnicode_Substring(self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012407}
12408
12409PyObject*
12410PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
12411{
12412 unsigned char *data;
12413 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020012414 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012415
Victor Stinnerde636f32011-10-01 03:55:54 +020012416 if (PyUnicode_READY(self) == -1)
12417 return NULL;
12418
Victor Stinner684d5fd2012-05-03 02:32:34 +020012419 length = PyUnicode_GET_LENGTH(self);
12420 end = Py_MIN(end, length);
Victor Stinnerde636f32011-10-01 03:55:54 +020012421
Victor Stinner684d5fd2012-05-03 02:32:34 +020012422 if (start == 0 && end == length)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012423 return unicode_result_unchanged(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012424
Victor Stinnerde636f32011-10-01 03:55:54 +020012425 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020012426 PyErr_SetString(PyExc_IndexError, "string index out of range");
12427 return NULL;
12428 }
Serhiy Storchaka678db842013-01-26 12:16:36 +020012429 if (start >= length || end < start)
12430 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner12bab6d2011-10-01 01:53:49 +020012431
Victor Stinner684d5fd2012-05-03 02:32:34 +020012432 length = end - start;
Victor Stinnerb9275c12011-10-05 14:01:42 +020012433 if (PyUnicode_IS_ASCII(self)) {
Victor Stinnerb9275c12011-10-05 14:01:42 +020012434 data = PyUnicode_1BYTE_DATA(self);
Victor Stinnerd3f08822012-05-29 12:57:52 +020012435 return _PyUnicode_FromASCII((char*)(data + start), length);
Victor Stinnerb9275c12011-10-05 14:01:42 +020012436 }
12437 else {
12438 kind = PyUnicode_KIND(self);
12439 data = PyUnicode_1BYTE_DATA(self);
12440 return PyUnicode_FromKindAndData(kind,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012441 data + kind * start,
Victor Stinnerb9275c12011-10-05 14:01:42 +020012442 length);
12443 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012444}
Guido van Rossumd57fd912000-03-10 22:53:23 +000012445
12446static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012447do_strip(PyObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012448{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012449 Py_ssize_t len, i, j;
12450
12451 if (PyUnicode_READY(self) == -1)
12452 return NULL;
12453
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012454 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012455
Victor Stinnercc7af722013-04-09 22:39:24 +020012456 if (PyUnicode_IS_ASCII(self)) {
12457 Py_UCS1 *data = PyUnicode_1BYTE_DATA(self);
12458
12459 i = 0;
12460 if (striptype != RIGHTSTRIP) {
12461 while (i < len) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020012462 Py_UCS1 ch = data[i];
Victor Stinnercc7af722013-04-09 22:39:24 +020012463 if (!_Py_ascii_whitespace[ch])
12464 break;
12465 i++;
12466 }
12467 }
12468
12469 j = len;
12470 if (striptype != LEFTSTRIP) {
12471 j--;
12472 while (j >= i) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020012473 Py_UCS1 ch = data[j];
Victor Stinnercc7af722013-04-09 22:39:24 +020012474 if (!_Py_ascii_whitespace[ch])
12475 break;
12476 j--;
12477 }
12478 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012479 }
12480 }
Victor Stinnercc7af722013-04-09 22:39:24 +020012481 else {
12482 int kind = PyUnicode_KIND(self);
12483 void *data = PyUnicode_DATA(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012484
Victor Stinnercc7af722013-04-09 22:39:24 +020012485 i = 0;
12486 if (striptype != RIGHTSTRIP) {
12487 while (i < len) {
12488 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12489 if (!Py_UNICODE_ISSPACE(ch))
12490 break;
12491 i++;
12492 }
Victor Stinner9c79e412013-04-09 22:21:08 +020012493 }
Victor Stinnercc7af722013-04-09 22:39:24 +020012494
12495 j = len;
12496 if (striptype != LEFTSTRIP) {
12497 j--;
12498 while (j >= i) {
12499 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12500 if (!Py_UNICODE_ISSPACE(ch))
12501 break;
12502 j--;
12503 }
12504 j++;
12505 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012506 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012507
Victor Stinner7931d9a2011-11-04 00:22:48 +010012508 return PyUnicode_Substring(self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012509}
12510
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012511
12512static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012513do_argstrip(PyObject *self, int striptype, PyObject *sep)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012514{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012515 if (sep != NULL && sep != Py_None) {
12516 if (PyUnicode_Check(sep))
12517 return _PyUnicode_XStrip(self, striptype, sep);
12518 else {
12519 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000012520 "%s arg must be None or str",
12521 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000012522 return NULL;
12523 }
12524 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012525
Benjamin Peterson14339b62009-01-31 16:36:08 +000012526 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012527}
12528
12529
INADA Naoki3ae20562017-01-16 20:41:20 +090012530/*[clinic input]
12531str.strip as unicode_strip
12532
12533 chars: object = None
12534 /
12535
Victor Stinner0c4a8282017-01-17 02:21:47 +010012536Return a copy of the string with leading and trailing whitespace removed.
INADA Naoki3ae20562017-01-16 20:41:20 +090012537
12538If chars is given and not None, remove characters in chars instead.
12539[clinic start generated code]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012540
12541static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012542unicode_strip_impl(PyObject *self, PyObject *chars)
Victor Stinner0c4a8282017-01-17 02:21:47 +010012543/*[clinic end generated code: output=ca19018454345d57 input=eefe24a1059c352b]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012544{
INADA Naoki3ae20562017-01-16 20:41:20 +090012545 return do_argstrip(self, BOTHSTRIP, chars);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012546}
12547
12548
INADA Naoki3ae20562017-01-16 20:41:20 +090012549/*[clinic input]
12550str.lstrip as unicode_lstrip
12551
12552 chars: object = NULL
12553 /
12554
12555Return a copy of the string with leading whitespace removed.
12556
12557If chars is given and not None, remove characters in chars instead.
12558[clinic start generated code]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012559
12560static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012561unicode_lstrip_impl(PyObject *self, PyObject *chars)
12562/*[clinic end generated code: output=3b43683251f79ca7 input=9e56f3c45f5ff4c3]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012563{
INADA Naoki3ae20562017-01-16 20:41:20 +090012564 return do_argstrip(self, LEFTSTRIP, chars);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012565}
12566
12567
INADA Naoki3ae20562017-01-16 20:41:20 +090012568/*[clinic input]
12569str.rstrip as unicode_rstrip
12570
12571 chars: object = NULL
12572 /
12573
12574Return a copy of the string with trailing whitespace removed.
12575
12576If chars is given and not None, remove characters in chars instead.
12577[clinic start generated code]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012578
12579static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012580unicode_rstrip_impl(PyObject *self, PyObject *chars)
12581/*[clinic end generated code: output=4a59230017cc3b7a input=ac89d0219cb411ee]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012582{
INADA Naoki3ae20562017-01-16 20:41:20 +090012583 return do_argstrip(self, RIGHTSTRIP, chars);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012584}
12585
12586
Guido van Rossumd57fd912000-03-10 22:53:23 +000012587static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012588unicode_repeat(PyObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012589{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012590 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012591 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012592
Serhiy Storchaka05997252013-01-26 12:14:02 +020012593 if (len < 1)
12594 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +000012595
Victor Stinnerc4b49542011-12-11 22:44:26 +010012596 /* no repeat, return original string */
12597 if (len == 1)
12598 return unicode_result_unchanged(str);
Tim Peters8f422462000-09-09 06:13:41 +000012599
Benjamin Petersonbac79492012-01-14 13:34:47 -050012600 if (PyUnicode_READY(str) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012601 return NULL;
12602
Victor Stinnerc759f3e2011-10-01 03:09:58 +020012603 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020012604 PyErr_SetString(PyExc_OverflowError,
12605 "repeated string is too long");
12606 return NULL;
12607 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012608 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012609
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012610 u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012611 if (!u)
12612 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012613 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012614
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012615 if (PyUnicode_GET_LENGTH(str) == 1) {
12616 const int kind = PyUnicode_KIND(str);
12617 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
Victor Stinner73f53b52011-12-18 03:26:31 +010012618 if (kind == PyUnicode_1BYTE_KIND) {
12619 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012620 memset(to, (unsigned char)fill_char, len);
Victor Stinner73f53b52011-12-18 03:26:31 +010012621 }
12622 else if (kind == PyUnicode_2BYTE_KIND) {
12623 Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012624 for (n = 0; n < len; ++n)
Victor Stinner73f53b52011-12-18 03:26:31 +010012625 ucs2[n] = fill_char;
12626 } else {
12627 Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u);
12628 assert(kind == PyUnicode_4BYTE_KIND);
12629 for (n = 0; n < len; ++n)
12630 ucs4[n] = fill_char;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012631 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012632 }
12633 else {
12634 /* number of characters copied this far */
12635 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012636 const Py_ssize_t char_size = PyUnicode_KIND(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012637 char *to = (char *) PyUnicode_DATA(u);
Christian Heimesf051e432016-09-13 20:22:02 +020012638 memcpy(to, PyUnicode_DATA(str),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012639 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000012640 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012641 n = (done <= nchars-done) ? done : nchars-done;
Christian Heimesf051e432016-09-13 20:22:02 +020012642 memcpy(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012643 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000012644 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012645 }
12646
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012647 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012648 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012649}
12650
Alexander Belopolsky40018472011-02-26 01:02:56 +000012651PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012652PyUnicode_Replace(PyObject *str,
12653 PyObject *substr,
12654 PyObject *replstr,
Alexander Belopolsky40018472011-02-26 01:02:56 +000012655 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012656{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012657 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0 ||
12658 ensure_unicode(replstr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012659 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012660 return replace(str, substr, replstr, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012661}
12662
INADA Naoki3ae20562017-01-16 20:41:20 +090012663/*[clinic input]
12664str.replace as unicode_replace
Guido van Rossumd57fd912000-03-10 22:53:23 +000012665
INADA Naoki3ae20562017-01-16 20:41:20 +090012666 old: unicode
12667 new: unicode
12668 count: Py_ssize_t = -1
12669 Maximum number of occurrences to replace.
12670 -1 (the default value) means replace all occurrences.
12671 /
12672
12673Return a copy with all occurrences of substring old replaced by new.
12674
12675If the optional argument count is given, only the first count occurrences are
12676replaced.
12677[clinic start generated code]*/
12678
12679static PyObject *
12680unicode_replace_impl(PyObject *self, PyObject *old, PyObject *new,
12681 Py_ssize_t count)
12682/*[clinic end generated code: output=b63f1a8b5eebf448 input=147d12206276ebeb]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012683{
Benjamin Peterson22a29702012-01-02 09:00:30 -060012684 if (PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000012685 return NULL;
INADA Naoki3ae20562017-01-16 20:41:20 +090012686 return replace(self, old, new, count);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012687}
12688
Alexander Belopolsky40018472011-02-26 01:02:56 +000012689static PyObject *
12690unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012691{
Walter Dörwald79e913e2007-05-12 11:08:06 +000012692 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012693 Py_ssize_t isize;
12694 Py_ssize_t osize, squote, dquote, i, o;
12695 Py_UCS4 max, quote;
Victor Stinner55c08782013-04-14 18:45:39 +020012696 int ikind, okind, unchanged;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012697 void *idata, *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000012698
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012699 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000012700 return NULL;
12701
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012702 isize = PyUnicode_GET_LENGTH(unicode);
12703 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012704
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012705 /* Compute length of output, quote characters, and
12706 maximum character */
Victor Stinner55c08782013-04-14 18:45:39 +020012707 osize = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012708 max = 127;
12709 squote = dquote = 0;
12710 ikind = PyUnicode_KIND(unicode);
12711 for (i = 0; i < isize; i++) {
12712 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Benjamin Peterson736b8012014-09-29 23:02:15 -040012713 Py_ssize_t incr = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012714 switch (ch) {
Benjamin Peterson736b8012014-09-29 23:02:15 -040012715 case '\'': squote++; break;
12716 case '"': dquote++; break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012717 case '\\': case '\t': case '\r': case '\n':
Benjamin Peterson736b8012014-09-29 23:02:15 -040012718 incr = 2;
12719 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012720 default:
12721 /* Fast-path ASCII */
12722 if (ch < ' ' || ch == 0x7f)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012723 incr = 4; /* \xHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012724 else if (ch < 0x7f)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012725 ;
12726 else if (Py_UNICODE_ISPRINTABLE(ch))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012727 max = ch > max ? ch : max;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012728 else if (ch < 0x100)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012729 incr = 4; /* \xHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012730 else if (ch < 0x10000)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012731 incr = 6; /* \uHHHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012732 else
Benjamin Peterson736b8012014-09-29 23:02:15 -040012733 incr = 10; /* \uHHHHHHHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012734 }
Benjamin Peterson736b8012014-09-29 23:02:15 -040012735 if (osize > PY_SSIZE_T_MAX - incr) {
12736 PyErr_SetString(PyExc_OverflowError,
12737 "string is too long to generate repr");
12738 return NULL;
12739 }
12740 osize += incr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012741 }
12742
12743 quote = '\'';
Victor Stinner55c08782013-04-14 18:45:39 +020012744 unchanged = (osize == isize);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012745 if (squote) {
Victor Stinner55c08782013-04-14 18:45:39 +020012746 unchanged = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012747 if (dquote)
12748 /* Both squote and dquote present. Use squote,
12749 and escape them */
12750 osize += squote;
12751 else
12752 quote = '"';
12753 }
Victor Stinner55c08782013-04-14 18:45:39 +020012754 osize += 2; /* quotes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012755
12756 repr = PyUnicode_New(osize, max);
12757 if (repr == NULL)
12758 return NULL;
12759 okind = PyUnicode_KIND(repr);
12760 odata = PyUnicode_DATA(repr);
12761
12762 PyUnicode_WRITE(okind, odata, 0, quote);
12763 PyUnicode_WRITE(okind, odata, osize-1, quote);
Victor Stinner55c08782013-04-14 18:45:39 +020012764 if (unchanged) {
12765 _PyUnicode_FastCopyCharacters(repr, 1,
12766 unicode, 0,
12767 isize);
12768 }
12769 else {
12770 for (i = 0, o = 1; i < isize; i++) {
12771 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012772
Victor Stinner55c08782013-04-14 18:45:39 +020012773 /* Escape quotes and backslashes */
12774 if ((ch == quote) || (ch == '\\')) {
Kristján Valur Jónsson55e5dc82012-06-06 21:58:08 +000012775 PyUnicode_WRITE(okind, odata, o++, '\\');
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012776 PyUnicode_WRITE(okind, odata, o++, ch);
Victor Stinner55c08782013-04-14 18:45:39 +020012777 continue;
12778 }
12779
12780 /* Map special whitespace to '\t', \n', '\r' */
12781 if (ch == '\t') {
12782 PyUnicode_WRITE(okind, odata, o++, '\\');
12783 PyUnicode_WRITE(okind, odata, o++, 't');
12784 }
12785 else if (ch == '\n') {
12786 PyUnicode_WRITE(okind, odata, o++, '\\');
12787 PyUnicode_WRITE(okind, odata, o++, 'n');
12788 }
12789 else if (ch == '\r') {
12790 PyUnicode_WRITE(okind, odata, o++, '\\');
12791 PyUnicode_WRITE(okind, odata, o++, 'r');
12792 }
12793
12794 /* Map non-printable US ASCII to '\xhh' */
12795 else if (ch < ' ' || ch == 0x7F) {
12796 PyUnicode_WRITE(okind, odata, o++, '\\');
12797 PyUnicode_WRITE(okind, odata, o++, 'x');
12798 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12799 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12800 }
12801
12802 /* Copy ASCII characters as-is */
12803 else if (ch < 0x7F) {
12804 PyUnicode_WRITE(okind, odata, o++, ch);
12805 }
12806
12807 /* Non-ASCII characters */
12808 else {
12809 /* Map Unicode whitespace and control characters
12810 (categories Z* and C* except ASCII space)
12811 */
12812 if (!Py_UNICODE_ISPRINTABLE(ch)) {
12813 PyUnicode_WRITE(okind, odata, o++, '\\');
12814 /* Map 8-bit characters to '\xhh' */
12815 if (ch <= 0xff) {
12816 PyUnicode_WRITE(okind, odata, o++, 'x');
12817 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12818 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12819 }
12820 /* Map 16-bit characters to '\uxxxx' */
12821 else if (ch <= 0xffff) {
12822 PyUnicode_WRITE(okind, odata, o++, 'u');
12823 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12824 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12825 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12826 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12827 }
12828 /* Map 21-bit characters to '\U00xxxxxx' */
12829 else {
12830 PyUnicode_WRITE(okind, odata, o++, 'U');
12831 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
12832 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
12833 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
12834 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
12835 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12836 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12837 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12838 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12839 }
12840 }
12841 /* Copy characters as-is */
12842 else {
12843 PyUnicode_WRITE(okind, odata, o++, ch);
12844 }
Georg Brandl559e5d72008-06-11 18:37:52 +000012845 }
12846 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000012847 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012848 /* Closing quote already added at the beginning */
Victor Stinner05d11892011-10-06 01:13:58 +020012849 assert(_PyUnicode_CheckConsistency(repr, 1));
Walter Dörwald79e913e2007-05-12 11:08:06 +000012850 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012851}
12852
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012853PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012854 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012855\n\
12856Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080012857such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012858arguments start and end are interpreted as in slice notation.\n\
12859\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012860Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012861
12862static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012863unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012864{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010012865 /* initialize variables to prevent gcc warning */
12866 PyObject *substring = NULL;
12867 Py_ssize_t start = 0;
12868 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012869 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012870
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030012871 if (!parse_args_finds_unicode("rfind", args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012872 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012873
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012874 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012875 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012876
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012877 result = any_find_slice(self, substring, start, end, -1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012878
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012879 if (result == -2)
12880 return NULL;
12881
Christian Heimes217cfd12007-12-02 14:31:20 +000012882 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012883}
12884
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012885PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012886 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012887\n\
Lisa Roach43ba8862017-04-04 22:36:22 -070012888Return the highest index in S where substring sub is found,\n\
12889such that sub is contained within S[start:end]. Optional\n\
12890arguments start and end are interpreted as in slice notation.\n\
12891\n\
12892Raises ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012893
12894static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012895unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012896{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010012897 /* initialize variables to prevent gcc warning */
12898 PyObject *substring = NULL;
12899 Py_ssize_t start = 0;
12900 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012901 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012902
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030012903 if (!parse_args_finds_unicode("rindex", args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012904 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012905
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012906 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012907 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012908
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012909 result = any_find_slice(self, substring, start, end, -1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012910
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012911 if (result == -2)
12912 return NULL;
12913
Guido van Rossumd57fd912000-03-10 22:53:23 +000012914 if (result < 0) {
12915 PyErr_SetString(PyExc_ValueError, "substring not found");
12916 return NULL;
12917 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012918
Christian Heimes217cfd12007-12-02 14:31:20 +000012919 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012920}
12921
INADA Naoki3ae20562017-01-16 20:41:20 +090012922/*[clinic input]
12923str.rjust as unicode_rjust
12924
12925 width: Py_ssize_t
12926 fillchar: Py_UCS4 = ' '
12927 /
12928
12929Return a right-justified string of length width.
12930
12931Padding is done using the specified fill character (default is a space).
12932[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012933
12934static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012935unicode_rjust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
12936/*[clinic end generated code: output=804a1a57fbe8d5cf input=d05f550b5beb1f72]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012937{
Benjamin Petersonbac79492012-01-14 13:34:47 -050012938 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012939 return NULL;
12940
Victor Stinnerc4b49542011-12-11 22:44:26 +010012941 if (PyUnicode_GET_LENGTH(self) >= width)
12942 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012943
Victor Stinnerc4b49542011-12-11 22:44:26 +010012944 return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012945}
12946
Alexander Belopolsky40018472011-02-26 01:02:56 +000012947PyObject *
12948PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012949{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012950 if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012951 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012952
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012953 return split(s, sep, maxsplit);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012954}
12955
INADA Naoki3ae20562017-01-16 20:41:20 +090012956/*[clinic input]
12957str.split as unicode_split
Guido van Rossumd57fd912000-03-10 22:53:23 +000012958
INADA Naoki3ae20562017-01-16 20:41:20 +090012959 sep: object = None
12960 The delimiter according to which to split the string.
12961 None (the default value) means split according to any whitespace,
12962 and discard empty strings from the result.
12963 maxsplit: Py_ssize_t = -1
12964 Maximum number of splits to do.
12965 -1 (the default value) means no limit.
12966
12967Return a list of the words in the string, using sep as the delimiter string.
12968[clinic start generated code]*/
12969
12970static PyObject *
12971unicode_split_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
12972/*[clinic end generated code: output=3a65b1db356948dc input=606e750488a82359]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012973{
INADA Naoki3ae20562017-01-16 20:41:20 +090012974 if (sep == Py_None)
12975 return split(self, NULL, maxsplit);
12976 if (PyUnicode_Check(sep))
12977 return split(self, sep, maxsplit);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012978
Victor Stinner998b8062018-09-12 00:23:25 +020012979 PyErr_Format(PyExc_TypeError,
12980 "must be str or None, not %.100s",
12981 Py_TYPE(sep)->tp_name);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012982 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012983}
12984
Thomas Wouters477c8d52006-05-27 19:21:47 +000012985PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012986PyUnicode_Partition(PyObject *str_obj, PyObject *sep_obj)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012987{
Thomas Wouters477c8d52006-05-27 19:21:47 +000012988 PyObject* out;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012989 int kind1, kind2;
12990 void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012991 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012992
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012993 if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012994 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012995
Victor Stinner14f8f022011-10-05 20:58:25 +020012996 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012997 kind2 = PyUnicode_KIND(sep_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012998 len1 = PyUnicode_GET_LENGTH(str_obj);
12999 len2 = PyUnicode_GET_LENGTH(sep_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013000 if (kind1 < kind2 || len1 < len2) {
13001 _Py_INCREF_UNICODE_EMPTY();
13002 if (!unicode_empty)
13003 out = NULL;
13004 else {
13005 out = PyTuple_Pack(3, str_obj, unicode_empty, unicode_empty);
13006 Py_DECREF(unicode_empty);
13007 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013008 return out;
13009 }
13010 buf1 = PyUnicode_DATA(str_obj);
13011 buf2 = PyUnicode_DATA(sep_obj);
13012 if (kind2 != kind1) {
13013 buf2 = _PyUnicode_AsKind(sep_obj, kind1);
13014 if (!buf2)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013015 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013016 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013017
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013018 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013019 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020013020 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
13021 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
13022 else
13023 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013024 break;
13025 case PyUnicode_2BYTE_KIND:
13026 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
13027 break;
13028 case PyUnicode_4BYTE_KIND:
13029 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
13030 break;
13031 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070013032 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013033 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000013034
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013035 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013036 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013037
13038 return out;
13039}
13040
13041
13042PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013043PyUnicode_RPartition(PyObject *str_obj, PyObject *sep_obj)
Thomas Wouters477c8d52006-05-27 19:21:47 +000013044{
Thomas Wouters477c8d52006-05-27 19:21:47 +000013045 PyObject* out;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013046 int kind1, kind2;
13047 void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013048 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000013049
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013050 if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013051 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000013052
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013053 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013054 kind2 = PyUnicode_KIND(sep_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013055 len1 = PyUnicode_GET_LENGTH(str_obj);
13056 len2 = PyUnicode_GET_LENGTH(sep_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013057 if (kind1 < kind2 || len1 < len2) {
13058 _Py_INCREF_UNICODE_EMPTY();
13059 if (!unicode_empty)
13060 out = NULL;
13061 else {
13062 out = PyTuple_Pack(3, unicode_empty, unicode_empty, str_obj);
13063 Py_DECREF(unicode_empty);
13064 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013065 return out;
13066 }
13067 buf1 = PyUnicode_DATA(str_obj);
13068 buf2 = PyUnicode_DATA(sep_obj);
13069 if (kind2 != kind1) {
13070 buf2 = _PyUnicode_AsKind(sep_obj, kind1);
13071 if (!buf2)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013072 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013073 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013074
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013075 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013076 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020013077 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
13078 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13079 else
13080 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013081 break;
13082 case PyUnicode_2BYTE_KIND:
13083 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13084 break;
13085 case PyUnicode_4BYTE_KIND:
13086 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13087 break;
13088 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070013089 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013090 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000013091
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013092 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013093 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013094
13095 return out;
13096}
13097
INADA Naoki3ae20562017-01-16 20:41:20 +090013098/*[clinic input]
13099str.partition as unicode_partition
Thomas Wouters477c8d52006-05-27 19:21:47 +000013100
INADA Naoki3ae20562017-01-16 20:41:20 +090013101 sep: object
13102 /
13103
13104Partition the string into three parts using the given separator.
13105
13106This will search for the separator in the string. If the separator is found,
13107returns a 3-tuple containing the part before the separator, the separator
13108itself, and the part after it.
13109
13110If the separator is not found, returns a 3-tuple containing the original string
13111and two empty strings.
13112[clinic start generated code]*/
13113
13114static PyObject *
13115unicode_partition(PyObject *self, PyObject *sep)
13116/*[clinic end generated code: output=e4ced7bd253ca3c4 input=f29b8d06c63e50be]*/
Thomas Wouters477c8d52006-05-27 19:21:47 +000013117{
INADA Naoki3ae20562017-01-16 20:41:20 +090013118 return PyUnicode_Partition(self, sep);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013119}
13120
INADA Naoki3ae20562017-01-16 20:41:20 +090013121/*[clinic input]
13122str.rpartition as unicode_rpartition = str.partition
Thomas Wouters477c8d52006-05-27 19:21:47 +000013123
INADA Naoki3ae20562017-01-16 20:41:20 +090013124Partition the string into three parts using the given separator.
13125
Serhiy Storchakaa2314282017-10-29 02:11:54 +030013126This will search for the separator in the string, starting at the end. If
INADA Naoki3ae20562017-01-16 20:41:20 +090013127the separator is found, returns a 3-tuple containing the part before the
13128separator, the separator itself, and the part after it.
13129
13130If the separator is not found, returns a 3-tuple containing two empty strings
13131and the original string.
13132[clinic start generated code]*/
13133
13134static PyObject *
13135unicode_rpartition(PyObject *self, PyObject *sep)
Serhiy Storchakaa2314282017-10-29 02:11:54 +030013136/*[clinic end generated code: output=1aa13cf1156572aa input=c4b7db3ef5cf336a]*/
Thomas Wouters477c8d52006-05-27 19:21:47 +000013137{
INADA Naoki3ae20562017-01-16 20:41:20 +090013138 return PyUnicode_RPartition(self, sep);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013139}
13140
Alexander Belopolsky40018472011-02-26 01:02:56 +000013141PyObject *
13142PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013143{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013144 if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
Benjamin Peterson14339b62009-01-31 16:36:08 +000013145 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013146
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013147 return rsplit(s, sep, maxsplit);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013148}
13149
INADA Naoki3ae20562017-01-16 20:41:20 +090013150/*[clinic input]
13151str.rsplit as unicode_rsplit = str.split
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013152
INADA Naoki3ae20562017-01-16 20:41:20 +090013153Return a list of the words in the string, using sep as the delimiter string.
13154
13155Splits are done starting at the end of the string and working to the front.
13156[clinic start generated code]*/
13157
13158static PyObject *
13159unicode_rsplit_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
13160/*[clinic end generated code: output=c2b815c63bcabffc input=12ad4bf57dd35f15]*/
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013161{
INADA Naoki3ae20562017-01-16 20:41:20 +090013162 if (sep == Py_None)
13163 return rsplit(self, NULL, maxsplit);
13164 if (PyUnicode_Check(sep))
13165 return rsplit(self, sep, maxsplit);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013166
Victor Stinner998b8062018-09-12 00:23:25 +020013167 PyErr_Format(PyExc_TypeError,
13168 "must be str or None, not %.100s",
13169 Py_TYPE(sep)->tp_name);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013170 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013171}
13172
INADA Naoki3ae20562017-01-16 20:41:20 +090013173/*[clinic input]
13174str.splitlines as unicode_splitlines
Guido van Rossumd57fd912000-03-10 22:53:23 +000013175
Serhiy Storchaka202fda52017-03-12 10:10:47 +020013176 keepends: bool(accept={int}) = False
INADA Naoki3ae20562017-01-16 20:41:20 +090013177
13178Return a list of the lines in the string, breaking at line boundaries.
13179
13180Line breaks are not included in the resulting list unless keepends is given and
13181true.
13182[clinic start generated code]*/
13183
13184static PyObject *
13185unicode_splitlines_impl(PyObject *self, int keepends)
Serhiy Storchaka202fda52017-03-12 10:10:47 +020013186/*[clinic end generated code: output=f664dcdad153ec40 input=b508e180459bdd8b]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013187{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013188 return PyUnicode_Splitlines(self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013189}
13190
13191static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000013192PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013193{
Victor Stinnerc4b49542011-12-11 22:44:26 +010013194 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013195}
13196
INADA Naoki3ae20562017-01-16 20:41:20 +090013197/*[clinic input]
13198str.swapcase as unicode_swapcase
Guido van Rossumd57fd912000-03-10 22:53:23 +000013199
INADA Naoki3ae20562017-01-16 20:41:20 +090013200Convert uppercase characters to lowercase and lowercase characters to uppercase.
13201[clinic start generated code]*/
13202
13203static PyObject *
13204unicode_swapcase_impl(PyObject *self)
13205/*[clinic end generated code: output=5d28966bf6d7b2af input=3f3ef96d5798a7bb]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013206{
Benjamin Petersoneea48462012-01-16 14:28:50 -050013207 if (PyUnicode_READY(self) == -1)
13208 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010013209 return case_operation(self, do_swapcase);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013210}
13211
Larry Hastings61272b72014-01-07 12:41:53 -080013212/*[clinic input]
Georg Brandlceee0772007-11-27 23:48:05 +000013213
Larry Hastings31826802013-10-19 00:09:25 -070013214@staticmethod
13215str.maketrans as unicode_maketrans
13216
13217 x: object
13218
13219 y: unicode=NULL
13220
13221 z: unicode=NULL
13222
13223 /
13224
13225Return a translation table usable for str.translate().
13226
13227If there is only one argument, it must be a dictionary mapping Unicode
13228ordinals (integers) or characters to Unicode ordinals, strings or None.
13229Character keys will be then converted to ordinals.
13230If there are two arguments, they must be strings of equal length, and
13231in the resulting dictionary, each character in x will be mapped to the
13232character at the same position in y. If there is a third argument, it
13233must be a string, whose characters will be mapped to None in the result.
Larry Hastings61272b72014-01-07 12:41:53 -080013234[clinic start generated code]*/
Larry Hastings31826802013-10-19 00:09:25 -070013235
Larry Hastings31826802013-10-19 00:09:25 -070013236static PyObject *
Larry Hastings5c661892014-01-24 06:17:25 -080013237unicode_maketrans_impl(PyObject *x, PyObject *y, PyObject *z)
Serhiy Storchaka1009bf12015-04-03 23:53:51 +030013238/*[clinic end generated code: output=a925c89452bd5881 input=7bfbf529a293c6c5]*/
Larry Hastings31826802013-10-19 00:09:25 -070013239{
Georg Brandlceee0772007-11-27 23:48:05 +000013240 PyObject *new = NULL, *key, *value;
13241 Py_ssize_t i = 0;
13242 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013243
Georg Brandlceee0772007-11-27 23:48:05 +000013244 new = PyDict_New();
13245 if (!new)
13246 return NULL;
13247 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013248 int x_kind, y_kind, z_kind;
13249 void *x_data, *y_data, *z_data;
13250
Georg Brandlceee0772007-11-27 23:48:05 +000013251 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000013252 if (!PyUnicode_Check(x)) {
13253 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
13254 "be a string if there is a second argument");
13255 goto err;
13256 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013257 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013258 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
13259 "arguments must have equal length");
13260 goto err;
13261 }
13262 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013263 x_kind = PyUnicode_KIND(x);
13264 y_kind = PyUnicode_KIND(y);
13265 x_data = PyUnicode_DATA(x);
13266 y_data = PyUnicode_DATA(y);
13267 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
13268 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060013269 if (!key)
Georg Brandlceee0772007-11-27 23:48:05 +000013270 goto err;
Benjamin Peterson822c7902011-12-20 13:32:50 -060013271 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060013272 if (!value) {
13273 Py_DECREF(key);
13274 goto err;
13275 }
Georg Brandlceee0772007-11-27 23:48:05 +000013276 res = PyDict_SetItem(new, key, value);
13277 Py_DECREF(key);
13278 Py_DECREF(value);
13279 if (res < 0)
13280 goto err;
13281 }
13282 /* create entries for deleting chars in z */
13283 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013284 z_kind = PyUnicode_KIND(z);
13285 z_data = PyUnicode_DATA(z);
Victor Stinnerc4f281e2011-10-11 22:11:42 +020013286 for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013287 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000013288 if (!key)
13289 goto err;
13290 res = PyDict_SetItem(new, key, Py_None);
13291 Py_DECREF(key);
13292 if (res < 0)
13293 goto err;
13294 }
13295 }
13296 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013297 int kind;
13298 void *data;
13299
Georg Brandlceee0772007-11-27 23:48:05 +000013300 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000013301 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013302 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
13303 "to maketrans it must be a dict");
13304 goto err;
13305 }
13306 /* copy entries into the new dict, converting string keys to int keys */
13307 while (PyDict_Next(x, &i, &key, &value)) {
13308 if (PyUnicode_Check(key)) {
13309 /* convert string keys to integer keys */
13310 PyObject *newkey;
Victor Stinnerc4f281e2011-10-11 22:11:42 +020013311 if (PyUnicode_GET_LENGTH(key) != 1) {
Georg Brandlceee0772007-11-27 23:48:05 +000013312 PyErr_SetString(PyExc_ValueError, "string keys in translate "
13313 "table must be of length 1");
13314 goto err;
13315 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013316 kind = PyUnicode_KIND(key);
13317 data = PyUnicode_DATA(key);
13318 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000013319 if (!newkey)
13320 goto err;
13321 res = PyDict_SetItem(new, newkey, value);
13322 Py_DECREF(newkey);
13323 if (res < 0)
13324 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000013325 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013326 /* just keep integer keys */
13327 if (PyDict_SetItem(new, key, value) < 0)
13328 goto err;
13329 } else {
13330 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
13331 "be strings or integers");
13332 goto err;
13333 }
13334 }
13335 }
13336 return new;
13337 err:
13338 Py_DECREF(new);
13339 return NULL;
13340}
13341
INADA Naoki3ae20562017-01-16 20:41:20 +090013342/*[clinic input]
13343str.translate as unicode_translate
Guido van Rossumd57fd912000-03-10 22:53:23 +000013344
INADA Naoki3ae20562017-01-16 20:41:20 +090013345 table: object
13346 Translation table, which must be a mapping of Unicode ordinals to
13347 Unicode ordinals, strings, or None.
13348 /
13349
13350Replace each character in the string using the given translation table.
13351
13352The table must implement lookup/indexing via __getitem__, for instance a
13353dictionary or list. If this operation raises LookupError, the character is
13354left untouched. Characters mapped to None are deleted.
13355[clinic start generated code]*/
13356
13357static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013358unicode_translate(PyObject *self, PyObject *table)
INADA Naoki3ae20562017-01-16 20:41:20 +090013359/*[clinic end generated code: output=3cb448ff2fd96bf3 input=6d38343db63d8eb0]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013360{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013361 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013362}
13363
INADA Naoki3ae20562017-01-16 20:41:20 +090013364/*[clinic input]
13365str.upper as unicode_upper
Guido van Rossumd57fd912000-03-10 22:53:23 +000013366
INADA Naoki3ae20562017-01-16 20:41:20 +090013367Return a copy of the string converted to uppercase.
13368[clinic start generated code]*/
13369
13370static PyObject *
13371unicode_upper_impl(PyObject *self)
13372/*[clinic end generated code: output=1b7ddd16bbcdc092 input=db3d55682dfe2e6c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013373{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050013374 if (PyUnicode_READY(self) == -1)
13375 return NULL;
13376 if (PyUnicode_IS_ASCII(self))
13377 return ascii_upper_or_lower(self, 0);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010013378 return case_operation(self, do_upper);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013379}
13380
INADA Naoki3ae20562017-01-16 20:41:20 +090013381/*[clinic input]
13382str.zfill as unicode_zfill
13383
13384 width: Py_ssize_t
13385 /
13386
13387Pad a numeric string with zeros on the left, to fill a field of the given width.
13388
13389The string is never truncated.
13390[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013391
13392static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090013393unicode_zfill_impl(PyObject *self, Py_ssize_t width)
INADA Naoki15f94592017-01-16 21:49:13 +090013394/*[clinic end generated code: output=e13fb6bdf8e3b9df input=c6b2f772c6f27799]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013395{
Martin v. Löwis18e16552006-02-15 17:27:45 +000013396 Py_ssize_t fill;
Victor Stinner9310abb2011-10-05 00:59:23 +020013397 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013398 int kind;
13399 void *data;
13400 Py_UCS4 chr;
13401
Benjamin Petersonbac79492012-01-14 13:34:47 -050013402 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010013403 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013404
Victor Stinnerc4b49542011-12-11 22:44:26 +010013405 if (PyUnicode_GET_LENGTH(self) >= width)
13406 return unicode_result_unchanged(self);
13407
13408 fill = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013409
13410 u = pad(self, fill, 0, '0');
13411
Walter Dörwald068325e2002-04-15 13:36:47 +000013412 if (u == NULL)
13413 return NULL;
13414
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013415 kind = PyUnicode_KIND(u);
13416 data = PyUnicode_DATA(u);
13417 chr = PyUnicode_READ(kind, data, fill);
13418
13419 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000013420 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013421 PyUnicode_WRITE(kind, data, 0, chr);
13422 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000013423 }
13424
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013425 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +010013426 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013427}
Guido van Rossumd57fd912000-03-10 22:53:23 +000013428
#if 0
/* Compiled out: thin wrapper around
   PyUnicode_TransformDecimalAndSpaceToASCII(). */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *converted = PyUnicode_TransformDecimalAndSpaceToASCII(self);

    return converted;
}
#endif
13436
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013437PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013438 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013439\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013440Return True if S starts with the specified prefix, False otherwise.\n\
13441With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013442With optional end, stop comparing S at that position.\n\
13443prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013444
13445static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013446unicode_startswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013447 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013448{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013449 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013450 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013451 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013452 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013453 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013454
Jesus Ceaac451502011-04-20 17:09:23 +020013455 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013456 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013457 if (PyTuple_Check(subobj)) {
13458 Py_ssize_t i;
13459 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013460 substring = PyTuple_GET_ITEM(subobj, i);
13461 if (!PyUnicode_Check(substring)) {
13462 PyErr_Format(PyExc_TypeError,
13463 "tuple for startswith must only contain str, "
Victor Stinner998b8062018-09-12 00:23:25 +020013464 "not %.100s",
13465 Py_TYPE(substring)->tp_name);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013466 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013467 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013468 result = tailmatch(self, substring, start, end, -1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013469 if (result == -1)
13470 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013471 if (result) {
13472 Py_RETURN_TRUE;
13473 }
13474 }
13475 /* nothing matched */
13476 Py_RETURN_FALSE;
13477 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013478 if (!PyUnicode_Check(subobj)) {
13479 PyErr_Format(PyExc_TypeError,
13480 "startswith first arg must be str or "
Victor Stinner998b8062018-09-12 00:23:25 +020013481 "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013482 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013483 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013484 result = tailmatch(self, subobj, start, end, -1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013485 if (result == -1)
13486 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013487 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013488}
13489
13490
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013491PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013492 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013493\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013494Return True if S ends with the specified suffix, False otherwise.\n\
13495With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013496With optional end, stop comparing S at that position.\n\
13497suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013498
13499static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013500unicode_endswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013501 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013502{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013503 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013504 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013505 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013506 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013507 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013508
Jesus Ceaac451502011-04-20 17:09:23 +020013509 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013510 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013511 if (PyTuple_Check(subobj)) {
13512 Py_ssize_t i;
13513 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013514 substring = PyTuple_GET_ITEM(subobj, i);
13515 if (!PyUnicode_Check(substring)) {
13516 PyErr_Format(PyExc_TypeError,
13517 "tuple for endswith must only contain str, "
Victor Stinner998b8062018-09-12 00:23:25 +020013518 "not %.100s",
13519 Py_TYPE(substring)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013520 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013521 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013522 result = tailmatch(self, substring, start, end, +1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013523 if (result == -1)
13524 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013525 if (result) {
13526 Py_RETURN_TRUE;
13527 }
13528 }
13529 Py_RETURN_FALSE;
13530 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013531 if (!PyUnicode_Check(subobj)) {
13532 PyErr_Format(PyExc_TypeError,
13533 "endswith first arg must be str or "
Victor Stinner998b8062018-09-12 00:23:25 +020013534 "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013535 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013536 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013537 result = tailmatch(self, subobj, start, end, +1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013538 if (result == -1)
13539 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013540 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013541}
13542
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070013543static inline void
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013544_PyUnicodeWriter_Update(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013545{
Victor Stinnereb36fda2015-10-03 01:55:51 +020013546 writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer);
13547 writer->data = PyUnicode_DATA(writer->buffer);
13548
13549 if (!writer->readonly) {
13550 writer->kind = PyUnicode_KIND(writer->buffer);
Victor Stinner8f674cc2013-04-17 23:02:17 +020013551 writer->size = PyUnicode_GET_LENGTH(writer->buffer);
Victor Stinnereb36fda2015-10-03 01:55:51 +020013552 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013553 else {
Victor Stinnereb36fda2015-10-03 01:55:51 +020013554 /* use a value smaller than PyUnicode_1BYTE_KIND() so
13555 _PyUnicodeWriter_PrepareKind() will copy the buffer. */
13556 writer->kind = PyUnicode_WCHAR_KIND;
13557 assert(writer->kind <= PyUnicode_1BYTE_KIND);
13558
Victor Stinner8f674cc2013-04-17 23:02:17 +020013559 /* Copy-on-write mode: set buffer size to 0 so
13560 * _PyUnicodeWriter_Prepare() will copy (and enlarge) the buffer on
13561 * next write. */
13562 writer->size = 0;
13563 }
Victor Stinner202fdca2012-05-07 12:47:02 +020013564}
13565
Victor Stinnerd3f08822012-05-29 12:57:52 +020013566void
Victor Stinner8f674cc2013-04-17 23:02:17 +020013567_PyUnicodeWriter_Init(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013568{
Victor Stinnerd3f08822012-05-29 12:57:52 +020013569 memset(writer, 0, sizeof(*writer));
Victor Stinnereb36fda2015-10-03 01:55:51 +020013570
13571 /* ASCII is the bare minimum */
Victor Stinner8f674cc2013-04-17 23:02:17 +020013572 writer->min_char = 127;
Victor Stinnereb36fda2015-10-03 01:55:51 +020013573
13574 /* use a value smaller than PyUnicode_1BYTE_KIND() so
13575 _PyUnicodeWriter_PrepareKind() will copy the buffer. */
13576 writer->kind = PyUnicode_WCHAR_KIND;
13577 assert(writer->kind <= PyUnicode_1BYTE_KIND);
Victor Stinner202fdca2012-05-07 12:47:02 +020013578}
13579
Inada Naoki770847a2019-06-24 12:30:24 +090013580// Initialize _PyUnicodeWriter with initial buffer
13581static inline void
13582_PyUnicodeWriter_InitWithBuffer(_PyUnicodeWriter *writer, PyObject *buffer)
13583{
13584 memset(writer, 0, sizeof(*writer));
13585 writer->buffer = buffer;
13586 _PyUnicodeWriter_Update(writer);
13587 writer->min_length = writer->size;
13588}
13589
Victor Stinnerd3f08822012-05-29 12:57:52 +020013590int
13591_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
13592 Py_ssize_t length, Py_UCS4 maxchar)
Victor Stinner202fdca2012-05-07 12:47:02 +020013593{
13594 Py_ssize_t newlen;
13595 PyObject *newbuffer;
13596
Victor Stinner2740e462016-09-06 16:58:36 -070013597 assert(maxchar <= MAX_UNICODE);
13598
Victor Stinnerca9381e2015-09-22 00:58:32 +020013599 /* ensure that the _PyUnicodeWriter_Prepare macro was used */
Victor Stinner61744742015-09-22 01:01:17 +020013600 assert((maxchar > writer->maxchar && length >= 0)
13601 || length > 0);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013602
Victor Stinner202fdca2012-05-07 12:47:02 +020013603 if (length > PY_SSIZE_T_MAX - writer->pos) {
13604 PyErr_NoMemory();
13605 return -1;
13606 }
13607 newlen = writer->pos + length;
13608
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070013609 maxchar = Py_MAX(maxchar, writer->min_char);
Victor Stinner8f674cc2013-04-17 23:02:17 +020013610
Victor Stinnerd3f08822012-05-29 12:57:52 +020013611 if (writer->buffer == NULL) {
Victor Stinner8f674cc2013-04-17 23:02:17 +020013612 assert(!writer->readonly);
Victor Stinner6989ba02013-11-18 21:08:39 +010013613 if (writer->overallocate
13614 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13615 /* overallocate to limit the number of realloc() */
13616 newlen += newlen / OVERALLOCATE_FACTOR;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013617 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013618 if (newlen < writer->min_length)
13619 newlen = writer->min_length;
13620
Victor Stinnerd3f08822012-05-29 12:57:52 +020013621 writer->buffer = PyUnicode_New(newlen, maxchar);
13622 if (writer->buffer == NULL)
13623 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013624 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013625 else if (newlen > writer->size) {
Victor Stinner6989ba02013-11-18 21:08:39 +010013626 if (writer->overallocate
13627 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13628 /* overallocate to limit the number of realloc() */
13629 newlen += newlen / OVERALLOCATE_FACTOR;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013630 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013631 if (newlen < writer->min_length)
13632 newlen = writer->min_length;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013633
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013634 if (maxchar > writer->maxchar || writer->readonly) {
Victor Stinner202fdca2012-05-07 12:47:02 +020013635 /* resize + widen */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +030013636 maxchar = Py_MAX(maxchar, writer->maxchar);
Victor Stinner202fdca2012-05-07 12:47:02 +020013637 newbuffer = PyUnicode_New(newlen, maxchar);
13638 if (newbuffer == NULL)
13639 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013640 _PyUnicode_FastCopyCharacters(newbuffer, 0,
13641 writer->buffer, 0, writer->pos);
Victor Stinner202fdca2012-05-07 12:47:02 +020013642 Py_DECREF(writer->buffer);
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013643 writer->readonly = 0;
Victor Stinner202fdca2012-05-07 12:47:02 +020013644 }
13645 else {
13646 newbuffer = resize_compact(writer->buffer, newlen);
13647 if (newbuffer == NULL)
13648 return -1;
13649 }
13650 writer->buffer = newbuffer;
Victor Stinner202fdca2012-05-07 12:47:02 +020013651 }
13652 else if (maxchar > writer->maxchar) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013653 assert(!writer->readonly);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013654 newbuffer = PyUnicode_New(writer->size, maxchar);
13655 if (newbuffer == NULL)
Victor Stinner202fdca2012-05-07 12:47:02 +020013656 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013657 _PyUnicode_FastCopyCharacters(newbuffer, 0,
13658 writer->buffer, 0, writer->pos);
Serhiy Storchaka57a01d32016-04-10 18:05:40 +030013659 Py_SETREF(writer->buffer, newbuffer);
Victor Stinner202fdca2012-05-07 12:47:02 +020013660 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013661 _PyUnicodeWriter_Update(writer);
Victor Stinner202fdca2012-05-07 12:47:02 +020013662 return 0;
Victor Stinner6989ba02013-11-18 21:08:39 +010013663
13664#undef OVERALLOCATE_FACTOR
Victor Stinner202fdca2012-05-07 12:47:02 +020013665}
13666
Victor Stinnerca9381e2015-09-22 00:58:32 +020013667int
13668_PyUnicodeWriter_PrepareKindInternal(_PyUnicodeWriter *writer,
13669 enum PyUnicode_Kind kind)
13670{
13671 Py_UCS4 maxchar;
13672
13673 /* ensure that the _PyUnicodeWriter_PrepareKind macro was used */
13674 assert(writer->kind < kind);
13675
13676 switch (kind)
13677 {
13678 case PyUnicode_1BYTE_KIND: maxchar = 0xff; break;
13679 case PyUnicode_2BYTE_KIND: maxchar = 0xffff; break;
13680 case PyUnicode_4BYTE_KIND: maxchar = 0x10ffff; break;
13681 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070013682 Py_UNREACHABLE();
Victor Stinnerca9381e2015-09-22 00:58:32 +020013683 }
13684
13685 return _PyUnicodeWriter_PrepareInternal(writer, 0, maxchar);
13686}
13687
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070013688static inline int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020013689_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch)
Victor Stinnera0dd0212013-04-11 22:09:04 +020013690{
Victor Stinner2740e462016-09-06 16:58:36 -070013691 assert(ch <= MAX_UNICODE);
Victor Stinnera0dd0212013-04-11 22:09:04 +020013692 if (_PyUnicodeWriter_Prepare(writer, 1, ch) < 0)
13693 return -1;
13694 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, ch);
13695 writer->pos++;
13696 return 0;
13697}
13698
13699int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020013700_PyUnicodeWriter_WriteChar(_PyUnicodeWriter *writer, Py_UCS4 ch)
13701{
13702 return _PyUnicodeWriter_WriteCharInline(writer, ch);
13703}
13704
13705int
Victor Stinnerd3f08822012-05-29 12:57:52 +020013706_PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str)
13707{
13708 Py_UCS4 maxchar;
13709 Py_ssize_t len;
13710
13711 if (PyUnicode_READY(str) == -1)
13712 return -1;
13713 len = PyUnicode_GET_LENGTH(str);
13714 if (len == 0)
13715 return 0;
13716 maxchar = PyUnicode_MAX_CHAR_VALUE(str);
13717 if (maxchar > writer->maxchar || len > writer->size - writer->pos) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013718 if (writer->buffer == NULL && !writer->overallocate) {
Victor Stinner1912b392015-03-26 09:37:23 +010013719 assert(_PyUnicode_CheckConsistency(str, 1));
Victor Stinner8f674cc2013-04-17 23:02:17 +020013720 writer->readonly = 1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013721 Py_INCREF(str);
13722 writer->buffer = str;
13723 _PyUnicodeWriter_Update(writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013724 writer->pos += len;
13725 return 0;
13726 }
13727 if (_PyUnicodeWriter_PrepareInternal(writer, len, maxchar) == -1)
13728 return -1;
13729 }
13730 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13731 str, 0, len);
13732 writer->pos += len;
13733 return 0;
13734}
13735
Victor Stinnere215d962012-10-06 23:03:36 +020013736int
Victor Stinnercfc4c132013-04-03 01:48:39 +020013737_PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter *writer, PyObject *str,
13738 Py_ssize_t start, Py_ssize_t end)
13739{
13740 Py_UCS4 maxchar;
13741 Py_ssize_t len;
13742
13743 if (PyUnicode_READY(str) == -1)
13744 return -1;
13745
13746 assert(0 <= start);
13747 assert(end <= PyUnicode_GET_LENGTH(str));
13748 assert(start <= end);
13749
13750 if (end == 0)
13751 return 0;
13752
13753 if (start == 0 && end == PyUnicode_GET_LENGTH(str))
13754 return _PyUnicodeWriter_WriteStr(writer, str);
13755
13756 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
13757 maxchar = _PyUnicode_FindMaxChar(str, start, end);
13758 else
13759 maxchar = writer->maxchar;
13760 len = end - start;
13761
13762 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) < 0)
13763 return -1;
13764
13765 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13766 str, start, len);
13767 writer->pos += len;
13768 return 0;
13769}
13770
13771int
Victor Stinner4a587072013-11-19 12:54:53 +010013772_PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter *writer,
13773 const char *ascii, Py_ssize_t len)
13774{
13775 if (len == -1)
13776 len = strlen(ascii);
13777
13778 assert(ucs1lib_find_max_char((Py_UCS1*)ascii, (Py_UCS1*)ascii + len) < 128);
13779
13780 if (writer->buffer == NULL && !writer->overallocate) {
13781 PyObject *str;
13782
13783 str = _PyUnicode_FromASCII(ascii, len);
13784 if (str == NULL)
13785 return -1;
13786
13787 writer->readonly = 1;
13788 writer->buffer = str;
13789 _PyUnicodeWriter_Update(writer);
13790 writer->pos += len;
13791 return 0;
13792 }
13793
13794 if (_PyUnicodeWriter_Prepare(writer, len, 127) == -1)
13795 return -1;
13796
13797 switch (writer->kind)
13798 {
13799 case PyUnicode_1BYTE_KIND:
13800 {
13801 const Py_UCS1 *str = (const Py_UCS1 *)ascii;
13802 Py_UCS1 *data = writer->data;
13803
Christian Heimesf051e432016-09-13 20:22:02 +020013804 memcpy(data + writer->pos, str, len);
Victor Stinner4a587072013-11-19 12:54:53 +010013805 break;
13806 }
13807 case PyUnicode_2BYTE_KIND:
13808 {
13809 _PyUnicode_CONVERT_BYTES(
13810 Py_UCS1, Py_UCS2,
13811 ascii, ascii + len,
13812 (Py_UCS2 *)writer->data + writer->pos);
13813 break;
13814 }
13815 case PyUnicode_4BYTE_KIND:
13816 {
13817 _PyUnicode_CONVERT_BYTES(
13818 Py_UCS1, Py_UCS4,
13819 ascii, ascii + len,
13820 (Py_UCS4 *)writer->data + writer->pos);
13821 break;
13822 }
13823 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070013824 Py_UNREACHABLE();
Victor Stinner4a587072013-11-19 12:54:53 +010013825 }
13826
13827 writer->pos += len;
13828 return 0;
13829}
13830
13831int
13832_PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter *writer,
13833 const char *str, Py_ssize_t len)
Victor Stinnere215d962012-10-06 23:03:36 +020013834{
13835 Py_UCS4 maxchar;
13836
13837 maxchar = ucs1lib_find_max_char((Py_UCS1*)str, (Py_UCS1*)str + len);
13838 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) == -1)
13839 return -1;
13840 unicode_write_cstr(writer->buffer, writer->pos, str, len);
13841 writer->pos += len;
13842 return 0;
13843}
13844
Victor Stinnerd3f08822012-05-29 12:57:52 +020013845PyObject *
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013846_PyUnicodeWriter_Finish(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013847{
Victor Stinner15a0bd32013-07-08 22:29:55 +020013848 PyObject *str;
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030013849
Victor Stinnerd3f08822012-05-29 12:57:52 +020013850 if (writer->pos == 0) {
Victor Stinner9e6b4d72013-07-09 00:37:24 +020013851 Py_CLEAR(writer->buffer);
Serhiy Storchaka678db842013-01-26 12:16:36 +020013852 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3f08822012-05-29 12:57:52 +020013853 }
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030013854
13855 str = writer->buffer;
13856 writer->buffer = NULL;
13857
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013858 if (writer->readonly) {
Victor Stinner9e6b4d72013-07-09 00:37:24 +020013859 assert(PyUnicode_GET_LENGTH(str) == writer->pos);
13860 return str;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013861 }
Victor Stinner6c2cdae2015-10-12 13:29:43 +020013862
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030013863 if (PyUnicode_GET_LENGTH(str) != writer->pos) {
13864 PyObject *str2;
13865 str2 = resize_compact(str, writer->pos);
13866 if (str2 == NULL) {
13867 Py_DECREF(str);
13868 return NULL;
Victor Stinner6c2cdae2015-10-12 13:29:43 +020013869 }
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030013870 str = str2;
Victor Stinner6c2cdae2015-10-12 13:29:43 +020013871 }
13872
Victor Stinner15a0bd32013-07-08 22:29:55 +020013873 assert(_PyUnicode_CheckConsistency(str, 1));
13874 return unicode_result_ready(str);
Victor Stinner202fdca2012-05-07 12:47:02 +020013875}
13876
Victor Stinnerd3f08822012-05-29 12:57:52 +020013877void
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013878_PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013879{
13880 Py_CLEAR(writer->buffer);
13881}
13882
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013883#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000013884
13885PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013886 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000013887\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013888Return a formatted version of S, using substitutions from args and kwargs.\n\
13889The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000013890
Eric Smith27bbca62010-11-04 17:06:58 +000013891PyDoc_STRVAR(format_map__doc__,
13892 "S.format_map(mapping) -> str\n\
13893\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013894Return a formatted version of S, using substitutions from mapping.\n\
13895The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000013896
INADA Naoki3ae20562017-01-16 20:41:20 +090013897/*[clinic input]
13898str.__format__ as unicode___format__
13899
13900 format_spec: unicode
13901 /
13902
13903Return a formatted version of the string as described by format_spec.
13904[clinic start generated code]*/
13905
Eric Smith4a7d76d2008-05-30 18:10:19 +000013906static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090013907unicode___format___impl(PyObject *self, PyObject *format_spec)
INADA Naoki15f94592017-01-16 21:49:13 +090013908/*[clinic end generated code: output=45fceaca6d2ba4c8 input=5e135645d167a214]*/
Eric Smith4a7d76d2008-05-30 18:10:19 +000013909{
Victor Stinnerd3f08822012-05-29 12:57:52 +020013910 _PyUnicodeWriter writer;
13911 int ret;
Eric Smith4a7d76d2008-05-30 18:10:19 +000013912
Victor Stinnerd3f08822012-05-29 12:57:52 +020013913 if (PyUnicode_READY(self) == -1)
13914 return NULL;
Victor Stinner8f674cc2013-04-17 23:02:17 +020013915 _PyUnicodeWriter_Init(&writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013916 ret = _PyUnicode_FormatAdvancedWriter(&writer,
13917 self, format_spec, 0,
13918 PyUnicode_GET_LENGTH(format_spec));
13919 if (ret == -1) {
13920 _PyUnicodeWriter_Dealloc(&writer);
13921 return NULL;
13922 }
13923 return _PyUnicodeWriter_Finish(&writer);
Eric Smith4a7d76d2008-05-30 18:10:19 +000013924}
13925
INADA Naoki3ae20562017-01-16 20:41:20 +090013926/*[clinic input]
13927str.__sizeof__ as unicode_sizeof
13928
13929Return the size of the string in memory, in bytes.
13930[clinic start generated code]*/
Eric Smith8c663262007-08-25 02:26:07 +000013931
13932static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090013933unicode_sizeof_impl(PyObject *self)
13934/*[clinic end generated code: output=6dbc2f5a408b6d4f input=6dd011c108e33fb0]*/
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013935{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013936 Py_ssize_t size;
13937
13938 /* If it's a compact object, account for base structure +
13939 character data. */
INADA Naoki3ae20562017-01-16 20:41:20 +090013940 if (PyUnicode_IS_COMPACT_ASCII(self))
13941 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(self) + 1;
13942 else if (PyUnicode_IS_COMPACT(self))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013943 size = sizeof(PyCompactUnicodeObject) +
INADA Naoki3ae20562017-01-16 20:41:20 +090013944 (PyUnicode_GET_LENGTH(self) + 1) * PyUnicode_KIND(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013945 else {
13946 /* If it is a two-block object, account for base object, and
13947 for character block if present. */
13948 size = sizeof(PyUnicodeObject);
INADA Naoki3ae20562017-01-16 20:41:20 +090013949 if (_PyUnicode_DATA_ANY(self))
13950 size += (PyUnicode_GET_LENGTH(self) + 1) *
13951 PyUnicode_KIND(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013952 }
13953 /* If the wstr pointer is present, account for it unless it is shared
Victor Stinnera3be6132011-10-03 02:16:37 +020013954 with the data pointer. Check if the data is not shared. */
INADA Naoki3ae20562017-01-16 20:41:20 +090013955 if (_PyUnicode_HAS_WSTR_MEMORY(self))
13956 size += (PyUnicode_WSTR_LENGTH(self) + 1) * sizeof(wchar_t);
13957 if (_PyUnicode_HAS_UTF8_MEMORY(self))
13958 size += PyUnicode_UTF8_LENGTH(self) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013959
13960 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013961}
13962
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013963static PyObject *
Siddhesh Poyarekar55edd0c2018-04-30 00:29:33 +053013964unicode_getnewargs(PyObject *v, PyObject *Py_UNUSED(ignored))
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013965{
Victor Stinnerbf6e5602011-12-12 01:53:47 +010013966 PyObject *copy = _PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013967 if (!copy)
13968 return NULL;
13969 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013970}
13971
Guido van Rossumd57fd912000-03-10 22:53:23 +000013972static PyMethodDef unicode_methods[] = {
INADA Naoki3ae20562017-01-16 20:41:20 +090013973 UNICODE_ENCODE_METHODDEF
13974 UNICODE_REPLACE_METHODDEF
13975 UNICODE_SPLIT_METHODDEF
13976 UNICODE_RSPLIT_METHODDEF
13977 UNICODE_JOIN_METHODDEF
13978 UNICODE_CAPITALIZE_METHODDEF
13979 UNICODE_CASEFOLD_METHODDEF
13980 UNICODE_TITLE_METHODDEF
13981 UNICODE_CENTER_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013982 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090013983 UNICODE_EXPANDTABS_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013984 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090013985 UNICODE_PARTITION_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013986 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090013987 UNICODE_LJUST_METHODDEF
13988 UNICODE_LOWER_METHODDEF
13989 UNICODE_LSTRIP_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013990 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
13991 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090013992 UNICODE_RJUST_METHODDEF
13993 UNICODE_RSTRIP_METHODDEF
13994 UNICODE_RPARTITION_METHODDEF
13995 UNICODE_SPLITLINES_METHODDEF
13996 UNICODE_STRIP_METHODDEF
13997 UNICODE_SWAPCASE_METHODDEF
13998 UNICODE_TRANSLATE_METHODDEF
13999 UNICODE_UPPER_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000014000 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
14001 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
INADA Naokia49ac992018-01-27 14:06:21 +090014002 UNICODE_ISASCII_METHODDEF
INADA Naoki3ae20562017-01-16 20:41:20 +090014003 UNICODE_ISLOWER_METHODDEF
14004 UNICODE_ISUPPER_METHODDEF
14005 UNICODE_ISTITLE_METHODDEF
14006 UNICODE_ISSPACE_METHODDEF
14007 UNICODE_ISDECIMAL_METHODDEF
14008 UNICODE_ISDIGIT_METHODDEF
14009 UNICODE_ISNUMERIC_METHODDEF
14010 UNICODE_ISALPHA_METHODDEF
14011 UNICODE_ISALNUM_METHODDEF
14012 UNICODE_ISIDENTIFIER_METHODDEF
14013 UNICODE_ISPRINTABLE_METHODDEF
14014 UNICODE_ZFILL_METHODDEF
Serhiy Storchaka62be7422018-11-27 13:27:31 +020014015 {"format", (PyCFunction)(void(*)(void)) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +000014016 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090014017 UNICODE___FORMAT___METHODDEF
Larry Hastings31826802013-10-19 00:09:25 -070014018 UNICODE_MAKETRANS_METHODDEF
INADA Naoki3ae20562017-01-16 20:41:20 +090014019 UNICODE_SIZEOF_METHODDEF
Walter Dörwald068325e2002-04-15 13:36:47 +000014020#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000014021 /* These methods are just used for debugging the implementation. */
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000014022 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000014023#endif
14024
Siddhesh Poyarekar55edd0c2018-04-30 00:29:33 +053014025 {"__getnewargs__", unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000014026 {NULL, NULL}
14027};
14028
Neil Schemenauerce30bc92002-11-18 16:10:18 +000014029static PyObject *
14030unicode_mod(PyObject *v, PyObject *w)
14031{
Brian Curtindfc80e32011-08-10 20:28:54 -050014032 if (!PyUnicode_Check(v))
14033 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000014034 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000014035}
14036
14037static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014038 0, /*nb_add*/
14039 0, /*nb_subtract*/
14040 0, /*nb_multiply*/
14041 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +000014042};
14043
Guido van Rossumd57fd912000-03-10 22:53:23 +000014044static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014045 (lenfunc) unicode_length, /* sq_length */
14046 PyUnicode_Concat, /* sq_concat */
14047 (ssizeargfunc) unicode_repeat, /* sq_repeat */
14048 (ssizeargfunc) unicode_getitem, /* sq_item */
14049 0, /* sq_slice */
14050 0, /* sq_ass_item */
14051 0, /* sq_ass_slice */
14052 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +000014053};
14054
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014055static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014056unicode_subscript(PyObject* self, PyObject* item)
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014057{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014058 if (PyUnicode_READY(self) == -1)
14059 return NULL;
14060
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000014061 if (PyIndex_Check(item)) {
14062 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014063 if (i == -1 && PyErr_Occurred())
14064 return NULL;
14065 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014066 i += PyUnicode_GET_LENGTH(self);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014067 return unicode_getitem(self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014068 } else if (PySlice_Check(item)) {
Zackery Spytz14514d92019-05-17 01:13:03 -060014069 Py_ssize_t start, stop, step, slicelength, i;
14070 size_t cur;
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014071 PyObject *result;
14072 void *src_data, *dest_data;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014073 int src_kind, dest_kind;
Victor Stinnerc80d6d22011-10-05 14:13:28 +020014074 Py_UCS4 ch, max_char, kind_limit;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014075
Serhiy Storchakab879fe82017-04-08 09:53:51 +030014076 if (PySlice_Unpack(item, &start, &stop, &step) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014077 return NULL;
14078 }
Serhiy Storchakab879fe82017-04-08 09:53:51 +030014079 slicelength = PySlice_AdjustIndices(PyUnicode_GET_LENGTH(self),
14080 &start, &stop, step);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014081
14082 if (slicelength <= 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020014083 _Py_RETURN_UNICODE_EMPTY();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014084 } else if (start == 0 && step == 1 &&
Victor Stinnerc4b49542011-12-11 22:44:26 +010014085 slicelength == PyUnicode_GET_LENGTH(self)) {
14086 return unicode_result_unchanged(self);
Thomas Woutersed03b412007-08-28 21:37:11 +000014087 } else if (step == 1) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014088 return PyUnicode_Substring(self,
Victor Stinner12bab6d2011-10-01 01:53:49 +020014089 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014090 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014091 /* General case */
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014092 src_kind = PyUnicode_KIND(self);
14093 src_data = PyUnicode_DATA(self);
Victor Stinner55c99112011-10-13 01:17:06 +020014094 if (!PyUnicode_IS_ASCII(self)) {
14095 kind_limit = kind_maxchar_limit(src_kind);
14096 max_char = 0;
14097 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
14098 ch = PyUnicode_READ(src_kind, src_data, cur);
14099 if (ch > max_char) {
14100 max_char = ch;
14101 if (max_char >= kind_limit)
14102 break;
14103 }
Victor Stinnerc80d6d22011-10-05 14:13:28 +020014104 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014105 }
Victor Stinner55c99112011-10-13 01:17:06 +020014106 else
14107 max_char = 127;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014108 result = PyUnicode_New(slicelength, max_char);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014109 if (result == NULL)
14110 return NULL;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014111 dest_kind = PyUnicode_KIND(result);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014112 dest_data = PyUnicode_DATA(result);
14113
14114 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014115 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
14116 PyUnicode_WRITE(dest_kind, dest_data, i, ch);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014117 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020014118 assert(_PyUnicode_CheckConsistency(result, 1));
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014119 return result;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014120 } else {
14121 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
14122 return NULL;
14123 }
14124}
14125
14126static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014127 (lenfunc)unicode_length, /* mp_length */
14128 (binaryfunc)unicode_subscript, /* mp_subscript */
14129 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014130};
14131
Guido van Rossumd57fd912000-03-10 22:53:23 +000014132
Guido van Rossumd57fd912000-03-10 22:53:23 +000014133/* Helpers for PyUnicode_Format() */
14134
Victor Stinnera47082312012-10-04 02:19:54 +020014135struct unicode_formatter_t {
14136 PyObject *args;
14137 int args_owned;
14138 Py_ssize_t arglen, argidx;
14139 PyObject *dict;
14140
14141 enum PyUnicode_Kind fmtkind;
14142 Py_ssize_t fmtcnt, fmtpos;
14143 void *fmtdata;
14144 PyObject *fmtstr;
14145
14146 _PyUnicodeWriter writer;
14147};
14148
14149struct unicode_format_arg_t {
14150 Py_UCS4 ch;
14151 int flags;
14152 Py_ssize_t width;
14153 int prec;
14154 int sign;
14155};
14156
Guido van Rossumd57fd912000-03-10 22:53:23 +000014157static PyObject *
Victor Stinnera47082312012-10-04 02:19:54 +020014158unicode_format_getnextarg(struct unicode_formatter_t *ctx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014159{
Victor Stinnera47082312012-10-04 02:19:54 +020014160 Py_ssize_t argidx = ctx->argidx;
14161
14162 if (argidx < ctx->arglen) {
14163 ctx->argidx++;
14164 if (ctx->arglen < 0)
14165 return ctx->args;
Benjamin Peterson29060642009-01-31 22:14:21 +000014166 else
Victor Stinnera47082312012-10-04 02:19:54 +020014167 return PyTuple_GetItem(ctx->args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014168 }
14169 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000014170 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000014171 return NULL;
14172}
14173
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014174/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000014175
Victor Stinnera47082312012-10-04 02:19:54 +020014176/* Format a float into the writer if the writer is not NULL, or into *p_output
14177 otherwise.
14178
14179 Return 0 on success, raise an exception and return -1 on error. */
Victor Stinnerd3f08822012-05-29 12:57:52 +020014180static int
Victor Stinnera47082312012-10-04 02:19:54 +020014181formatfloat(PyObject *v, struct unicode_format_arg_t *arg,
14182 PyObject **p_output,
14183 _PyUnicodeWriter *writer)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014184{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014185 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014186 double x;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014187 Py_ssize_t len;
Victor Stinnera47082312012-10-04 02:19:54 +020014188 int prec;
14189 int dtoa_flags;
Tim Petersced69f82003-09-16 20:30:58 +000014190
Guido van Rossumd57fd912000-03-10 22:53:23 +000014191 x = PyFloat_AsDouble(v);
14192 if (x == -1.0 && PyErr_Occurred())
Victor Stinnerd3f08822012-05-29 12:57:52 +020014193 return -1;
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014194
Victor Stinnera47082312012-10-04 02:19:54 +020014195 prec = arg->prec;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014196 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000014197 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000014198
Victor Stinnera47082312012-10-04 02:19:54 +020014199 if (arg->flags & F_ALT)
14200 dtoa_flags = Py_DTSF_ALT;
14201 else
14202 dtoa_flags = 0;
14203 p = PyOS_double_to_string(x, arg->ch, prec, dtoa_flags, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014204 if (p == NULL)
Victor Stinnerd3f08822012-05-29 12:57:52 +020014205 return -1;
14206 len = strlen(p);
14207 if (writer) {
Victor Stinner4a587072013-11-19 12:54:53 +010014208 if (_PyUnicodeWriter_WriteASCIIString(writer, p, len) < 0) {
Christian Heimesf4f99392012-09-10 11:48:41 +020014209 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020014210 return -1;
Christian Heimesf4f99392012-09-10 11:48:41 +020014211 }
Victor Stinnerd3f08822012-05-29 12:57:52 +020014212 }
14213 else
14214 *p_output = _PyUnicode_FromASCII(p, len);
Eric Smith0923d1d2009-04-16 20:16:10 +000014215 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020014216 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014217}
14218
Victor Stinnerd0880d52012-04-27 23:40:13 +020014219/* formatlong() emulates the format codes d, u, o, x and X, and
14220 * the F_ALT flag, for Python's long (unbounded) ints. It's not used for
14221 * Python's regular ints.
14222 * Return value: a new PyUnicodeObject*, or NULL if error.
14223 * The output string is of the form
14224 * "-"? ("0x" | "0X")? digit+
14225 * "0x"/"0X" are present only for x and X conversions, with F_ALT
14226 * set in flags. The case of hex digits will be correct,
14227 * There will be at least prec digits, zero-filled on the left if
14228 * necessary to get that many.
14229 * val object to be converted
14230 * flags bitmask of format flags; only F_ALT is looked at
14231 * prec minimum number of digits; 0-fill on left if needed
14232 * type a character in [duoxX]; u acts the same as d
14233 *
14234 * CAUTION: o, x and X conversions on regular ints can never
14235 * produce a '-' sign, but can for Python's unbounded ints.
14236 */
Ethan Furmanb95b5612015-01-23 20:05:18 -080014237PyObject *
14238_PyUnicode_FormatLong(PyObject *val, int alt, int prec, int type)
Tim Peters38fd5b62000-09-21 05:43:11 +000014239{
Victor Stinnerd0880d52012-04-27 23:40:13 +020014240 PyObject *result = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014241 char *buf;
Victor Stinnerd0880d52012-04-27 23:40:13 +020014242 Py_ssize_t i;
14243 int sign; /* 1 if '-', else 0 */
14244 int len; /* number of characters */
14245 Py_ssize_t llen;
14246 int numdigits; /* len == numnondigits + numdigits */
14247 int numnondigits = 0;
Tim Peters38fd5b62000-09-21 05:43:11 +000014248
Victor Stinnerd0880d52012-04-27 23:40:13 +020014249 /* Avoid exceeding SSIZE_T_MAX */
14250 if (prec > INT_MAX-3) {
14251 PyErr_SetString(PyExc_OverflowError,
14252 "precision too large");
Benjamin Peterson14339b62009-01-31 16:36:08 +000014253 return NULL;
Victor Stinnerd0880d52012-04-27 23:40:13 +020014254 }
14255
14256 assert(PyLong_Check(val));
14257
14258 switch (type) {
Victor Stinner621ef3d2012-10-02 00:33:47 +020014259 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070014260 Py_UNREACHABLE();
Victor Stinnerd0880d52012-04-27 23:40:13 +020014261 case 'd':
Victor Stinner621ef3d2012-10-02 00:33:47 +020014262 case 'i':
Victor Stinnerd0880d52012-04-27 23:40:13 +020014263 case 'u':
Ethan Furmanfb137212013-08-31 10:18:55 -070014264 /* int and int subclasses should print numerically when a numeric */
14265 /* format code is used (see issue18780) */
14266 result = PyNumber_ToBase(val, 10);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014267 break;
14268 case 'o':
14269 numnondigits = 2;
14270 result = PyNumber_ToBase(val, 8);
14271 break;
14272 case 'x':
14273 case 'X':
14274 numnondigits = 2;
14275 result = PyNumber_ToBase(val, 16);
14276 break;
Victor Stinnerd0880d52012-04-27 23:40:13 +020014277 }
14278 if (!result)
14279 return NULL;
14280
14281 assert(unicode_modifiable(result));
14282 assert(PyUnicode_IS_READY(result));
14283 assert(PyUnicode_IS_ASCII(result));
14284
14285 /* To modify the string in-place, there can only be one reference. */
14286 if (Py_REFCNT(result) != 1) {
Christian Heimesd47802e2013-06-29 21:33:36 +020014287 Py_DECREF(result);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014288 PyErr_BadInternalCall();
14289 return NULL;
14290 }
14291 buf = PyUnicode_DATA(result);
14292 llen = PyUnicode_GET_LENGTH(result);
14293 if (llen > INT_MAX) {
Christian Heimesd47802e2013-06-29 21:33:36 +020014294 Py_DECREF(result);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014295 PyErr_SetString(PyExc_ValueError,
Ethan Furmanb95b5612015-01-23 20:05:18 -080014296 "string too large in _PyUnicode_FormatLong");
Victor Stinnerd0880d52012-04-27 23:40:13 +020014297 return NULL;
14298 }
14299 len = (int)llen;
14300 sign = buf[0] == '-';
14301 numnondigits += sign;
14302 numdigits = len - numnondigits;
14303 assert(numdigits > 0);
14304
14305 /* Get rid of base marker unless F_ALT */
Ethan Furmanb95b5612015-01-23 20:05:18 -080014306 if (((alt) == 0 &&
Victor Stinnerd0880d52012-04-27 23:40:13 +020014307 (type == 'o' || type == 'x' || type == 'X'))) {
14308 assert(buf[sign] == '0');
14309 assert(buf[sign+1] == 'x' || buf[sign+1] == 'X' ||
14310 buf[sign+1] == 'o');
14311 numnondigits -= 2;
14312 buf += 2;
14313 len -= 2;
14314 if (sign)
14315 buf[0] = '-';
14316 assert(len == numnondigits + numdigits);
14317 assert(numdigits > 0);
14318 }
14319
14320 /* Fill with leading zeroes to meet minimum width. */
14321 if (prec > numdigits) {
14322 PyObject *r1 = PyBytes_FromStringAndSize(NULL,
14323 numnondigits + prec);
14324 char *b1;
14325 if (!r1) {
14326 Py_DECREF(result);
14327 return NULL;
14328 }
14329 b1 = PyBytes_AS_STRING(r1);
14330 for (i = 0; i < numnondigits; ++i)
14331 *b1++ = *buf++;
14332 for (i = 0; i < prec - numdigits; i++)
14333 *b1++ = '0';
14334 for (i = 0; i < numdigits; i++)
14335 *b1++ = *buf++;
14336 *b1 = '\0';
14337 Py_DECREF(result);
14338 result = r1;
14339 buf = PyBytes_AS_STRING(result);
14340 len = numnondigits + prec;
14341 }
14342
14343 /* Fix up case for hex conversions. */
14344 if (type == 'X') {
14345 /* Need to convert all lower case letters to upper case.
14346 and need to convert 0x to 0X (and -0x to -0X). */
14347 for (i = 0; i < len; i++)
14348 if (buf[i] >= 'a' && buf[i] <= 'x')
14349 buf[i] -= 'a'-'A';
14350 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014351 if (!PyUnicode_Check(result)
14352 || buf != PyUnicode_DATA(result)) {
Victor Stinnerd0880d52012-04-27 23:40:13 +020014353 PyObject *unicode;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014354 unicode = _PyUnicode_FromASCII(buf, len);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014355 Py_DECREF(result);
14356 result = unicode;
14357 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014358 else if (len != PyUnicode_GET_LENGTH(result)) {
14359 if (PyUnicode_Resize(&result, len) < 0)
14360 Py_CLEAR(result);
14361 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000014362 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000014363}
14364
Ethan Furmandf3ed242014-01-05 06:50:30 -080014365/* Format an integer or a float as an integer.
Victor Stinner621ef3d2012-10-02 00:33:47 +020014366 * Return 1 if the number has been formatted into the writer,
Victor Stinnera47082312012-10-04 02:19:54 +020014367 * 0 if the number has been formatted into *p_output
Victor Stinner621ef3d2012-10-02 00:33:47 +020014368 * -1 and raise an exception on error */
14369static int
Victor Stinnera47082312012-10-04 02:19:54 +020014370mainformatlong(PyObject *v,
14371 struct unicode_format_arg_t *arg,
14372 PyObject **p_output,
14373 _PyUnicodeWriter *writer)
Victor Stinner621ef3d2012-10-02 00:33:47 +020014374{
14375 PyObject *iobj, *res;
Victor Stinnera47082312012-10-04 02:19:54 +020014376 char type = (char)arg->ch;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014377
14378 if (!PyNumber_Check(v))
14379 goto wrongtype;
14380
Ethan Furman9ab74802014-03-21 06:38:46 -070014381 /* make sure number is a type of integer for o, x, and X */
Victor Stinner621ef3d2012-10-02 00:33:47 +020014382 if (!PyLong_Check(v)) {
Ethan Furmandf3ed242014-01-05 06:50:30 -080014383 if (type == 'o' || type == 'x' || type == 'X') {
14384 iobj = PyNumber_Index(v);
14385 if (iobj == NULL) {
Ethan Furman9ab74802014-03-21 06:38:46 -070014386 if (PyErr_ExceptionMatches(PyExc_TypeError))
14387 goto wrongtype;
Ethan Furman38d872e2014-03-19 08:38:52 -070014388 return -1;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014389 }
14390 }
14391 else {
14392 iobj = PyNumber_Long(v);
14393 if (iobj == NULL ) {
14394 if (PyErr_ExceptionMatches(PyExc_TypeError))
14395 goto wrongtype;
14396 return -1;
14397 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014398 }
14399 assert(PyLong_Check(iobj));
14400 }
14401 else {
14402 iobj = v;
14403 Py_INCREF(iobj);
14404 }
14405
14406 if (PyLong_CheckExact(v)
Victor Stinnera47082312012-10-04 02:19:54 +020014407 && arg->width == -1 && arg->prec == -1
14408 && !(arg->flags & (F_SIGN | F_BLANK))
14409 && type != 'X')
Victor Stinner621ef3d2012-10-02 00:33:47 +020014410 {
14411 /* Fast path */
Victor Stinnera47082312012-10-04 02:19:54 +020014412 int alternate = arg->flags & F_ALT;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014413 int base;
14414
Victor Stinnera47082312012-10-04 02:19:54 +020014415 switch(type)
Victor Stinner621ef3d2012-10-02 00:33:47 +020014416 {
14417 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070014418 Py_UNREACHABLE();
Victor Stinner621ef3d2012-10-02 00:33:47 +020014419 case 'd':
14420 case 'i':
14421 case 'u':
14422 base = 10;
14423 break;
14424 case 'o':
14425 base = 8;
14426 break;
14427 case 'x':
14428 case 'X':
14429 base = 16;
14430 break;
14431 }
14432
Victor Stinnerc89d28f2012-10-02 12:54:07 +020014433 if (_PyLong_FormatWriter(writer, v, base, alternate) == -1) {
14434 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014435 return -1;
Victor Stinnerc89d28f2012-10-02 12:54:07 +020014436 }
14437 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014438 return 1;
14439 }
14440
Ethan Furmanb95b5612015-01-23 20:05:18 -080014441 res = _PyUnicode_FormatLong(iobj, arg->flags & F_ALT, arg->prec, type);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014442 Py_DECREF(iobj);
14443 if (res == NULL)
14444 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014445 *p_output = res;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014446 return 0;
14447
14448wrongtype:
Ethan Furman9ab74802014-03-21 06:38:46 -070014449 switch(type)
14450 {
14451 case 'o':
14452 case 'x':
14453 case 'X':
14454 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020014455 "%%%c format: an integer is required, "
14456 "not %.200s",
14457 type, Py_TYPE(v)->tp_name);
Ethan Furman9ab74802014-03-21 06:38:46 -070014458 break;
14459 default:
14460 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020014461 "%%%c format: a number is required, "
14462 "not %.200s",
14463 type, Py_TYPE(v)->tp_name);
Ethan Furman9ab74802014-03-21 06:38:46 -070014464 break;
14465 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014466 return -1;
14467}
14468
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014469static Py_UCS4
14470formatchar(PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014471{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000014472 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000014473 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014474 if (PyUnicode_GET_LENGTH(v) == 1) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014475 return PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000014476 }
Benjamin Peterson29060642009-01-31 22:14:21 +000014477 goto onError;
14478 }
14479 else {
Ethan Furmandf3ed242014-01-05 06:50:30 -080014480 PyObject *iobj;
Benjamin Peterson29060642009-01-31 22:14:21 +000014481 long x;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014482 /* make sure number is a type of integer */
14483 if (!PyLong_Check(v)) {
14484 iobj = PyNumber_Index(v);
14485 if (iobj == NULL) {
Ethan Furman38d872e2014-03-19 08:38:52 -070014486 goto onError;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014487 }
Xiang Zhangea1cf872016-12-22 15:30:47 +080014488 x = PyLong_AsLong(iobj);
Ethan Furmandf3ed242014-01-05 06:50:30 -080014489 Py_DECREF(iobj);
14490 }
Xiang Zhangea1cf872016-12-22 15:30:47 +080014491 else {
14492 x = PyLong_AsLong(v);
14493 }
Benjamin Peterson29060642009-01-31 22:14:21 +000014494 if (x == -1 && PyErr_Occurred())
14495 goto onError;
14496
Victor Stinner8faf8212011-12-08 22:14:11 +010014497 if (x < 0 || x > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014498 PyErr_SetString(PyExc_OverflowError,
14499 "%c arg not in range(0x110000)");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014500 return (Py_UCS4) -1;
Benjamin Peterson29060642009-01-31 22:14:21 +000014501 }
14502
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014503 return (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014504 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000014505
Benjamin Peterson29060642009-01-31 22:14:21 +000014506 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000014507 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000014508 "%c requires int or char");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014509 return (Py_UCS4) -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014510}
14511
Victor Stinnera47082312012-10-04 02:19:54 +020014512/* Parse options of an argument: flags, width, precision.
14513 Handle also "%(name)" syntax.
14514
14515 Return 0 if the argument has been formatted into arg->str.
14516 Return 1 if the argument has been written into ctx->writer,
14517 Raise an exception and return -1 on error. */
14518static int
14519unicode_format_arg_parse(struct unicode_formatter_t *ctx,
14520 struct unicode_format_arg_t *arg)
14521{
14522#define FORMAT_READ(ctx) \
14523 PyUnicode_READ((ctx)->fmtkind, (ctx)->fmtdata, (ctx)->fmtpos)
14524
14525 PyObject *v;
14526
Victor Stinnera47082312012-10-04 02:19:54 +020014527 if (arg->ch == '(') {
14528 /* Get argument value from a dictionary. Example: "%(name)s". */
14529 Py_ssize_t keystart;
14530 Py_ssize_t keylen;
14531 PyObject *key;
14532 int pcount = 1;
14533
14534 if (ctx->dict == NULL) {
14535 PyErr_SetString(PyExc_TypeError,
14536 "format requires a mapping");
14537 return -1;
14538 }
14539 ++ctx->fmtpos;
14540 --ctx->fmtcnt;
14541 keystart = ctx->fmtpos;
14542 /* Skip over balanced parentheses */
14543 while (pcount > 0 && --ctx->fmtcnt >= 0) {
14544 arg->ch = FORMAT_READ(ctx);
14545 if (arg->ch == ')')
14546 --pcount;
14547 else if (arg->ch == '(')
14548 ++pcount;
14549 ctx->fmtpos++;
14550 }
14551 keylen = ctx->fmtpos - keystart - 1;
14552 if (ctx->fmtcnt < 0 || pcount > 0) {
14553 PyErr_SetString(PyExc_ValueError,
14554 "incomplete format key");
14555 return -1;
14556 }
14557 key = PyUnicode_Substring(ctx->fmtstr,
14558 keystart, keystart + keylen);
14559 if (key == NULL)
14560 return -1;
14561 if (ctx->args_owned) {
Victor Stinnera47082312012-10-04 02:19:54 +020014562 ctx->args_owned = 0;
Serhiy Storchaka191321d2015-12-27 15:41:34 +020014563 Py_DECREF(ctx->args);
Victor Stinnera47082312012-10-04 02:19:54 +020014564 }
14565 ctx->args = PyObject_GetItem(ctx->dict, key);
14566 Py_DECREF(key);
14567 if (ctx->args == NULL)
14568 return -1;
14569 ctx->args_owned = 1;
14570 ctx->arglen = -1;
14571 ctx->argidx = -2;
14572 }
14573
14574 /* Parse flags. Example: "%+i" => flags=F_SIGN. */
Victor Stinnera47082312012-10-04 02:19:54 +020014575 while (--ctx->fmtcnt >= 0) {
14576 arg->ch = FORMAT_READ(ctx);
14577 ctx->fmtpos++;
14578 switch (arg->ch) {
14579 case '-': arg->flags |= F_LJUST; continue;
14580 case '+': arg->flags |= F_SIGN; continue;
14581 case ' ': arg->flags |= F_BLANK; continue;
14582 case '#': arg->flags |= F_ALT; continue;
14583 case '0': arg->flags |= F_ZERO; continue;
14584 }
14585 break;
14586 }
14587
14588 /* Parse width. Example: "%10s" => width=10 */
Victor Stinnera47082312012-10-04 02:19:54 +020014589 if (arg->ch == '*') {
14590 v = unicode_format_getnextarg(ctx);
14591 if (v == NULL)
14592 return -1;
14593 if (!PyLong_Check(v)) {
14594 PyErr_SetString(PyExc_TypeError,
14595 "* wants int");
14596 return -1;
14597 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020014598 arg->width = PyLong_AsSsize_t(v);
Victor Stinnera47082312012-10-04 02:19:54 +020014599 if (arg->width == -1 && PyErr_Occurred())
14600 return -1;
14601 if (arg->width < 0) {
14602 arg->flags |= F_LJUST;
14603 arg->width = -arg->width;
14604 }
14605 if (--ctx->fmtcnt >= 0) {
14606 arg->ch = FORMAT_READ(ctx);
14607 ctx->fmtpos++;
14608 }
14609 }
14610 else if (arg->ch >= '0' && arg->ch <= '9') {
14611 arg->width = arg->ch - '0';
14612 while (--ctx->fmtcnt >= 0) {
14613 arg->ch = FORMAT_READ(ctx);
14614 ctx->fmtpos++;
14615 if (arg->ch < '0' || arg->ch > '9')
14616 break;
14617 /* Since arg->ch is unsigned, the RHS would end up as unsigned,
14618 mixing signed and unsigned comparison. Since arg->ch is between
14619 '0' and '9', casting to int is safe. */
14620 if (arg->width > (PY_SSIZE_T_MAX - ((int)arg->ch - '0')) / 10) {
14621 PyErr_SetString(PyExc_ValueError,
14622 "width too big");
14623 return -1;
14624 }
14625 arg->width = arg->width*10 + (arg->ch - '0');
14626 }
14627 }
14628
14629 /* Parse precision. Example: "%.3f" => prec=3 */
Victor Stinnera47082312012-10-04 02:19:54 +020014630 if (arg->ch == '.') {
14631 arg->prec = 0;
14632 if (--ctx->fmtcnt >= 0) {
14633 arg->ch = FORMAT_READ(ctx);
14634 ctx->fmtpos++;
14635 }
14636 if (arg->ch == '*') {
14637 v = unicode_format_getnextarg(ctx);
14638 if (v == NULL)
14639 return -1;
14640 if (!PyLong_Check(v)) {
14641 PyErr_SetString(PyExc_TypeError,
14642 "* wants int");
14643 return -1;
14644 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020014645 arg->prec = _PyLong_AsInt(v);
Victor Stinnera47082312012-10-04 02:19:54 +020014646 if (arg->prec == -1 && PyErr_Occurred())
14647 return -1;
14648 if (arg->prec < 0)
14649 arg->prec = 0;
14650 if (--ctx->fmtcnt >= 0) {
14651 arg->ch = FORMAT_READ(ctx);
14652 ctx->fmtpos++;
14653 }
14654 }
14655 else if (arg->ch >= '0' && arg->ch <= '9') {
14656 arg->prec = arg->ch - '0';
14657 while (--ctx->fmtcnt >= 0) {
14658 arg->ch = FORMAT_READ(ctx);
14659 ctx->fmtpos++;
14660 if (arg->ch < '0' || arg->ch > '9')
14661 break;
14662 if (arg->prec > (INT_MAX - ((int)arg->ch - '0')) / 10) {
14663 PyErr_SetString(PyExc_ValueError,
Victor Stinner3921e902012-10-06 23:05:00 +020014664 "precision too big");
Victor Stinnera47082312012-10-04 02:19:54 +020014665 return -1;
14666 }
14667 arg->prec = arg->prec*10 + (arg->ch - '0');
14668 }
14669 }
14670 }
14671
14672 /* Ignore "h", "l" and "L" format prefix (ex: "%hi" or "%ls") */
14673 if (ctx->fmtcnt >= 0) {
14674 if (arg->ch == 'h' || arg->ch == 'l' || arg->ch == 'L') {
14675 if (--ctx->fmtcnt >= 0) {
14676 arg->ch = FORMAT_READ(ctx);
14677 ctx->fmtpos++;
14678 }
14679 }
14680 }
14681 if (ctx->fmtcnt < 0) {
14682 PyErr_SetString(PyExc_ValueError,
14683 "incomplete format");
14684 return -1;
14685 }
14686 return 0;
14687
14688#undef FORMAT_READ
14689}
14690
14691/* Format one argument. Supported conversion specifiers:
14692
14693 - "s", "r", "a": any type
Ethan Furmandf3ed242014-01-05 06:50:30 -080014694 - "i", "d", "u": int or float
14695 - "o", "x", "X": int
Victor Stinnera47082312012-10-04 02:19:54 +020014696 - "e", "E", "f", "F", "g", "G": float
14697 - "c": int or str (1 character)
14698
Victor Stinner8dbd4212012-12-04 09:30:24 +010014699 When possible, the output is written directly into the Unicode writer
14700 (ctx->writer). A string is created when padding is required.
14701
Victor Stinnera47082312012-10-04 02:19:54 +020014702 Return 0 if the argument has been formatted into *p_str,
14703 1 if the argument has been written into ctx->writer,
Victor Stinner8dbd4212012-12-04 09:30:24 +010014704 -1 on error. */
Victor Stinnera47082312012-10-04 02:19:54 +020014705static int
14706unicode_format_arg_format(struct unicode_formatter_t *ctx,
14707 struct unicode_format_arg_t *arg,
14708 PyObject **p_str)
14709{
14710 PyObject *v;
14711 _PyUnicodeWriter *writer = &ctx->writer;
14712
14713 if (ctx->fmtcnt == 0)
14714 ctx->writer.overallocate = 0;
14715
Victor Stinnera47082312012-10-04 02:19:54 +020014716 v = unicode_format_getnextarg(ctx);
14717 if (v == NULL)
14718 return -1;
14719
Victor Stinnera47082312012-10-04 02:19:54 +020014720
14721 switch (arg->ch) {
Victor Stinnera47082312012-10-04 02:19:54 +020014722 case 's':
14723 case 'r':
14724 case 'a':
14725 if (PyLong_CheckExact(v) && arg->width == -1 && arg->prec == -1) {
14726 /* Fast path */
14727 if (_PyLong_FormatWriter(writer, v, 10, arg->flags & F_ALT) == -1)
14728 return -1;
14729 return 1;
14730 }
14731
14732 if (PyUnicode_CheckExact(v) && arg->ch == 's') {
14733 *p_str = v;
14734 Py_INCREF(*p_str);
14735 }
14736 else {
14737 if (arg->ch == 's')
14738 *p_str = PyObject_Str(v);
14739 else if (arg->ch == 'r')
14740 *p_str = PyObject_Repr(v);
14741 else
14742 *p_str = PyObject_ASCII(v);
14743 }
14744 break;
14745
14746 case 'i':
14747 case 'd':
14748 case 'u':
14749 case 'o':
14750 case 'x':
14751 case 'X':
14752 {
14753 int ret = mainformatlong(v, arg, p_str, writer);
14754 if (ret != 0)
14755 return ret;
14756 arg->sign = 1;
14757 break;
14758 }
14759
14760 case 'e':
14761 case 'E':
14762 case 'f':
14763 case 'F':
14764 case 'g':
14765 case 'G':
14766 if (arg->width == -1 && arg->prec == -1
14767 && !(arg->flags & (F_SIGN | F_BLANK)))
14768 {
14769 /* Fast path */
14770 if (formatfloat(v, arg, NULL, writer) == -1)
14771 return -1;
14772 return 1;
14773 }
14774
14775 arg->sign = 1;
14776 if (formatfloat(v, arg, p_str, NULL) == -1)
14777 return -1;
14778 break;
14779
14780 case 'c':
14781 {
14782 Py_UCS4 ch = formatchar(v);
14783 if (ch == (Py_UCS4) -1)
14784 return -1;
14785 if (arg->width == -1 && arg->prec == -1) {
14786 /* Fast path */
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020014787 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0)
Victor Stinnera47082312012-10-04 02:19:54 +020014788 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014789 return 1;
14790 }
14791 *p_str = PyUnicode_FromOrdinal(ch);
14792 break;
14793 }
14794
14795 default:
14796 PyErr_Format(PyExc_ValueError,
14797 "unsupported format character '%c' (0x%x) "
Victor Stinnera33bce02014-07-04 22:47:46 +020014798 "at index %zd",
Victor Stinnera47082312012-10-04 02:19:54 +020014799 (31<=arg->ch && arg->ch<=126) ? (char)arg->ch : '?',
14800 (int)arg->ch,
14801 ctx->fmtpos - 1);
14802 return -1;
14803 }
14804 if (*p_str == NULL)
14805 return -1;
14806 assert (PyUnicode_Check(*p_str));
14807 return 0;
14808}
14809
14810static int
14811unicode_format_arg_output(struct unicode_formatter_t *ctx,
14812 struct unicode_format_arg_t *arg,
14813 PyObject *str)
14814{
14815 Py_ssize_t len;
14816 enum PyUnicode_Kind kind;
14817 void *pbuf;
14818 Py_ssize_t pindex;
14819 Py_UCS4 signchar;
14820 Py_ssize_t buflen;
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014821 Py_UCS4 maxchar;
Victor Stinnera47082312012-10-04 02:19:54 +020014822 Py_ssize_t sublen;
14823 _PyUnicodeWriter *writer = &ctx->writer;
14824 Py_UCS4 fill;
14825
14826 fill = ' ';
14827 if (arg->sign && arg->flags & F_ZERO)
14828 fill = '0';
14829
14830 if (PyUnicode_READY(str) == -1)
14831 return -1;
14832
14833 len = PyUnicode_GET_LENGTH(str);
14834 if ((arg->width == -1 || arg->width <= len)
14835 && (arg->prec == -1 || arg->prec >= len)
14836 && !(arg->flags & (F_SIGN | F_BLANK)))
14837 {
14838 /* Fast path */
14839 if (_PyUnicodeWriter_WriteStr(writer, str) == -1)
14840 return -1;
14841 return 0;
14842 }
14843
14844 /* Truncate the string for "s", "r" and "a" formats
14845 if the precision is set */
14846 if (arg->ch == 's' || arg->ch == 'r' || arg->ch == 'a') {
14847 if (arg->prec >= 0 && len > arg->prec)
14848 len = arg->prec;
14849 }
14850
14851 /* Adjust sign and width */
14852 kind = PyUnicode_KIND(str);
14853 pbuf = PyUnicode_DATA(str);
14854 pindex = 0;
14855 signchar = '\0';
14856 if (arg->sign) {
14857 Py_UCS4 ch = PyUnicode_READ(kind, pbuf, pindex);
14858 if (ch == '-' || ch == '+') {
14859 signchar = ch;
14860 len--;
14861 pindex++;
14862 }
14863 else if (arg->flags & F_SIGN)
14864 signchar = '+';
14865 else if (arg->flags & F_BLANK)
14866 signchar = ' ';
14867 else
14868 arg->sign = 0;
14869 }
14870 if (arg->width < len)
14871 arg->width = len;
14872
14873 /* Prepare the writer */
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014874 maxchar = writer->maxchar;
Victor Stinnera47082312012-10-04 02:19:54 +020014875 if (!(arg->flags & F_LJUST)) {
14876 if (arg->sign) {
14877 if ((arg->width-1) > len)
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014878 maxchar = Py_MAX(maxchar, fill);
Victor Stinnera47082312012-10-04 02:19:54 +020014879 }
14880 else {
14881 if (arg->width > len)
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014882 maxchar = Py_MAX(maxchar, fill);
Victor Stinnera47082312012-10-04 02:19:54 +020014883 }
14884 }
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014885 if (PyUnicode_MAX_CHAR_VALUE(str) > maxchar) {
14886 Py_UCS4 strmaxchar = _PyUnicode_FindMaxChar(str, 0, pindex+len);
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014887 maxchar = Py_MAX(maxchar, strmaxchar);
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014888 }
14889
Victor Stinnera47082312012-10-04 02:19:54 +020014890 buflen = arg->width;
14891 if (arg->sign && len == arg->width)
14892 buflen++;
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014893 if (_PyUnicodeWriter_Prepare(writer, buflen, maxchar) == -1)
Victor Stinnera47082312012-10-04 02:19:54 +020014894 return -1;
14895
14896 /* Write the sign if needed */
14897 if (arg->sign) {
14898 if (fill != ' ') {
14899 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
14900 writer->pos += 1;
14901 }
14902 if (arg->width > len)
14903 arg->width--;
14904 }
14905
14906 /* Write the numeric prefix for "x", "X" and "o" formats
14907 if the alternate form is used.
14908 For example, write "0x" for the "%#x" format. */
14909 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
14910 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
14911 assert(PyUnicode_READ(kind, pbuf, pindex + 1) == arg->ch);
14912 if (fill != ' ') {
14913 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
14914 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
14915 writer->pos += 2;
14916 pindex += 2;
14917 }
14918 arg->width -= 2;
14919 if (arg->width < 0)
14920 arg->width = 0;
14921 len -= 2;
14922 }
14923
14924 /* Pad left with the fill character if needed */
14925 if (arg->width > len && !(arg->flags & F_LJUST)) {
14926 sublen = arg->width - len;
Victor Stinner59423e32018-11-26 13:40:01 +010014927 unicode_fill(writer->kind, writer->data, fill, writer->pos, sublen);
Victor Stinnera47082312012-10-04 02:19:54 +020014928 writer->pos += sublen;
14929 arg->width = len;
14930 }
14931
14932 /* If padding with spaces: write sign if needed and/or numeric prefix if
14933 the alternate form is used */
14934 if (fill == ' ') {
14935 if (arg->sign) {
14936 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
14937 writer->pos += 1;
14938 }
14939 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
14940 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
14941 assert(PyUnicode_READ(kind, pbuf, pindex+1) == arg->ch);
14942 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
14943 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
14944 writer->pos += 2;
14945 pindex += 2;
14946 }
14947 }
14948
14949 /* Write characters */
14950 if (len) {
14951 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
14952 str, pindex, len);
14953 writer->pos += len;
14954 }
14955
14956 /* Pad right with the fill character if needed */
14957 if (arg->width > len) {
14958 sublen = arg->width - len;
Victor Stinner59423e32018-11-26 13:40:01 +010014959 unicode_fill(writer->kind, writer->data, ' ', writer->pos, sublen);
Victor Stinnera47082312012-10-04 02:19:54 +020014960 writer->pos += sublen;
14961 }
14962 return 0;
14963}
14964
14965/* Helper of PyUnicode_Format(): format one arg.
14966 Return 0 on success, raise an exception and return -1 on error. */
14967static int
14968unicode_format_arg(struct unicode_formatter_t *ctx)
14969{
14970 struct unicode_format_arg_t arg;
14971 PyObject *str;
14972 int ret;
14973
Victor Stinner8dbd4212012-12-04 09:30:24 +010014974 arg.ch = PyUnicode_READ(ctx->fmtkind, ctx->fmtdata, ctx->fmtpos);
Serhiy Storchaka9f8ad3f2017-03-08 05:51:19 +020014975 if (arg.ch == '%') {
14976 ctx->fmtpos++;
14977 ctx->fmtcnt--;
14978 if (_PyUnicodeWriter_WriteCharInline(&ctx->writer, '%') < 0)
14979 return -1;
14980 return 0;
14981 }
Victor Stinner8dbd4212012-12-04 09:30:24 +010014982 arg.flags = 0;
14983 arg.width = -1;
14984 arg.prec = -1;
14985 arg.sign = 0;
14986 str = NULL;
14987
Victor Stinnera47082312012-10-04 02:19:54 +020014988 ret = unicode_format_arg_parse(ctx, &arg);
14989 if (ret == -1)
14990 return -1;
14991
14992 ret = unicode_format_arg_format(ctx, &arg, &str);
14993 if (ret == -1)
14994 return -1;
14995
14996 if (ret != 1) {
14997 ret = unicode_format_arg_output(ctx, &arg, str);
14998 Py_DECREF(str);
14999 if (ret == -1)
15000 return -1;
15001 }
15002
Serhiy Storchaka9f8ad3f2017-03-08 05:51:19 +020015003 if (ctx->dict && (ctx->argidx < ctx->arglen)) {
Victor Stinnera47082312012-10-04 02:19:54 +020015004 PyErr_SetString(PyExc_TypeError,
15005 "not all arguments converted during string formatting");
15006 return -1;
15007 }
15008 return 0;
15009}
15010
Alexander Belopolsky40018472011-02-26 01:02:56 +000015011PyObject *
15012PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000015013{
Victor Stinnera47082312012-10-04 02:19:54 +020015014 struct unicode_formatter_t ctx;
Tim Petersced69f82003-09-16 20:30:58 +000015015
Guido van Rossumd57fd912000-03-10 22:53:23 +000015016 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000015017 PyErr_BadInternalCall();
15018 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015019 }
Victor Stinnera47082312012-10-04 02:19:54 +020015020
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030015021 if (ensure_unicode(format) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000015022 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030015023
15024 ctx.fmtstr = format;
Victor Stinnera47082312012-10-04 02:19:54 +020015025 ctx.fmtdata = PyUnicode_DATA(ctx.fmtstr);
15026 ctx.fmtkind = PyUnicode_KIND(ctx.fmtstr);
15027 ctx.fmtcnt = PyUnicode_GET_LENGTH(ctx.fmtstr);
15028 ctx.fmtpos = 0;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020015029
Victor Stinner8f674cc2013-04-17 23:02:17 +020015030 _PyUnicodeWriter_Init(&ctx.writer);
15031 ctx.writer.min_length = ctx.fmtcnt + 100;
15032 ctx.writer.overallocate = 1;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020015033
Guido van Rossumd57fd912000-03-10 22:53:23 +000015034 if (PyTuple_Check(args)) {
Victor Stinnera47082312012-10-04 02:19:54 +020015035 ctx.arglen = PyTuple_Size(args);
15036 ctx.argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015037 }
15038 else {
Victor Stinnera47082312012-10-04 02:19:54 +020015039 ctx.arglen = -1;
15040 ctx.argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015041 }
Victor Stinnera47082312012-10-04 02:19:54 +020015042 ctx.args_owned = 0;
Benjamin Peterson28a6cfa2012-08-28 17:55:35 -040015043 if (PyMapping_Check(args) && !PyTuple_Check(args) && !PyUnicode_Check(args))
Victor Stinnera47082312012-10-04 02:19:54 +020015044 ctx.dict = args;
15045 else
15046 ctx.dict = NULL;
15047 ctx.args = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015048
Victor Stinnera47082312012-10-04 02:19:54 +020015049 while (--ctx.fmtcnt >= 0) {
15050 if (PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
Victor Stinnercfc4c132013-04-03 01:48:39 +020015051 Py_ssize_t nonfmtpos;
Victor Stinnera47082312012-10-04 02:19:54 +020015052
15053 nonfmtpos = ctx.fmtpos++;
15054 while (ctx.fmtcnt >= 0 &&
15055 PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
15056 ctx.fmtpos++;
15057 ctx.fmtcnt--;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015058 }
Victor Stinnera47082312012-10-04 02:19:54 +020015059 if (ctx.fmtcnt < 0) {
15060 ctx.fmtpos--;
15061 ctx.writer.overallocate = 0;
Victor Stinnera0494432012-10-03 23:03:46 +020015062 }
Victor Stinneree4544c2012-05-09 22:24:08 +020015063
Victor Stinnercfc4c132013-04-03 01:48:39 +020015064 if (_PyUnicodeWriter_WriteSubstring(&ctx.writer, ctx.fmtstr,
15065 nonfmtpos, ctx.fmtpos) < 0)
15066 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015067 }
15068 else {
Victor Stinnera47082312012-10-04 02:19:54 +020015069 ctx.fmtpos++;
15070 if (unicode_format_arg(&ctx) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000015071 goto onError;
Victor Stinnera47082312012-10-04 02:19:54 +020015072 }
15073 }
Victor Stinneraff3cc62012-04-30 05:19:21 +020015074
Victor Stinnera47082312012-10-04 02:19:54 +020015075 if (ctx.argidx < ctx.arglen && !ctx.dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000015076 PyErr_SetString(PyExc_TypeError,
15077 "not all arguments converted during string formatting");
15078 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015079 }
15080
Victor Stinnera47082312012-10-04 02:19:54 +020015081 if (ctx.args_owned) {
15082 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000015083 }
Victor Stinnera47082312012-10-04 02:19:54 +020015084 return _PyUnicodeWriter_Finish(&ctx.writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +000015085
Benjamin Peterson29060642009-01-31 22:14:21 +000015086 onError:
Victor Stinnera47082312012-10-04 02:19:54 +020015087 _PyUnicodeWriter_Dealloc(&ctx.writer);
15088 if (ctx.args_owned) {
15089 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000015090 }
15091 return NULL;
15092}
15093
Jeremy Hylton938ace62002-07-17 16:30:39 +000015094static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000015095unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
15096
Tim Peters6d6c1a32001-08-02 04:15:00 +000015097static PyObject *
15098unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
15099{
Benjamin Peterson29060642009-01-31 22:14:21 +000015100 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015101 static char *kwlist[] = {"object", "encoding", "errors", 0};
15102 char *encoding = NULL;
15103 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000015104
Benjamin Peterson14339b62009-01-31 16:36:08 +000015105 if (type != &PyUnicode_Type)
15106 return unicode_subtype_new(type, args, kwds);
15107 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000015108 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000015109 return NULL;
15110 if (x == NULL)
Serhiy Storchaka678db842013-01-26 12:16:36 +020015111 _Py_RETURN_UNICODE_EMPTY();
Benjamin Peterson14339b62009-01-31 16:36:08 +000015112 if (encoding == NULL && errors == NULL)
15113 return PyObject_Str(x);
15114 else
Benjamin Peterson29060642009-01-31 22:14:21 +000015115 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000015116}
15117
Guido van Rossume023fe02001-08-30 03:12:59 +000015118static PyObject *
15119unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
15120{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015121 PyObject *unicode, *self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015122 Py_ssize_t length, char_size;
15123 int share_wstr, share_utf8;
15124 unsigned int kind;
15125 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000015126
Benjamin Peterson14339b62009-01-31 16:36:08 +000015127 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015128
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015129 unicode = unicode_new(&PyUnicode_Type, args, kwds);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015130 if (unicode == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000015131 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020015132 assert(_PyUnicode_CHECK(unicode));
Benjamin Petersonbac79492012-01-14 13:34:47 -050015133 if (PyUnicode_READY(unicode) == -1) {
Benjamin Peterson22a29702012-01-02 09:00:30 -060015134 Py_DECREF(unicode);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015135 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060015136 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015137
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015138 self = type->tp_alloc(type, 0);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015139 if (self == NULL) {
15140 Py_DECREF(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015141 return NULL;
15142 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015143 kind = PyUnicode_KIND(unicode);
15144 length = PyUnicode_GET_LENGTH(unicode);
15145
15146 _PyUnicode_LENGTH(self) = length;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020015147#ifdef Py_DEBUG
15148 _PyUnicode_HASH(self) = -1;
15149#else
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015150 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020015151#endif
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015152 _PyUnicode_STATE(self).interned = 0;
15153 _PyUnicode_STATE(self).kind = kind;
15154 _PyUnicode_STATE(self).compact = 0;
Victor Stinner3cf46372011-10-03 14:42:15 +020015155 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015156 _PyUnicode_STATE(self).ready = 1;
15157 _PyUnicode_WSTR(self) = NULL;
15158 _PyUnicode_UTF8_LENGTH(self) = 0;
15159 _PyUnicode_UTF8(self) = NULL;
15160 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020015161 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015162
15163 share_utf8 = 0;
15164 share_wstr = 0;
15165 if (kind == PyUnicode_1BYTE_KIND) {
15166 char_size = 1;
15167 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
15168 share_utf8 = 1;
15169 }
15170 else if (kind == PyUnicode_2BYTE_KIND) {
15171 char_size = 2;
15172 if (sizeof(wchar_t) == 2)
15173 share_wstr = 1;
15174 }
15175 else {
15176 assert(kind == PyUnicode_4BYTE_KIND);
15177 char_size = 4;
15178 if (sizeof(wchar_t) == 4)
15179 share_wstr = 1;
15180 }
15181
15182 /* Ensure we won't overflow the length. */
15183 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
15184 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015185 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015186 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015187 data = PyObject_MALLOC((length + 1) * char_size);
15188 if (data == NULL) {
15189 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015190 goto onError;
15191 }
15192
Victor Stinnerc3c74152011-10-02 20:39:55 +020015193 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015194 if (share_utf8) {
15195 _PyUnicode_UTF8_LENGTH(self) = length;
15196 _PyUnicode_UTF8(self) = data;
15197 }
15198 if (share_wstr) {
15199 _PyUnicode_WSTR_LENGTH(self) = length;
15200 _PyUnicode_WSTR(self) = (wchar_t *)data;
15201 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015202
Christian Heimesf051e432016-09-13 20:22:02 +020015203 memcpy(data, PyUnicode_DATA(unicode),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020015204 kind * (length + 1));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020015205 assert(_PyUnicode_CheckConsistency(self, 1));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020015206#ifdef Py_DEBUG
15207 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
15208#endif
Victor Stinnerdd18d3a2011-10-22 11:08:10 +020015209 Py_DECREF(unicode);
Victor Stinner7931d9a2011-11-04 00:22:48 +010015210 return self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015211
15212onError:
15213 Py_DECREF(unicode);
15214 Py_DECREF(self);
15215 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000015216}
15217
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000015218PyDoc_STRVAR(unicode_doc,
Chris Jerdonek83fe2e12012-10-07 14:48:36 -070015219"str(object='') -> str\n\
15220str(bytes_or_buffer[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +000015221\n\
Nick Coghlan573b1fd2012-08-16 14:13:07 +100015222Create a new string object from the given object. If encoding or\n\
15223errors is specified, then the object must expose a data buffer\n\
15224that will be decoded using the given encoding and error handler.\n\
15225Otherwise, returns the result of object.__str__() (if defined)\n\
15226or repr(object).\n\
15227encoding defaults to sys.getdefaultencoding().\n\
15228errors defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000015229
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015230static PyObject *unicode_iter(PyObject *seq);
15231
Guido van Rossumd57fd912000-03-10 22:53:23 +000015232PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +000015233 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Bupfc93bd42018-06-19 03:59:55 -050015234 "str", /* tp_name */
15235 sizeof(PyUnicodeObject), /* tp_basicsize */
15236 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +000015237 /* Slots */
Bupfc93bd42018-06-19 03:59:55 -050015238 (destructor)unicode_dealloc, /* tp_dealloc */
Jeroen Demeyer530f5062019-05-31 04:13:39 +020015239 0, /* tp_vectorcall_offset */
Bupfc93bd42018-06-19 03:59:55 -050015240 0, /* tp_getattr */
15241 0, /* tp_setattr */
Jeroen Demeyer530f5062019-05-31 04:13:39 +020015242 0, /* tp_as_async */
Bupfc93bd42018-06-19 03:59:55 -050015243 unicode_repr, /* tp_repr */
15244 &unicode_as_number, /* tp_as_number */
15245 &unicode_as_sequence, /* tp_as_sequence */
15246 &unicode_as_mapping, /* tp_as_mapping */
15247 (hashfunc) unicode_hash, /* tp_hash*/
15248 0, /* tp_call*/
15249 (reprfunc) unicode_str, /* tp_str */
15250 PyObject_GenericGetAttr, /* tp_getattro */
15251 0, /* tp_setattro */
15252 0, /* tp_as_buffer */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015253 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Bupfc93bd42018-06-19 03:59:55 -050015254 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
15255 unicode_doc, /* tp_doc */
15256 0, /* tp_traverse */
15257 0, /* tp_clear */
15258 PyUnicode_RichCompare, /* tp_richcompare */
15259 0, /* tp_weaklistoffset */
15260 unicode_iter, /* tp_iter */
15261 0, /* tp_iternext */
15262 unicode_methods, /* tp_methods */
15263 0, /* tp_members */
15264 0, /* tp_getset */
15265 &PyBaseObject_Type, /* tp_base */
15266 0, /* tp_dict */
15267 0, /* tp_descr_get */
15268 0, /* tp_descr_set */
15269 0, /* tp_dictoffset */
15270 0, /* tp_init */
15271 0, /* tp_alloc */
15272 unicode_new, /* tp_new */
15273 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +000015274};
15275
15276/* Initialize the Unicode implementation */
15277
Victor Stinner331a6a52019-05-27 16:39:22 +020015278PyStatus
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015279_PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000015280{
Thomas Wouters477c8d52006-05-27 19:21:47 +000015281 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015282 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000015283 0x000A, /* LINE FEED */
15284 0x000D, /* CARRIAGE RETURN */
15285 0x001C, /* FILE SEPARATOR */
15286 0x001D, /* GROUP SEPARATOR */
15287 0x001E, /* RECORD SEPARATOR */
15288 0x0085, /* NEXT LINE */
15289 0x2028, /* LINE SEPARATOR */
15290 0x2029, /* PARAGRAPH SEPARATOR */
15291 };
15292
Fred Drakee4315f52000-05-09 19:53:39 +000015293 /* Init the implementation */
Serhiy Storchaka678db842013-01-26 12:16:36 +020015294 _Py_INCREF_UNICODE_EMPTY();
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015295 if (!unicode_empty) {
Victor Stinner331a6a52019-05-27 16:39:22 +020015296 return _PyStatus_ERR("Can't create empty string");
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015297 }
Serhiy Storchaka678db842013-01-26 12:16:36 +020015298 Py_DECREF(unicode_empty);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000015299
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015300 if (PyType_Ready(&PyUnicode_Type) < 0) {
Victor Stinner331a6a52019-05-27 16:39:22 +020015301 return _PyStatus_ERR("Can't initialize unicode type");
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015302 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000015303
15304 /* initialize the linebreak bloom filter */
15305 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015306 PyUnicode_2BYTE_KIND, linebreak,
Victor Stinner63941882011-09-29 00:42:28 +020015307 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000015308
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015309 if (PyType_Ready(&EncodingMapType) < 0) {
Victor Stinner331a6a52019-05-27 16:39:22 +020015310 return _PyStatus_ERR("Can't initialize encoding map type");
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015311 }
15312 if (PyType_Ready(&PyFieldNameIter_Type) < 0) {
Victor Stinner331a6a52019-05-27 16:39:22 +020015313 return _PyStatus_ERR("Can't initialize field name iterator type");
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015314 }
15315 if (PyType_Ready(&PyFormatterIter_Type) < 0) {
Victor Stinner331a6a52019-05-27 16:39:22 +020015316 return _PyStatus_ERR("Can't initialize formatter iter type");
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015317 }
Victor Stinner331a6a52019-05-27 16:39:22 +020015318 return _PyStatus_OK();
Guido van Rossumd57fd912000-03-10 22:53:23 +000015319}
15320
/* Finalize the Unicode implementation */

/* This function does no work and always returns 0. */
int
PyUnicode_ClearFreeList(void)
{
    const int freed = 0;
    return freed;
}
15328
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000015329
Walter Dörwald16807132007-05-25 13:52:07 +000015330void
15331PyUnicode_InternInPlace(PyObject **p)
15332{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020015333 PyObject *s = *p;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015334 PyObject *t;
Victor Stinner4fae54c2011-10-03 02:01:52 +020015335#ifdef Py_DEBUG
15336 assert(s != NULL);
15337 assert(_PyUnicode_CHECK(s));
15338#else
Benjamin Peterson14339b62009-01-31 16:36:08 +000015339 if (s == NULL || !PyUnicode_Check(s))
Victor Stinner4fae54c2011-10-03 02:01:52 +020015340 return;
15341#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +000015342 /* If it's a subclass, we don't really know what putting
15343 it in the interned dict might do. */
15344 if (!PyUnicode_CheckExact(s))
15345 return;
15346 if (PyUnicode_CHECK_INTERNED(s))
15347 return;
15348 if (interned == NULL) {
15349 interned = PyDict_New();
15350 if (interned == NULL) {
15351 PyErr_Clear(); /* Don't leave an exception */
15352 return;
15353 }
15354 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000015355 Py_ALLOW_RECURSION
Berker Peksagced8d4c2016-07-25 04:40:39 +030015356 t = PyDict_SetDefault(interned, s, s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015357 Py_END_ALLOW_RECURSION
Berker Peksagced8d4c2016-07-25 04:40:39 +030015358 if (t == NULL) {
15359 PyErr_Clear();
15360 return;
15361 }
15362 if (t != s) {
Victor Stinnerf0335102013-04-14 19:13:03 +020015363 Py_INCREF(t);
Serhiy Storchaka57a01d32016-04-10 18:05:40 +030015364 Py_SETREF(*p, t);
Victor Stinnerf0335102013-04-14 19:13:03 +020015365 return;
15366 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000015367 /* The two references in interned are not counted by refcnt.
15368 The deallocator will take care of this */
15369 Py_REFCNT(s) -= 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015370 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000015371}
15372
15373void
15374PyUnicode_InternImmortal(PyObject **p)
15375{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015376 PyUnicode_InternInPlace(p);
15377 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Victor Stinneraf9e4b82011-10-23 20:07:00 +020015378 _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015379 Py_INCREF(*p);
15380 }
Walter Dörwald16807132007-05-25 13:52:07 +000015381}
15382
15383PyObject *
15384PyUnicode_InternFromString(const char *cp)
15385{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015386 PyObject *s = PyUnicode_FromString(cp);
15387 if (s == NULL)
15388 return NULL;
15389 PyUnicode_InternInPlace(&s);
15390 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000015391}
15392
Victor Stinnerfecc4f22019-03-19 14:20:29 +010015393
#if defined(WITH_VALGRIND) || defined(__INSURE__)
/* Undo interning for every string in the interned dict, then clear and
   release the dict.  Only compiled for leak-detector builds. */
static void
unicode_release_interned(void)
{
    Py_ssize_t mortal_size = 0;
    Py_ssize_t immortal_size = 0;
    Py_ssize_t count, idx;
    PyObject *key_list;

    if (interned == NULL || !PyDict_Check(interned)) {
        return;
    }
    key_list = PyDict_Keys(interned);
    if (key_list == NULL || !PyList_Check(key_list)) {
        PyErr_Clear();
        return;
    }

    /* Since unicode_release_interned() is intended to help a leak
       detector, interned unicode strings are not forcibly deallocated;
       rather, we give them their stolen references back, and then clear
       and DECREF the interned dict. */

    count = PyList_GET_SIZE(key_list);
#ifdef INTERNED_STATS
    fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
            count);
#endif
    for (idx = 0; idx < count; idx++) {
        PyObject *str = PyList_GET_ITEM(key_list, idx);
        if (PyUnicode_READY(str) == -1) {
            Py_UNREACHABLE();
        }
        switch (PyUnicode_CHECK_INTERNED(str)) {
        case SSTATE_INTERNED_MORTAL:
            /* Both references held by the dict were uncounted. */
            Py_REFCNT(str) += 2;
            mortal_size += PyUnicode_GET_LENGTH(str);
            break;
        case SSTATE_INTERNED_IMMORTAL:
            Py_REFCNT(str) += 1;
            immortal_size += PyUnicode_GET_LENGTH(str);
            break;
        case SSTATE_NOT_INTERNED:
            /* XXX Shouldn't happen */
            break;
        default:
            Py_FatalError("Inconsistent interned string state.");
        }
        _PyUnicode_STATE(str).interned = SSTATE_NOT_INTERNED;
    }
#ifdef INTERNED_STATS
    fprintf(stderr, "total size of all interned strings: "
            "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
            "mortal/immortal\n", mortal_size, immortal_size);
#endif
    Py_DECREF(key_list);
    PyDict_Clear(interned);
    Py_CLEAR(interned);
}
#endif
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015453
15454
15455/********************* Unicode Iterator **************************/
15456
15457typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015458 PyObject_HEAD
15459 Py_ssize_t it_index;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015460 PyObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015461} unicodeiterobject;
15462
15463static void
15464unicodeiter_dealloc(unicodeiterobject *it)
15465{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015466 _PyObject_GC_UNTRACK(it);
15467 Py_XDECREF(it->it_seq);
15468 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015469}
15470
15471static int
15472unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
15473{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015474 Py_VISIT(it->it_seq);
15475 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015476}
15477
15478static PyObject *
15479unicodeiter_next(unicodeiterobject *it)
15480{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015481 PyObject *seq, *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015482
Benjamin Peterson14339b62009-01-31 16:36:08 +000015483 assert(it != NULL);
15484 seq = it->it_seq;
15485 if (seq == NULL)
15486 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020015487 assert(_PyUnicode_CHECK(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015488
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015489 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
15490 int kind = PyUnicode_KIND(seq);
15491 void *data = PyUnicode_DATA(seq);
15492 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
15493 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015494 if (item != NULL)
15495 ++it->it_index;
15496 return item;
15497 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015498
Benjamin Peterson14339b62009-01-31 16:36:08 +000015499 it->it_seq = NULL;
Serhiy Storchakafbb1c5e2016-03-30 20:40:02 +030015500 Py_DECREF(seq);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015501 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015502}
15503
15504static PyObject *
Siddhesh Poyarekar55edd0c2018-04-30 00:29:33 +053015505unicodeiter_len(unicodeiterobject *it, PyObject *Py_UNUSED(ignored))
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015506{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015507 Py_ssize_t len = 0;
15508 if (it->it_seq)
Victor Stinnerc4f281e2011-10-11 22:11:42 +020015509 len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015510 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015511}
15512
15513PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
15514
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015515static PyObject *
Siddhesh Poyarekar55edd0c2018-04-30 00:29:33 +053015516unicodeiter_reduce(unicodeiterobject *it, PyObject *Py_UNUSED(ignored))
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015517{
Serhiy Storchakabb86bf42018-12-11 08:28:18 +020015518 _Py_IDENTIFIER(iter);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015519 if (it->it_seq != NULL) {
Serhiy Storchakabb86bf42018-12-11 08:28:18 +020015520 return Py_BuildValue("N(O)n", _PyEval_GetBuiltinId(&PyId_iter),
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015521 it->it_seq, it->it_index);
15522 } else {
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +020015523 PyObject *u = (PyObject *)_PyUnicode_New(0);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015524 if (u == NULL)
15525 return NULL;
Serhiy Storchakabb86bf42018-12-11 08:28:18 +020015526 return Py_BuildValue("N(N)", _PyEval_GetBuiltinId(&PyId_iter), u);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015527 }
15528}
15529
15530PyDoc_STRVAR(reduce_doc, "Return state information for pickling.");
15531
15532static PyObject *
15533unicodeiter_setstate(unicodeiterobject *it, PyObject *state)
15534{
15535 Py_ssize_t index = PyLong_AsSsize_t(state);
15536 if (index == -1 && PyErr_Occurred())
15537 return NULL;
Kristján Valur Jónsson25dded02014-03-05 13:47:57 +000015538 if (it->it_seq != NULL) {
15539 if (index < 0)
15540 index = 0;
15541 else if (index > PyUnicode_GET_LENGTH(it->it_seq))
15542 index = PyUnicode_GET_LENGTH(it->it_seq); /* iterator truncated */
15543 it->it_index = index;
15544 }
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015545 Py_RETURN_NONE;
15546}
15547
15548PyDoc_STRVAR(setstate_doc, "Set state information for unpickling.");
15549
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015550static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015551 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000015552 length_hint_doc},
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015553 {"__reduce__", (PyCFunction)unicodeiter_reduce, METH_NOARGS,
15554 reduce_doc},
15555 {"__setstate__", (PyCFunction)unicodeiter_setstate, METH_O,
15556 setstate_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000015557 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015558};
15559
15560PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015561 PyVarObject_HEAD_INIT(&PyType_Type, 0)
15562 "str_iterator", /* tp_name */
15563 sizeof(unicodeiterobject), /* tp_basicsize */
15564 0, /* tp_itemsize */
15565 /* methods */
15566 (destructor)unicodeiter_dealloc, /* tp_dealloc */
Jeroen Demeyer530f5062019-05-31 04:13:39 +020015567 0, /* tp_vectorcall_offset */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015568 0, /* tp_getattr */
15569 0, /* tp_setattr */
Jeroen Demeyer530f5062019-05-31 04:13:39 +020015570 0, /* tp_as_async */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015571 0, /* tp_repr */
15572 0, /* tp_as_number */
15573 0, /* tp_as_sequence */
15574 0, /* tp_as_mapping */
15575 0, /* tp_hash */
15576 0, /* tp_call */
15577 0, /* tp_str */
15578 PyObject_GenericGetAttr, /* tp_getattro */
15579 0, /* tp_setattro */
15580 0, /* tp_as_buffer */
15581 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
15582 0, /* tp_doc */
15583 (traverseproc)unicodeiter_traverse, /* tp_traverse */
15584 0, /* tp_clear */
15585 0, /* tp_richcompare */
15586 0, /* tp_weaklistoffset */
15587 PyObject_SelfIter, /* tp_iter */
15588 (iternextfunc)unicodeiter_next, /* tp_iternext */
15589 unicodeiter_methods, /* tp_methods */
15590 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015591};
15592
15593static PyObject *
15594unicode_iter(PyObject *seq)
15595{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015596 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015597
Benjamin Peterson14339b62009-01-31 16:36:08 +000015598 if (!PyUnicode_Check(seq)) {
15599 PyErr_BadInternalCall();
15600 return NULL;
15601 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015602 if (PyUnicode_READY(seq) == -1)
15603 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015604 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
15605 if (it == NULL)
15606 return NULL;
15607 it->it_index = 0;
15608 Py_INCREF(seq);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015609 it->it_seq = seq;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015610 _PyObject_GC_TRACK(it);
15611 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015612}
15613
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015614
/* Return the number of characters in the null-terminated string `u`, not
   counting the terminator. */
size_t
Py_UNICODE_strlen(const Py_UNICODE *u)
{
    size_t length = wcslen(u);
    return length;
}
15620
/* Copy the null-terminated string `s2`, terminator included, into `s1`.
   Return `s1`. */
Py_UNICODE*
Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
{
    Py_UNICODE *dst = s1;
    for (;;) {
        Py_UNICODE ch = *s2++;
        *dst++ = ch;
        if (ch == 0) {
            break;
        }
    }
    return s1;
}
15628
/* Copy at most `n` characters from the null-terminated string `s2` into
   `s1`.  Copying stops right after the terminator has been copied.

   The rest of `s1` is not padded, and `s1` is not null-terminated when `s2`
   holds `n` or more characters.  Nothing is written when `n` is 0.
   Return `s1`. */
Py_UNICODE*
Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
{
    Py_UNICODE *dst = s1;
    /* Test the remaining budget BEFORE storing, so that no more than `n`
       elements of `s1` are ever written. */
    while (n != 0) {
        n--;
        if ((*dst++ = *s2++) == 0) {
            break;
        }
    }
    return s1;
}
15638
/* Append the null-terminated string `s2`, terminator included, to the end
   of the null-terminated string `s1`.  Return `s1`. */
Py_UNICODE*
Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
{
    /* Start writing over the terminator of s1. */
    Py_UNICODE *dst = s1 + wcslen(s1);
    do {
        *dst = *s2++;
    } while (*dst++ != 0);
    return s1;
}
15647
/* Compare two null-terminated strings character by character.
   Return -1, 0 or +1 when `s1` sorts before, equal to or after `s2`; a
   string that is a proper prefix of the other sorts first. */
int
Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
{
    /* Skip the common prefix. */
    for (; *s1 != 0 && *s1 == *s2; s1++, s2++) {
    }

    if (*s1 == *s2) {
        /* Only possible when both strings ended together. */
        return 0;
    }
    if (*s1 == 0) {
        return -1;
    }
    if (*s2 == 0) {
        return 1;
    }
    return (*s1 < *s2) ? -1 : +1;
}
15661
/* Compare at most the first `n` characters of two null-terminated strings.
   Return -1, 0 or +1 when `s1` sorts before, equal to or after `s2` within
   that range. */
int
Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
{
    size_t pos;
    for (pos = 0; pos < n; pos++) {
        Py_UNICODE left = s1[pos];
        Py_UNICODE right = s2[pos];
        if (left != right) {
            return (left < right) ? -1 : +1;
        }
        if (left == '\0') {
            /* Both strings ended at the same position. */
            return 0;
        }
    }
    return 0;
}
15678
/* Return a pointer to the first occurrence of `c` in the null-terminated
   string `s`, or NULL if it does not occur.  The terminator itself is never
   matched, so searching for 0 returns NULL. */
Py_UNICODE*
Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
{
    while (*s != 0) {
        if (*s == c) {
            return (Py_UNICODE*)s;
        }
        s++;
    }
    return NULL;
}
15688
/* Return a pointer to the last occurrence of `c` in the null-terminated
   string `s`, or NULL if it does not occur.  The terminator itself is never
   matched, so searching for 0 returns NULL. */
Py_UNICODE*
Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
{
    /* Walk backwards from the character just before the terminator. */
    size_t pos = wcslen(s);
    while (pos > 0) {
        pos--;
        if (s[pos] == c) {
            return (Py_UNICODE*)(s + pos);
        }
    }
    return NULL;
}
Victor Stinner331ea922010-08-10 16:37:20 +000015701
Victor Stinner71133ff2010-09-01 23:43:53 +000015702Py_UNICODE*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015703PyUnicode_AsUnicodeCopy(PyObject *unicode)
Victor Stinner71133ff2010-09-01 23:43:53 +000015704{
Victor Stinner577db2c2011-10-11 22:12:48 +020015705 Py_UNICODE *u, *copy;
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015706 Py_ssize_t len, size;
Victor Stinner71133ff2010-09-01 23:43:53 +000015707
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015708 if (!PyUnicode_Check(unicode)) {
15709 PyErr_BadArgument();
15710 return NULL;
15711 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015712 u = PyUnicode_AsUnicodeAndSize(unicode, &len);
Victor Stinner577db2c2011-10-11 22:12:48 +020015713 if (u == NULL)
15714 return NULL;
Victor Stinner71133ff2010-09-01 23:43:53 +000015715 /* Ensure we won't overflow the size. */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -070015716 if (len > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
Victor Stinner71133ff2010-09-01 23:43:53 +000015717 PyErr_NoMemory();
15718 return NULL;
15719 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015720 size = len + 1; /* copy the null character */
Victor Stinner71133ff2010-09-01 23:43:53 +000015721 size *= sizeof(Py_UNICODE);
15722 copy = PyMem_Malloc(size);
15723 if (copy == NULL) {
15724 PyErr_NoMemory();
15725 return NULL;
15726 }
Victor Stinner577db2c2011-10-11 22:12:48 +020015727 memcpy(copy, u, size);
Victor Stinner71133ff2010-09-01 23:43:53 +000015728 return copy;
15729}
Martin v. Löwis5b222132007-06-10 09:51:05 +000015730
Victor Stinnerfecc4f22019-03-19 14:20:29 +010015731
Victor Stinner709d23d2019-05-02 14:56:30 -040015732static int
15733encode_wstr_utf8(wchar_t *wstr, char **str, const char *name)
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015734{
Victor Stinner709d23d2019-05-02 14:56:30 -040015735 int res;
15736 res = _Py_EncodeUTF8Ex(wstr, str, NULL, NULL, 1, _Py_ERROR_STRICT);
15737 if (res == -2) {
15738 PyErr_Format(PyExc_RuntimeWarning, "cannot decode %s", name);
15739 return -1;
15740 }
15741 if (res < 0) {
15742 PyErr_NoMemory();
15743 return -1;
15744 }
15745 return 0;
15746}
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015747
Victor Stinner709d23d2019-05-02 14:56:30 -040015748
15749static int
15750config_get_codec_name(wchar_t **config_encoding)
15751{
15752 char *encoding;
15753 if (encode_wstr_utf8(*config_encoding, &encoding, "stdio_encoding") < 0) {
15754 return -1;
15755 }
15756
15757 PyObject *name_obj = NULL;
15758 PyObject *codec = _PyCodec_Lookup(encoding);
15759 PyMem_RawFree(encoding);
15760
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015761 if (!codec)
15762 goto error;
15763
15764 name_obj = PyObject_GetAttrString(codec, "name");
15765 Py_CLEAR(codec);
15766 if (!name_obj) {
15767 goto error;
15768 }
15769
Victor Stinner709d23d2019-05-02 14:56:30 -040015770 wchar_t *wname = PyUnicode_AsWideCharString(name_obj, NULL);
15771 Py_DECREF(name_obj);
15772 if (wname == NULL) {
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015773 goto error;
15774 }
15775
Victor Stinner709d23d2019-05-02 14:56:30 -040015776 wchar_t *raw_wname = _PyMem_RawWcsdup(wname);
15777 if (raw_wname == NULL) {
15778 PyMem_Free(wname);
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015779 PyErr_NoMemory();
Victor Stinner709d23d2019-05-02 14:56:30 -040015780 goto error;
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015781 }
Victor Stinner709d23d2019-05-02 14:56:30 -040015782
15783 PyMem_RawFree(*config_encoding);
15784 *config_encoding = raw_wname;
15785
15786 PyMem_Free(wname);
15787 return 0;
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015788
15789error:
15790 Py_XDECREF(codec);
15791 Py_XDECREF(name_obj);
Victor Stinner709d23d2019-05-02 14:56:30 -040015792 return -1;
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015793}
15794
15795
Victor Stinner331a6a52019-05-27 16:39:22 +020015796static PyStatus
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015797init_stdio_encoding(PyInterpreterState *interp)
15798{
Victor Stinner709d23d2019-05-02 14:56:30 -040015799 /* Update the stdio encoding to the normalized Python codec name. */
Victor Stinner331a6a52019-05-27 16:39:22 +020015800 PyConfig *config = &interp->config;
Victor Stinner709d23d2019-05-02 14:56:30 -040015801 if (config_get_codec_name(&config->stdio_encoding) < 0) {
Victor Stinner331a6a52019-05-27 16:39:22 +020015802 return _PyStatus_ERR("failed to get the Python codec name "
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015803 "of the stdio encoding");
15804 }
Victor Stinner331a6a52019-05-27 16:39:22 +020015805 return _PyStatus_OK();
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015806}
15807
15808
Victor Stinner709d23d2019-05-02 14:56:30 -040015809static int
15810init_fs_codec(PyInterpreterState *interp)
15811{
Victor Stinner331a6a52019-05-27 16:39:22 +020015812 PyConfig *config = &interp->config;
Victor Stinner709d23d2019-05-02 14:56:30 -040015813
15814 _Py_error_handler error_handler;
15815 error_handler = get_error_handler_wide(config->filesystem_errors);
15816 if (error_handler == _Py_ERROR_UNKNOWN) {
15817 PyErr_SetString(PyExc_RuntimeError, "unknow filesystem error handler");
15818 return -1;
15819 }
15820
15821 char *encoding, *errors;
15822 if (encode_wstr_utf8(config->filesystem_encoding,
15823 &encoding,
15824 "filesystem_encoding") < 0) {
15825 return -1;
15826 }
15827
15828 if (encode_wstr_utf8(config->filesystem_errors,
15829 &errors,
15830 "filesystem_errors") < 0) {
15831 PyMem_RawFree(encoding);
15832 return -1;
15833 }
15834
15835 PyMem_RawFree(interp->fs_codec.encoding);
15836 interp->fs_codec.encoding = encoding;
15837 PyMem_RawFree(interp->fs_codec.errors);
15838 interp->fs_codec.errors = errors;
15839 interp->fs_codec.error_handler = error_handler;
15840
15841 /* At this point, PyUnicode_EncodeFSDefault() and
15842 PyUnicode_DecodeFSDefault() can now use the Python codec rather than
15843 the C implementation of the filesystem encoding. */
15844
15845 /* Set Py_FileSystemDefaultEncoding and Py_FileSystemDefaultEncodeErrors
15846 global configuration variables. */
15847 if (_Py_SetFileSystemEncoding(interp->fs_codec.encoding,
15848 interp->fs_codec.errors) < 0) {
15849 PyErr_NoMemory();
15850 return -1;
15851 }
15852 return 0;
15853}
15854
15855
Victor Stinner331a6a52019-05-27 16:39:22 +020015856static PyStatus
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015857init_fs_encoding(PyInterpreterState *interp)
15858{
Victor Stinner709d23d2019-05-02 14:56:30 -040015859 /* Update the filesystem encoding to the normalized Python codec name.
15860 For example, replace "ANSI_X3.4-1968" (locale encoding) with "ascii"
15861 (Python codec name). */
Victor Stinner331a6a52019-05-27 16:39:22 +020015862 PyConfig *config = &interp->config;
Victor Stinner709d23d2019-05-02 14:56:30 -040015863 if (config_get_codec_name(&config->filesystem_encoding) < 0) {
Victor Stinner331a6a52019-05-27 16:39:22 +020015864 return _PyStatus_ERR("failed to get the Python codec "
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015865 "of the filesystem encoding");
15866 }
15867
Victor Stinner709d23d2019-05-02 14:56:30 -040015868 if (init_fs_codec(interp) < 0) {
Victor Stinner331a6a52019-05-27 16:39:22 +020015869 return _PyStatus_ERR("cannot initialize filesystem codec");
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015870 }
Victor Stinner331a6a52019-05-27 16:39:22 +020015871 return _PyStatus_OK();
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015872}
15873
15874
Victor Stinner331a6a52019-05-27 16:39:22 +020015875PyStatus
Victor Stinnerb45d2592019-06-20 00:05:23 +020015876_PyUnicode_InitEncodings(PyThreadState *tstate)
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015877{
Victor Stinnerb45d2592019-06-20 00:05:23 +020015878 PyInterpreterState *interp = tstate->interp;
15879
Victor Stinner331a6a52019-05-27 16:39:22 +020015880 PyStatus status = init_fs_encoding(interp);
15881 if (_PyStatus_EXCEPTION(status)) {
15882 return status;
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015883 }
15884
15885 return init_stdio_encoding(interp);
15886}
15887
15888
#ifdef MS_WINDOWS
/* Switch the filesystem encoding of the current interpreter to
   mbcs/replace (PEP 529) and re-initialize its filesystem codec.

   Return the result of init_fs_codec() on success.  If either string
   cannot be duplicated, set a MemoryError and return -1 without touching
   the interpreter configuration. */
int
_PyUnicode_EnableLegacyWindowsFSEncoding(void)
{
    PyInterpreterState *interp = _PyInterpreterState_GET_UNSAFE();

    /* Duplicate both strings up front so that the configuration is only
       modified when both allocations succeeded. */
    wchar_t *mbcs = _PyMem_RawWcsdup(L"mbcs");
    wchar_t *replace = _PyMem_RawWcsdup(L"replace");

    if (mbcs != NULL && replace != NULL) {
        PyConfig *config = &interp->config;

        /* The config takes ownership of the new strings. */
        PyMem_RawFree(config->filesystem_encoding);
        config->filesystem_encoding = mbcs;
        PyMem_RawFree(config->filesystem_errors);
        config->filesystem_errors = replace;

        return init_fs_codec(interp);
    }

    /* At least one allocation failed: release whichever one succeeded. */
    PyMem_RawFree(mbcs);
    PyMem_RawFree(replace);
    PyErr_NoMemory();
    return -1;
}
#endif
15914
15915
Victor Stinnerfecc4f22019-03-19 14:20:29 +010015916void
15917_PyUnicode_Fini(void)
15918{
15919#if defined(WITH_VALGRIND) || defined(__INSURE__)
15920 /* Insure++ is a memory analysis tool that aids in discovering
15921 * memory leaks and other memory problems. On Python exit, the
15922 * interned string dictionaries are flagged as being in use at exit
15923 * (which it is). Under normal circumstances, this is fine because
15924 * the memory will be automatically reclaimed by the system. Under
15925 * memory debugging, it's a huge source of useless noise, so we
15926 * trade off slower shutdown for less distraction in the memory
15927 * reports. -baw
15928 */
15929 unicode_release_interned();
15930#endif /* __INSURE__ */
15931
15932 Py_CLEAR(unicode_empty);
15933
15934 for (Py_ssize_t i = 0; i < 256; i++) {
15935 Py_CLEAR(unicode_latin1[i]);
15936 }
15937 _PyUnicode_ClearStaticStrings();
15938 (void)PyUnicode_ClearFreeList();
Victor Stinner709d23d2019-05-02 14:56:30 -040015939
15940 PyInterpreterState *interp = _PyInterpreterState_GET_UNSAFE();
15941 PyMem_RawFree(interp->fs_codec.encoding);
15942 interp->fs_codec.encoding = NULL;
15943 PyMem_RawFree(interp->fs_codec.errors);
15944 interp->fs_codec.errors = NULL;
Victor Stinnerfecc4f22019-03-19 14:20:29 +010015945}
15946
15947
/* A _string module, to export formatter_parser and formatter_field_name_split
   to the string.Formatter class implemented in Python. */
15950
15951static PyMethodDef _string_methods[] = {
15952 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
15953 METH_O, PyDoc_STR("split the argument as a field name")},
15954 {"formatter_parser", (PyCFunction) formatter_parser,
15955 METH_O, PyDoc_STR("parse the argument as a format string")},
15956 {NULL, NULL}
15957};
15958
15959static struct PyModuleDef _string_module = {
15960 PyModuleDef_HEAD_INIT,
15961 "_string",
15962 PyDoc_STR("string helper module"),
15963 0,
15964 _string_methods,
15965 NULL,
15966 NULL,
15967 NULL,
15968 NULL
15969};
15970
15971PyMODINIT_FUNC
15972PyInit__string(void)
15973{
15974 return PyModule_Create(&_string_module);
15975}
15976
15977
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000015978#ifdef __cplusplus
15979}
15980#endif