blob: 7c8bc06252a1ec1c57ada4ee45d223cda6493be7 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Benjamin Peterson31616ea2011-10-01 00:11:09 -04004modified by Marc-Andre Lemburg <mal@lemburg.com>.
Guido van Rossumd57fd912000-03-10 22:53:23 +00005
Thomas Wouters477c8d52006-05-27 19:21:47 +00006Major speed upgrades to the method implementations at the Reykjavik
7NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8
Guido van Rossum16b1ad92000-08-03 16:24:25 +00009Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000010
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000011--------------------------------------------------------------------
12The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000013
Benjamin Peterson29060642009-01-31 22:14:21 +000014 Copyright (c) 1999 by Secret Labs AB
15 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000016
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000017By obtaining, using, and/or copying this software and/or its
18associated documentation, you agree that you have read, understood,
19and will comply with the following terms and conditions:
20
21Permission to use, copy, modify, and distribute this software and its
22associated documentation for any purpose and without fee is hereby
23granted, provided that the above copyright notice appears in all
24copies, and that both that copyright notice and this permission notice
25appear in supporting documentation, and that the name of Secret Labs
26AB or the author not be used in advertising or publicity pertaining to
27distribution of the software without specific, written prior
28permission.
29
30SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37--------------------------------------------------------------------
38
39*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000040
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Victor Stinner9fc57a32018-11-07 00:44:03 +010043#include "pycore_fileutils.h"
Victor Stinner61691d82019-10-02 23:51:20 +020044#include "pycore_initconfig.h"
Victor Stinnerbcda8f12018-11-21 22:27:47 +010045#include "pycore_object.h"
Victor Stinner61691d82019-10-02 23:51:20 +020046#include "pycore_pathconfig.h"
Victor Stinner43fc3bb2019-05-02 11:54:20 -040047#include "pycore_pylifecycle.h"
Victor Stinner621cebe2018-11-12 16:53:38 +010048#include "pycore_pystate.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000049#include "ucnhash.h"
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050050#include "bytes_methods.h"
Raymond Hettingerac2ef652015-07-04 16:04:44 -070051#include "stringlib/eq.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000052
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000053#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000054#include <windows.h>
55#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000056
/* Uncomment to display statistics on interned strings at exit when
   using Valgrind or Insure++. */
/* #define INTERNED_STATS 1 */
60
61
Larry Hastings61272b72014-01-07 12:41:53 -080062/*[clinic input]
INADA Naoki15f94592017-01-16 21:49:13 +090063class str "PyObject *" "&PyUnicode_Type"
Larry Hastings61272b72014-01-07 12:41:53 -080064[clinic start generated code]*/
INADA Naoki3ae20562017-01-16 20:41:20 +090065/*[clinic end generated code: output=da39a3ee5e6b4b0d input=4884c934de622cf6]*/
66
67/*[python input]
68class Py_UCS4_converter(CConverter):
69 type = 'Py_UCS4'
70 converter = 'convert_uc'
71
72 def converter_init(self):
73 if self.default is not unspecified:
74 self.c_default = ascii(self.default)
75 if len(self.c_default) > 4 or self.c_default[0] != "'":
76 self.c_default = hex(ord(self.default))
77
78[python start generated code]*/
79/*[python end generated code: output=da39a3ee5e6b4b0d input=88f5dd06cd8e7a61]*/
Larry Hastings44e2eaa2013-11-23 15:37:55 -080080
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000081/* --- Globals ------------------------------------------------------------
82
Serhiy Storchaka05997252013-01-26 12:14:02 +020083NOTE: In the interpreter's initialization phase, some globals are currently
84 initialized dynamically as needed. In the process Unicode objects may
85 be created before the Unicode type is ready.
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000086
87*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000088
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000089
90#ifdef __cplusplus
91extern "C" {
92#endif
93
/* Largest valid Unicode code point: 0x10ffff (1,114,111). */
#define MAX_UNICODE 0x10ffff

/* Object validation used by the assertions in this file: debug builds run
   the structural consistency checker with content scanning disabled
   (second argument 0); other builds only test the object's type. */
#ifndef Py_DEBUG
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#else
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
#endif
Victor Stinnerfb5f5f22011-09-28 21:39:49 +0200102
/* --- Field accessors ----------------------------------------------------

   The macros with a leading underscore are raw field accesses: they cast
   `op` to the struct that declares the field and perform no checks (except
   _PyUnicode_KIND and _PyUnicode_GET_LENGTH, which assert the object is
   valid).  The PyUnicode_UTF8* variants additionally assert that the object
   is ready and special-case compact ASCII objects. */

/* Raw UTF-8 cache pointer. */
#define _PyUnicode_UTF8(op)                             \
    (((PyCompactUnicodeObject*)(op))->utf8)

/* UTF-8 representation: for a compact ASCII object this is the memory
   immediately following the PyASCIIObject header, otherwise the utf8
   field. */
#define PyUnicode_UTF8(op)                              \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((char*)((PyASCIIObject*)(op) + 1)) :          \
         _PyUnicode_UTF8(op))

/* Raw UTF-8 cache length. */
#define _PyUnicode_UTF8_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->utf8_length)

/* UTF-8 length: for a compact ASCII object this is the `length` field,
   otherwise the utf8_length field. */
#define PyUnicode_UTF8_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((PyASCIIObject*)(op))->length :               \
         _PyUnicode_UTF8_LENGTH(op))

/* Raw wchar_t representation pointer and its length. */
#define _PyUnicode_WSTR(op)                             \
    (((PyASCIIObject*)(op))->wstr)
#define _PyUnicode_WSTR_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->wstr_length)

/* Raw length, state bit-field and cached hash. */
#define _PyUnicode_LENGTH(op)                           \
    (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op)                            \
    (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op)                             \
    (((PyASCIIObject *)(op))->hash)

/* Kind and length, asserting that `op` is a valid unicode object. */
#define _PyUnicode_KIND(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->state.kind)
#define _PyUnicode_GET_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->length)

/* Raw data pointer stored in a (non-compact) PyUnicodeObject. */
#define _PyUnicode_DATA_ANY(op)                         \
    (((PyUnicodeObject*)(op))->data.any)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200137
/* File-local replacement for PyUnicode_READY: evaluates to 0 when `op` is
   already ready, otherwise to the result of _PyUnicode_Ready(op).  The
   object is validated with _PyUnicode_CHECK first. */
#undef PyUnicode_READY
#define PyUnicode_READY(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     (PyUnicode_IS_READY(op) ?                          \
          0 :                                           \
          _PyUnicode_Ready(op)))

/* True if the utf8 pointer is the same pointer as the character data.
   Must not be used on compact ASCII objects (asserted). */
#define _PyUnicode_SHARE_UTF8(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)),           \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))
/* True if the wstr pointer is the same pointer as the character data.
   Every use of the argument goes through the macro parameter `op`, so the
   macro does not depend on a variable named `unicode` being in scope at the
   expansion site. */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
152
/* True if the Unicode object owns a separately allocated UTF-8 block:
   it is not compact ASCII, its utf8 pointer is set, and that pointer is
   not the character data itself. */
#define _PyUnicode_HAS_UTF8_MEMORY(op)                  \
    (!PyUnicode_IS_COMPACT_ASCII(op)                    \
     && _PyUnicode_UTF8(op) != NULL                     \
     && _PyUnicode_UTF8(op) != PyUnicode_DATA(op))

/* True if the Unicode object owns a separately allocated wstr block:
   its wstr pointer is set and either the object is not ready yet or the
   pointer is not the character data itself. */
#define _PyUnicode_HAS_WSTR_MEMORY(op)                  \
    (_PyUnicode_WSTR(op) != NULL                        \
     && (!PyUnicode_IS_READY(op)                        \
         || _PyUnicode_WSTR(op) != PyUnicode_DATA(op)))
166
/* Generic helper macro to convert characters of different types.

   from_type and to_type have to be valid type names.  begin and end are
   pointers to the source characters (treated as "const from_type *"; the
   source is only read).  to is a pointer of type "to_type *" and points to
   the buffer where the result characters are written; it must have room
   for (end - begin) elements.

   The copy is done four elements per iteration for the largest multiple of
   four that fits, followed by an element-wise loop for the remainder.  All
   locals carry a leading underscore so they cannot shadow identifiers at
   the expansion site. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                \
        to_type *_to = (to_type *)(to);                 \
        const from_type *_iter = (const from_type *)(begin); \
        const from_type *_end = (const from_type *)(end);    \
        Py_ssize_t _n = (_end) - (_iter);               \
        const from_type *_unrolled_end =                \
            _iter + _Py_SIZE_ROUND_DOWN(_n, 4);         \
        while (_iter < (_unrolled_end)) {               \
            _to[0] = (to_type) _iter[0];                \
            _to[1] = (to_type) _iter[1];                \
            _to[2] = (to_type) _iter[2];                \
            _to[3] = (to_type) _iter[3];                \
            _iter += 4; _to += 4;                       \
        }                                               \
        while (_iter < (_end))                          \
            *_to++ = (to_type) *_iter++;                \
    } while (0)
Victor Stinner829c0ad2011-10-03 01:08:02 +0200190
/* Platform-specific over-allocation factor.
   NOTE(review): the percentages below only make sense if the factor is used
   as a divisor of the requested size -- the use site is outside this
   excerpt; confirm. */
#ifndef MS_WINDOWS
   /* On Linux, overallocate by 25% is the best factor */
#  define OVERALLOCATE_FACTOR 4
#else
   /* On Windows, overallocate by 50% is the best factor */
#  define OVERALLOCATE_FACTOR 2
#endif
198
Walter Dörwald16807132007-05-25 13:52:07 +0000199/* This dictionary holds all interned unicode strings. Note that references
200 to strings in this dictionary are *not* counted in the string's ob_refcnt.
201 When the interned string reaches a refcnt of 0 the string deallocation
202 function will delete the reference from this dictionary.
203
204 Another way to look at this is that to say that the actual reference
Guido van Rossum98297ee2007-11-06 21:34:58 +0000205 count of a string is: s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000206*/
Serhiy Storchaka05997252013-01-26 12:14:02 +0200207static PyObject *interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +0000208
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000209/* The empty Unicode object is shared to improve performance. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200210static PyObject *unicode_empty = NULL;
Serhiy Storchaka05997252013-01-26 12:14:02 +0200211
/* Take a new reference to the shared empty string, creating it on first
   use with PyUnicode_New(0, 0).  If that creation fails, unicode_empty
   stays NULL and no reference is taken; callers must check for NULL. */
#define _Py_INCREF_UNICODE_EMPTY()                      \
    do {                                                \
        if (unicode_empty == NULL) {                    \
            unicode_empty = PyUnicode_New(0, 0);        \
            if (unicode_empty != NULL) {                \
                Py_INCREF(unicode_empty);               \
                assert(_PyUnicode_CheckConsistency(unicode_empty, 1)); \
            }                                           \
        }                                               \
        else {                                          \
            Py_INCREF(unicode_empty);                   \
        }                                               \
    } while (0)

/* Return a new reference to the shared empty string from the enclosing
   function (or NULL if the empty string could not be created). */
#define _Py_RETURN_UNICODE_EMPTY()                      \
    do {                                                \
        _Py_INCREF_UNICODE_EMPTY();                     \
        return unicode_empty;                           \
    } while (0)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000230
Victor Stinner59423e32018-11-26 13:40:01 +0100231static inline void
232unicode_fill(enum PyUnicode_Kind kind, void *data, Py_UCS4 value,
233 Py_ssize_t start, Py_ssize_t length)
234{
235 assert(0 <= start);
236 assert(kind != PyUnicode_WCHAR_KIND);
237 switch (kind) {
238 case PyUnicode_1BYTE_KIND: {
Victor Stinner163403a2018-11-27 12:41:17 +0100239 assert(value <= 0xff);
Victor Stinner59423e32018-11-26 13:40:01 +0100240 Py_UCS1 ch = (unsigned char)value;
241 Py_UCS1 *to = (Py_UCS1 *)data + start;
242 memset(to, ch, length);
243 break;
244 }
245 case PyUnicode_2BYTE_KIND: {
Victor Stinner163403a2018-11-27 12:41:17 +0100246 assert(value <= 0xffff);
Victor Stinner59423e32018-11-26 13:40:01 +0100247 Py_UCS2 ch = (Py_UCS2)value;
248 Py_UCS2 *to = (Py_UCS2 *)data + start;
249 const Py_UCS2 *end = to + length;
250 for (; to < end; ++to) *to = ch;
251 break;
252 }
253 case PyUnicode_4BYTE_KIND: {
Victor Stinner163403a2018-11-27 12:41:17 +0100254 assert(value <= MAX_UNICODE);
Victor Stinner59423e32018-11-26 13:40:01 +0100255 Py_UCS4 ch = value;
256 Py_UCS4 * to = (Py_UCS4 *)data + start;
257 const Py_UCS4 *end = to + length;
258 for (; to < end; ++to) *to = ch;
259 break;
260 }
261 default: Py_UNREACHABLE();
262 }
263}
264
265
Victor Stinner8a1a6cf2013-04-14 02:35:33 +0200266/* Forward declaration */
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700267static inline int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +0200268_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch);
Inada Naoki770847a2019-06-24 12:30:24 +0900269static inline void
270_PyUnicodeWriter_InitWithBuffer(_PyUnicodeWriter *writer, PyObject *buffer);
Victor Stinner709d23d2019-05-02 14:56:30 -0400271static PyObject *
272unicode_encode_utf8(PyObject *unicode, _Py_error_handler error_handler,
273 const char *errors);
274static PyObject *
275unicode_decode_utf8(const char *s, Py_ssize_t size,
276 _Py_error_handler error_handler, const char *errors,
277 Py_ssize_t *consumed);
Victor Stinner8a1a6cf2013-04-14 02:35:33 +0200278
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200279/* List of static strings. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200280static _Py_Identifier *static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200281
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000282/* Single character Unicode strings in the Latin-1 range are being
283 shared as well. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200284static PyObject *unicode_latin1[256] = {NULL};
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000285
/* Fast detection of the most frequent whitespace characters.
   Indexed by a 7-bit code point: the entry is 1 for whitespace and 0 for
   everything else (entries not listed are zero-initialized). */
const unsigned char _Py_ascii_whitespace[128] = {
    [0x09] = 1,     /* CHARACTER TABULATION */
    [0x0A] = 1,     /* LINE FEED */
    [0x0B] = 1,     /* LINE TABULATION */
    [0x0C] = 1,     /* FORM FEED */
    [0x0D] = 1,     /* CARRIAGE RETURN */
    [0x1C] = 1,     /* FILE SEPARATOR */
    [0x1D] = 1,     /* GROUP SEPARATOR */
    [0x1E] = 1,     /* RECORD SEPARATOR */
    [0x1F] = 1,     /* UNIT SEPARATOR */
    [0x20] = 1      /* SPACE */
};
316
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200317/* forward */
Victor Stinnerfe226c02011-10-03 03:52:20 +0200318static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200319static PyObject* get_latin1_char(unsigned char ch);
Victor Stinner488fa492011-12-12 00:01:39 +0100320static int unicode_modifiable(PyObject *unicode);
321
Victor Stinnerfe226c02011-10-03 03:52:20 +0200322
Alexander Belopolsky40018472011-02-26 01:02:56 +0000323static PyObject *
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100324_PyUnicode_FromUCS1(const Py_UCS1 *s, Py_ssize_t size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200325static PyObject *
326_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
327static PyObject *
328_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
329
330static PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +0000331unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000332 PyObject **errorHandler,const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +0100333 PyObject *unicode, PyObject **exceptionObject,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000334 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
335
Alexander Belopolsky40018472011-02-26 01:02:56 +0000336static void
337raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300338 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +0100339 PyObject *unicode,
340 Py_ssize_t startpos, Py_ssize_t endpos,
341 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000342
/* Fast detection of line break characters.
   Indexed by a 7-bit code point: the entry is 1 for a line break and 0 for
   everything else (entries not listed are zero-initialized). */
static const unsigned char ascii_linebreak[128] = {
    [0x0A] = 1,     /* LINE FEED */
    [0x0B] = 1,     /* LINE TABULATION */
    [0x0C] = 1,     /* FORM FEED */
    [0x0D] = 1,     /* CARRIAGE RETURN */
    [0x1C] = 1,     /* FILE SEPARATOR */
    [0x1D] = 1,     /* GROUP SEPARATOR */
    [0x1E] = 1      /* RECORD SEPARATOR */
};
370
INADA Naoki3ae20562017-01-16 20:41:20 +0900371static int convert_uc(PyObject *obj, void *addr);
372
Serhiy Storchaka1009bf12015-04-03 23:53:51 +0300373#include "clinic/unicodeobject.c.h"
374
Victor Stinner3d4226a2018-08-29 22:21:32 +0200375_Py_error_handler
376_Py_GetErrorHandler(const char *errors)
Victor Stinner50149202015-09-22 00:26:54 +0200377{
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200378 if (errors == NULL || strcmp(errors, "strict") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200379 return _Py_ERROR_STRICT;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200380 }
381 if (strcmp(errors, "surrogateescape") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200382 return _Py_ERROR_SURROGATEESCAPE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200383 }
384 if (strcmp(errors, "replace") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200385 return _Py_ERROR_REPLACE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200386 }
387 if (strcmp(errors, "ignore") == 0) {
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200388 return _Py_ERROR_IGNORE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200389 }
390 if (strcmp(errors, "backslashreplace") == 0) {
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200391 return _Py_ERROR_BACKSLASHREPLACE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200392 }
393 if (strcmp(errors, "surrogatepass") == 0) {
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200394 return _Py_ERROR_SURROGATEPASS;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200395 }
396 if (strcmp(errors, "xmlcharrefreplace") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200397 return _Py_ERROR_XMLCHARREFREPLACE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200398 }
Victor Stinner50149202015-09-22 00:26:54 +0200399 return _Py_ERROR_OTHER;
400}
401
Victor Stinner709d23d2019-05-02 14:56:30 -0400402
403static _Py_error_handler
404get_error_handler_wide(const wchar_t *errors)
405{
406 if (errors == NULL || wcscmp(errors, L"strict") == 0) {
407 return _Py_ERROR_STRICT;
408 }
409 if (wcscmp(errors, L"surrogateescape") == 0) {
410 return _Py_ERROR_SURROGATEESCAPE;
411 }
412 if (wcscmp(errors, L"replace") == 0) {
413 return _Py_ERROR_REPLACE;
414 }
415 if (wcscmp(errors, L"ignore") == 0) {
416 return _Py_ERROR_IGNORE;
417 }
418 if (wcscmp(errors, L"backslashreplace") == 0) {
419 return _Py_ERROR_BACKSLASHREPLACE;
420 }
421 if (wcscmp(errors, L"surrogatepass") == 0) {
422 return _Py_ERROR_SURROGATEPASS;
423 }
424 if (wcscmp(errors, L"xmlcharrefreplace") == 0) {
425 return _Py_ERROR_XMLCHARREFREPLACE;
426 }
427 return _Py_ERROR_OTHER;
428}
429
430
Victor Stinner22eb6892019-06-26 00:51:05 +0200431static inline int
432unicode_check_encoding_errors(const char *encoding, const char *errors)
433{
434 if (encoding == NULL && errors == NULL) {
435 return 0;
436 }
437
438 PyInterpreterState *interp = _PyInterpreterState_GET_UNSAFE();
439#ifndef Py_DEBUG
440 /* In release mode, only check in development mode (-X dev) */
441 if (!interp->config.dev_mode) {
442 return 0;
443 }
444#else
445 /* Always check in debug mode */
446#endif
447
448 /* Avoid calling _PyCodec_Lookup() and PyCodec_LookupError() before the
449 codec registry is ready: before_PyUnicode_InitEncodings() is called. */
450 if (!interp->fs_codec.encoding) {
451 return 0;
452 }
453
454 if (encoding != NULL) {
455 PyObject *handler = _PyCodec_Lookup(encoding);
456 if (handler == NULL) {
457 return -1;
458 }
459 Py_DECREF(handler);
460 }
461
462 if (errors != NULL) {
463 PyObject *handler = PyCodec_LookupError(errors);
464 if (handler == NULL) {
465 return -1;
466 }
467 Py_DECREF(handler);
468 }
469 return 0;
470}
471
472
Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300473/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
474 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000475Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000476PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000477{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000478#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000479 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000480#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000481 /* This is actually an illegal character, so it should
482 not be passed to unichr. */
483 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000484#endif
485}
486
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +0200487int
Victor Stinner7931d9a2011-11-04 00:22:48 +0100488_PyUnicode_CheckConsistency(PyObject *op, int check_content)
Victor Stinner910337b2011-10-03 03:20:16 +0200489{
Victor Stinner68762572019-10-07 18:42:01 +0200490#define CHECK(expr) \
491 do { if (!(expr)) { _PyObject_ASSERT_FAILED_MSG(op, Py_STRINGIFY(expr)); } } while (0)
492
Victor Stinner910337b2011-10-03 03:20:16 +0200493 PyASCIIObject *ascii;
494 unsigned int kind;
495
Victor Stinner68762572019-10-07 18:42:01 +0200496 assert(op != NULL);
497 CHECK(PyUnicode_Check(op));
Victor Stinner910337b2011-10-03 03:20:16 +0200498
499 ascii = (PyASCIIObject *)op;
500 kind = ascii->state.kind;
501
Victor Stinnera3b334d2011-10-03 13:53:37 +0200502 if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
Victor Stinner68762572019-10-07 18:42:01 +0200503 CHECK(kind == PyUnicode_1BYTE_KIND);
504 CHECK(ascii->state.ready == 1);
Victor Stinner910337b2011-10-03 03:20:16 +0200505 }
Victor Stinnera41463c2011-10-04 01:05:08 +0200506 else {
Victor Stinner85041a52011-10-03 14:42:39 +0200507 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
Victor Stinner7f11ad42011-10-04 00:00:20 +0200508 void *data;
Victor Stinner910337b2011-10-03 03:20:16 +0200509
Victor Stinnera41463c2011-10-04 01:05:08 +0200510 if (ascii->state.compact == 1) {
511 data = compact + 1;
Victor Stinner68762572019-10-07 18:42:01 +0200512 CHECK(kind == PyUnicode_1BYTE_KIND
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200513 || kind == PyUnicode_2BYTE_KIND
514 || kind == PyUnicode_4BYTE_KIND);
Victor Stinner68762572019-10-07 18:42:01 +0200515 CHECK(ascii->state.ascii == 0);
516 CHECK(ascii->state.ready == 1);
517 CHECK(compact->utf8 != data);
Victor Stinnere30c0a12011-11-04 20:54:05 +0100518 }
519 else {
Victor Stinnera41463c2011-10-04 01:05:08 +0200520 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
521
522 data = unicode->data.any;
523 if (kind == PyUnicode_WCHAR_KIND) {
Victor Stinner68762572019-10-07 18:42:01 +0200524 CHECK(ascii->length == 0);
525 CHECK(ascii->hash == -1);
526 CHECK(ascii->state.compact == 0);
527 CHECK(ascii->state.ascii == 0);
528 CHECK(ascii->state.ready == 0);
529 CHECK(ascii->state.interned == SSTATE_NOT_INTERNED);
530 CHECK(ascii->wstr != NULL);
531 CHECK(data == NULL);
532 CHECK(compact->utf8 == NULL);
Victor Stinnera41463c2011-10-04 01:05:08 +0200533 }
534 else {
Victor Stinner68762572019-10-07 18:42:01 +0200535 CHECK(kind == PyUnicode_1BYTE_KIND
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200536 || kind == PyUnicode_2BYTE_KIND
537 || kind == PyUnicode_4BYTE_KIND);
Victor Stinner68762572019-10-07 18:42:01 +0200538 CHECK(ascii->state.compact == 0);
539 CHECK(ascii->state.ready == 1);
540 CHECK(data != NULL);
Victor Stinnera41463c2011-10-04 01:05:08 +0200541 if (ascii->state.ascii) {
Victor Stinner68762572019-10-07 18:42:01 +0200542 CHECK(compact->utf8 == data);
543 CHECK(compact->utf8_length == ascii->length);
Victor Stinnera41463c2011-10-04 01:05:08 +0200544 }
545 else
Victor Stinner68762572019-10-07 18:42:01 +0200546 CHECK(compact->utf8 != data);
Victor Stinnera41463c2011-10-04 01:05:08 +0200547 }
548 }
549 if (kind != PyUnicode_WCHAR_KIND) {
Victor Stinner7f11ad42011-10-04 00:00:20 +0200550 if (
551#if SIZEOF_WCHAR_T == 2
552 kind == PyUnicode_2BYTE_KIND
553#else
554 kind == PyUnicode_4BYTE_KIND
555#endif
556 )
Victor Stinnera41463c2011-10-04 01:05:08 +0200557 {
Victor Stinner68762572019-10-07 18:42:01 +0200558 CHECK(ascii->wstr == data);
559 CHECK(compact->wstr_length == ascii->length);
Victor Stinnera41463c2011-10-04 01:05:08 +0200560 } else
Victor Stinner68762572019-10-07 18:42:01 +0200561 CHECK(ascii->wstr != data);
Victor Stinner910337b2011-10-03 03:20:16 +0200562 }
Victor Stinnera41463c2011-10-04 01:05:08 +0200563
564 if (compact->utf8 == NULL)
Victor Stinner68762572019-10-07 18:42:01 +0200565 CHECK(compact->utf8_length == 0);
Victor Stinnera41463c2011-10-04 01:05:08 +0200566 if (ascii->wstr == NULL)
Victor Stinner68762572019-10-07 18:42:01 +0200567 CHECK(compact->wstr_length == 0);
Victor Stinner910337b2011-10-03 03:20:16 +0200568 }
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200569
570 /* check that the best kind is used: O(n) operation */
571 if (check_content && kind != PyUnicode_WCHAR_KIND) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200572 Py_ssize_t i;
573 Py_UCS4 maxchar = 0;
Victor Stinner718fbf02012-04-26 00:39:37 +0200574 void *data;
575 Py_UCS4 ch;
576
577 data = PyUnicode_DATA(ascii);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200578 for (i=0; i < ascii->length; i++)
579 {
Victor Stinner718fbf02012-04-26 00:39:37 +0200580 ch = PyUnicode_READ(kind, data, i);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200581 if (ch > maxchar)
582 maxchar = ch;
583 }
584 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinner77faf692011-11-20 18:56:05 +0100585 if (ascii->state.ascii == 0) {
Victor Stinner68762572019-10-07 18:42:01 +0200586 CHECK(maxchar >= 128);
587 CHECK(maxchar <= 255);
Victor Stinner77faf692011-11-20 18:56:05 +0100588 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200589 else
Victor Stinner68762572019-10-07 18:42:01 +0200590 CHECK(maxchar < 128);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200591 }
Victor Stinner77faf692011-11-20 18:56:05 +0100592 else if (kind == PyUnicode_2BYTE_KIND) {
Victor Stinner68762572019-10-07 18:42:01 +0200593 CHECK(maxchar >= 0x100);
594 CHECK(maxchar <= 0xFFFF);
Victor Stinner77faf692011-11-20 18:56:05 +0100595 }
596 else {
Victor Stinner68762572019-10-07 18:42:01 +0200597 CHECK(maxchar >= 0x10000);
598 CHECK(maxchar <= MAX_UNICODE);
Victor Stinner77faf692011-11-20 18:56:05 +0100599 }
Victor Stinner68762572019-10-07 18:42:01 +0200600 CHECK(PyUnicode_READ(kind, data, ascii->length) == 0);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200601 }
Benjamin Petersonccc51c12011-10-03 19:34:12 -0400602 return 1;
Victor Stinner68762572019-10-07 18:42:01 +0200603
604#undef CHECK
Benjamin Petersonccc51c12011-10-03 19:34:12 -0400605}
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200606
Victor Stinner910337b2011-10-03 03:20:16 +0200607
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100608static PyObject*
609unicode_result_wchar(PyObject *unicode)
610{
611#ifndef Py_DEBUG
612 Py_ssize_t len;
613
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100614 len = _PyUnicode_WSTR_LENGTH(unicode);
615 if (len == 0) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100616 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200617 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100618 }
619
620 if (len == 1) {
621 wchar_t ch = _PyUnicode_WSTR(unicode)[0];
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100622 if ((Py_UCS4)ch < 256) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100623 PyObject *latin1_char = get_latin1_char((unsigned char)ch);
624 Py_DECREF(unicode);
625 return latin1_char;
626 }
627 }
628
629 if (_PyUnicode_Ready(unicode) < 0) {
Victor Stinneraa771272012-10-04 02:32:58 +0200630 Py_DECREF(unicode);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100631 return NULL;
632 }
633#else
Victor Stinneraa771272012-10-04 02:32:58 +0200634 assert(Py_REFCNT(unicode) == 1);
635
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100636 /* don't make the result ready in debug mode to ensure that the caller
637 makes the string ready before using it */
638 assert(_PyUnicode_CheckConsistency(unicode, 1));
639#endif
640 return unicode;
641}
642
643static PyObject*
644unicode_result_ready(PyObject *unicode)
645{
646 Py_ssize_t length;
647
648 length = PyUnicode_GET_LENGTH(unicode);
649 if (length == 0) {
650 if (unicode != unicode_empty) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100651 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200652 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100653 }
654 return unicode_empty;
655 }
656
657 if (length == 1) {
Victor Stinner69ed0f42013-04-09 21:48:24 +0200658 void *data = PyUnicode_DATA(unicode);
659 int kind = PyUnicode_KIND(unicode);
660 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100661 if (ch < 256) {
662 PyObject *latin1_char = unicode_latin1[ch];
663 if (latin1_char != NULL) {
664 if (unicode != latin1_char) {
665 Py_INCREF(latin1_char);
666 Py_DECREF(unicode);
667 }
668 return latin1_char;
669 }
670 else {
671 assert(_PyUnicode_CheckConsistency(unicode, 1));
672 Py_INCREF(unicode);
673 unicode_latin1[ch] = unicode;
674 return unicode;
675 }
676 }
677 }
678
679 assert(_PyUnicode_CheckConsistency(unicode, 1));
680 return unicode;
681}
682
683static PyObject*
684unicode_result(PyObject *unicode)
685{
686 assert(_PyUnicode_CHECK(unicode));
687 if (PyUnicode_IS_READY(unicode))
688 return unicode_result_ready(unicode);
689 else
690 return unicode_result_wchar(unicode);
691}
692
Victor Stinnerc4b49542011-12-11 22:44:26 +0100693static PyObject*
694unicode_result_unchanged(PyObject *unicode)
695{
696 if (PyUnicode_CheckExact(unicode)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -0500697 if (PyUnicode_READY(unicode) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +0100698 return NULL;
699 Py_INCREF(unicode);
700 return unicode;
701 }
702 else
703 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +0100704 return _PyUnicode_Copy(unicode);
Victor Stinnerc4b49542011-12-11 22:44:26 +0100705}
706
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200707/* Implementation of the "backslashreplace" error handler for 8-bit encodings:
708 ASCII, Latin1, UTF-8, etc. */
709static char*
Victor Stinnerad771582015-10-09 12:38:53 +0200710backslashreplace(_PyBytesWriter *writer, char *str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200711 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
712{
Victor Stinnerad771582015-10-09 12:38:53 +0200713 Py_ssize_t size, i;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200714 Py_UCS4 ch;
715 enum PyUnicode_Kind kind;
716 void *data;
717
718 assert(PyUnicode_IS_READY(unicode));
719 kind = PyUnicode_KIND(unicode);
720 data = PyUnicode_DATA(unicode);
721
722 size = 0;
723 /* determine replacement size */
724 for (i = collstart; i < collend; ++i) {
725 Py_ssize_t incr;
726
727 ch = PyUnicode_READ(kind, data, i);
728 if (ch < 0x100)
729 incr = 2+2;
730 else if (ch < 0x10000)
731 incr = 2+4;
732 else {
733 assert(ch <= MAX_UNICODE);
Victor Stinner3fa36ff2015-10-09 03:37:11 +0200734 incr = 2+8;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200735 }
736 if (size > PY_SSIZE_T_MAX - incr) {
737 PyErr_SetString(PyExc_OverflowError,
738 "encoded result is too long for a Python string");
739 return NULL;
740 }
741 size += incr;
742 }
743
Victor Stinnerad771582015-10-09 12:38:53 +0200744 str = _PyBytesWriter_Prepare(writer, str, size);
745 if (str == NULL)
746 return NULL;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200747
748 /* generate replacement */
749 for (i = collstart; i < collend; ++i) {
750 ch = PyUnicode_READ(kind, data, i);
Victor Stinner797485e2015-10-09 03:17:30 +0200751 *str++ = '\\';
752 if (ch >= 0x00010000) {
753 *str++ = 'U';
754 *str++ = Py_hexdigits[(ch>>28)&0xf];
755 *str++ = Py_hexdigits[(ch>>24)&0xf];
756 *str++ = Py_hexdigits[(ch>>20)&0xf];
757 *str++ = Py_hexdigits[(ch>>16)&0xf];
758 *str++ = Py_hexdigits[(ch>>12)&0xf];
759 *str++ = Py_hexdigits[(ch>>8)&0xf];
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200760 }
Victor Stinner797485e2015-10-09 03:17:30 +0200761 else if (ch >= 0x100) {
762 *str++ = 'u';
763 *str++ = Py_hexdigits[(ch>>12)&0xf];
764 *str++ = Py_hexdigits[(ch>>8)&0xf];
765 }
766 else
767 *str++ = 'x';
768 *str++ = Py_hexdigits[(ch>>4)&0xf];
769 *str++ = Py_hexdigits[ch&0xf];
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200770 }
771 return str;
772}
773
774/* Implementation of the "xmlcharrefreplace" error handler for 8-bit encodings:
775 ASCII, Latin1, UTF-8, etc. */
776static char*
Victor Stinnerad771582015-10-09 12:38:53 +0200777xmlcharrefreplace(_PyBytesWriter *writer, char *str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200778 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
779{
Victor Stinnerad771582015-10-09 12:38:53 +0200780 Py_ssize_t size, i;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200781 Py_UCS4 ch;
782 enum PyUnicode_Kind kind;
783 void *data;
784
785 assert(PyUnicode_IS_READY(unicode));
786 kind = PyUnicode_KIND(unicode);
787 data = PyUnicode_DATA(unicode);
788
789 size = 0;
790 /* determine replacement size */
791 for (i = collstart; i < collend; ++i) {
792 Py_ssize_t incr;
793
794 ch = PyUnicode_READ(kind, data, i);
795 if (ch < 10)
796 incr = 2+1+1;
797 else if (ch < 100)
798 incr = 2+2+1;
799 else if (ch < 1000)
800 incr = 2+3+1;
801 else if (ch < 10000)
802 incr = 2+4+1;
803 else if (ch < 100000)
804 incr = 2+5+1;
805 else if (ch < 1000000)
806 incr = 2+6+1;
807 else {
808 assert(ch <= MAX_UNICODE);
809 incr = 2+7+1;
810 }
811 if (size > PY_SSIZE_T_MAX - incr) {
812 PyErr_SetString(PyExc_OverflowError,
813 "encoded result is too long for a Python string");
814 return NULL;
815 }
816 size += incr;
817 }
818
Victor Stinnerad771582015-10-09 12:38:53 +0200819 str = _PyBytesWriter_Prepare(writer, str, size);
820 if (str == NULL)
821 return NULL;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200822
823 /* generate replacement */
824 for (i = collstart; i < collend; ++i) {
825 str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
826 }
827 return str;
828}
829
Thomas Wouters477c8d52006-05-27 19:21:47 +0000830/* --- Bloom Filters ----------------------------------------------------- */
831
/* Stuff to implement simple "bloom filters" for Unicode characters.
   To keep things simple, we use a single bitmask, using the low
   log2(BLOOM_WIDTH) bits (5, 6 or 7) of each Unicode character as the
   bit index. */
835
836/* the linebreak mask is set up by Unicode_Init below */
837
/* Number of bits of an unsigned long that are used by the bloom mask. */
#if LONG_BIT >= 128
#define BLOOM_WIDTH 128
#elif LONG_BIT >= 64
#define BLOOM_WIDTH 64
#elif LONG_BIT >= 32
#define BLOOM_WIDTH 32
#else
#error "LONG_BIT is smaller than 32"
#endif

/* Integer type holding a bloom mask. */
#define BLOOM_MASK unsigned long

/* Starts with every bit set, so BLOOM(bloom_linebreak, ch) is true for all
   characters until a real mask is stored. */
static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0;

/* Non-zero if the bit selected by the low bits of `ch` is set in `mask`.
   Both arguments are parenthesized so that expressions such as `a | b` can
   be passed as the mask without changing the meaning. */
#define BLOOM(mask, ch)     ((mask) & (1UL << ((ch) & (BLOOM_WIDTH - 1))))

/* Line-break test: table lookup below 128, otherwise the bloom mask is
   consulted first and Py_UNICODE_ISLINEBREAK() only on a possible hit. */
#define BLOOM_LINEBREAK(ch)                                             \
    ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000857
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700858static inline BLOOM_MASK
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200859make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000860{
Victor Stinnera85af502013-04-09 21:53:54 +0200861#define BLOOM_UPDATE(TYPE, MASK, PTR, LEN) \
862 do { \
863 TYPE *data = (TYPE *)PTR; \
864 TYPE *end = data + LEN; \
865 Py_UCS4 ch; \
866 for (; data != end; data++) { \
867 ch = *data; \
868 MASK |= (1UL << (ch & (BLOOM_WIDTH - 1))); \
869 } \
870 break; \
871 } while (0)
872
Thomas Wouters477c8d52006-05-27 19:21:47 +0000873 /* calculate simple bloom-style bitmask for a given unicode string */
874
Antoine Pitrouf068f942010-01-13 14:19:12 +0000875 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000876
877 mask = 0;
Victor Stinnera85af502013-04-09 21:53:54 +0200878 switch (kind) {
879 case PyUnicode_1BYTE_KIND:
880 BLOOM_UPDATE(Py_UCS1, mask, ptr, len);
881 break;
882 case PyUnicode_2BYTE_KIND:
883 BLOOM_UPDATE(Py_UCS2, mask, ptr, len);
884 break;
885 case PyUnicode_4BYTE_KIND:
886 BLOOM_UPDATE(Py_UCS4, mask, ptr, len);
887 break;
888 default:
Barry Warsawb2e57942017-09-14 18:13:16 -0700889 Py_UNREACHABLE();
Victor Stinnera85af502013-04-09 21:53:54 +0200890 }
Thomas Wouters477c8d52006-05-27 19:21:47 +0000891 return mask;
Victor Stinnera85af502013-04-09 21:53:54 +0200892
893#undef BLOOM_UPDATE
Thomas Wouters477c8d52006-05-27 19:21:47 +0000894}
895
Serhiy Storchaka21a663e2016-04-13 15:37:23 +0300896static int
897ensure_unicode(PyObject *obj)
898{
899 if (!PyUnicode_Check(obj)) {
900 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +0200901 "must be str, not %.100s",
902 Py_TYPE(obj)->tp_name);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +0300903 return -1;
904 }
905 return PyUnicode_READY(obj);
906}
907
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200908/* Compilation of templated routines */
909
910#include "stringlib/asciilib.h"
911#include "stringlib/fastsearch.h"
912#include "stringlib/partition.h"
913#include "stringlib/split.h"
914#include "stringlib/count.h"
915#include "stringlib/find.h"
916#include "stringlib/find_max_char.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200917#include "stringlib/undef.h"
918
919#include "stringlib/ucs1lib.h"
920#include "stringlib/fastsearch.h"
921#include "stringlib/partition.h"
922#include "stringlib/split.h"
923#include "stringlib/count.h"
924#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300925#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200926#include "stringlib/find_max_char.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200927#include "stringlib/undef.h"
928
929#include "stringlib/ucs2lib.h"
930#include "stringlib/fastsearch.h"
931#include "stringlib/partition.h"
932#include "stringlib/split.h"
933#include "stringlib/count.h"
934#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300935#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200936#include "stringlib/find_max_char.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200937#include "stringlib/undef.h"
938
939#include "stringlib/ucs4lib.h"
940#include "stringlib/fastsearch.h"
941#include "stringlib/partition.h"
942#include "stringlib/split.h"
943#include "stringlib/count.h"
944#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300945#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200946#include "stringlib/find_max_char.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200947#include "stringlib/undef.h"
948
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200949#include "stringlib/unicodedefs.h"
950#include "stringlib/fastsearch.h"
951#include "stringlib/count.h"
952#include "stringlib/find.h"
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100953#include "stringlib/undef.h"
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200954
Guido van Rossumd57fd912000-03-10 22:53:23 +0000955/* --- Unicode Object ----------------------------------------------------- */
956
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700957static inline Py_ssize_t
958findchar(const void *s, int kind,
959 Py_ssize_t size, Py_UCS4 ch,
960 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200961{
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200962 switch (kind) {
963 case PyUnicode_1BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200964 if ((Py_UCS1) ch != ch)
965 return -1;
966 if (direction > 0)
967 return ucs1lib_find_char((Py_UCS1 *) s, size, (Py_UCS1) ch);
968 else
969 return ucs1lib_rfind_char((Py_UCS1 *) s, size, (Py_UCS1) ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200970 case PyUnicode_2BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200971 if ((Py_UCS2) ch != ch)
972 return -1;
973 if (direction > 0)
974 return ucs2lib_find_char((Py_UCS2 *) s, size, (Py_UCS2) ch);
975 else
976 return ucs2lib_rfind_char((Py_UCS2 *) s, size, (Py_UCS2) ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200977 case PyUnicode_4BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200978 if (direction > 0)
979 return ucs4lib_find_char((Py_UCS4 *) s, size, ch);
980 else
981 return ucs4lib_rfind_char((Py_UCS4 *) s, size, ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200982 default:
Barry Warsawb2e57942017-09-14 18:13:16 -0700983 Py_UNREACHABLE();
Victor Stinner9e7a1bc2011-10-13 00:18:12 +0200984 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200985}
986
#ifdef Py_DEBUG
/* Debug helper: overwrite the characters added by a resize, i.e. the range
   [old_length, current length), with 0xff bytes so that code which forgets to
   initialize them is detected early.  Does nothing when the string did not
   grow.

   _PyUnicode_CheckConsistency(str, 1) detects invalid characters, at least for
   ASCII and UCS-4 strings. U+00FF is invalid in ASCII and U+FFFFFFFF is an
   invalid character in Unicode 6.0. */
static void
unicode_fill_invalid(PyObject *unicode, Py_ssize_t old_length)
{
    int char_size = PyUnicode_KIND(unicode);
    Py_UCS1 *bytes = PyUnicode_1BYTE_DATA(unicode);
    Py_ssize_t new_length = _PyUnicode_LENGTH(unicode);

    if (new_length > old_length) {
        memset(bytes + old_length * char_size, 0xff,
               (new_length - old_length) * char_size);
    }
}
#endif
1005
Victor Stinnerfe226c02011-10-03 03:52:20 +02001006static PyObject*
1007resize_compact(PyObject *unicode, Py_ssize_t length)
1008{
1009 Py_ssize_t char_size;
1010 Py_ssize_t struct_size;
1011 Py_ssize_t new_size;
1012 int share_wstr;
Victor Stinner84def372011-12-11 20:04:56 +01001013 PyObject *new_unicode;
Victor Stinnerafffce42012-10-03 23:03:17 +02001014#ifdef Py_DEBUG
1015 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
1016#endif
1017
Victor Stinner79891572012-05-03 13:43:07 +02001018 assert(unicode_modifiable(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001019 assert(PyUnicode_IS_READY(unicode));
Victor Stinner488fa492011-12-12 00:01:39 +01001020 assert(PyUnicode_IS_COMPACT(unicode));
1021
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001022 char_size = PyUnicode_KIND(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +01001023 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerfe226c02011-10-03 03:52:20 +02001024 struct_size = sizeof(PyASCIIObject);
1025 else
1026 struct_size = sizeof(PyCompactUnicodeObject);
Victor Stinnerc379ead2011-10-03 12:52:27 +02001027 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001028
Victor Stinnerfe226c02011-10-03 03:52:20 +02001029 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
1030 PyErr_NoMemory();
1031 return NULL;
1032 }
1033 new_size = (struct_size + (length + 1) * char_size);
1034
Serhiy Storchaka7aa69082015-12-03 01:02:03 +02001035 if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) {
1036 PyObject_DEL(_PyUnicode_UTF8(unicode));
1037 _PyUnicode_UTF8(unicode) = NULL;
1038 _PyUnicode_UTF8_LENGTH(unicode) = 0;
1039 }
Victor Stinner49932fe2020-02-03 17:55:05 +01001040#ifdef Py_REF_DEBUG
1041 _Py_RefTotal--;
1042#endif
1043#ifdef Py_TRACE_REFS
Victor Stinner84def372011-12-11 20:04:56 +01001044 _Py_ForgetReference(unicode);
Victor Stinner49932fe2020-02-03 17:55:05 +01001045#endif
Victor Stinner84def372011-12-11 20:04:56 +01001046
Serhiy Storchaka20b39b22014-09-28 11:27:24 +03001047 new_unicode = (PyObject *)PyObject_REALLOC(unicode, new_size);
Victor Stinner84def372011-12-11 20:04:56 +01001048 if (new_unicode == NULL) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001049 _Py_NewReference(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001050 PyErr_NoMemory();
1051 return NULL;
1052 }
Victor Stinner84def372011-12-11 20:04:56 +01001053 unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001054 _Py_NewReference(unicode);
Victor Stinner84def372011-12-11 20:04:56 +01001055
Victor Stinnerfe226c02011-10-03 03:52:20 +02001056 _PyUnicode_LENGTH(unicode) = length;
Victor Stinnerc379ead2011-10-03 12:52:27 +02001057 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001058 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +01001059 if (!PyUnicode_IS_ASCII(unicode))
Victor Stinnerc379ead2011-10-03 12:52:27 +02001060 _PyUnicode_WSTR_LENGTH(unicode) = length;
1061 }
Victor Stinnerbbbac2e2013-02-07 23:12:46 +01001062 else if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) {
1063 PyObject_DEL(_PyUnicode_WSTR(unicode));
1064 _PyUnicode_WSTR(unicode) = NULL;
Victor Stinner5bc03a62016-01-27 16:56:53 +01001065 if (!PyUnicode_IS_ASCII(unicode))
1066 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Victor Stinnerbbbac2e2013-02-07 23:12:46 +01001067 }
Victor Stinnerafffce42012-10-03 23:03:17 +02001068#ifdef Py_DEBUG
1069 unicode_fill_invalid(unicode, old_length);
1070#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001071 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
1072 length, 0);
Victor Stinner79891572012-05-03 13:43:07 +02001073 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001074 return unicode;
1075}
1076
Alexander Belopolsky40018472011-02-26 01:02:56 +00001077static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001078resize_inplace(PyObject *unicode, Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001079{
Victor Stinner95663112011-10-04 01:03:50 +02001080 wchar_t *wstr;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001081 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001082 assert(!PyUnicode_IS_COMPACT(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001083 assert(Py_REFCNT(unicode) == 1);
Tim Petersced69f82003-09-16 20:30:58 +00001084
Victor Stinnerfe226c02011-10-03 03:52:20 +02001085 if (PyUnicode_IS_READY(unicode)) {
1086 Py_ssize_t char_size;
Victor Stinner1c8d0c72011-10-03 12:11:00 +02001087 int share_wstr, share_utf8;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001088 void *data;
Victor Stinnerafffce42012-10-03 23:03:17 +02001089#ifdef Py_DEBUG
1090 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
1091#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001092
1093 data = _PyUnicode_DATA_ANY(unicode);
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001094 char_size = PyUnicode_KIND(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +02001095 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
1096 share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001097
1098 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
1099 PyErr_NoMemory();
1100 return -1;
1101 }
1102 new_size = (length + 1) * char_size;
1103
Victor Stinner7a9105a2011-12-12 00:13:42 +01001104 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
1105 {
1106 PyObject_DEL(_PyUnicode_UTF8(unicode));
1107 _PyUnicode_UTF8(unicode) = NULL;
1108 _PyUnicode_UTF8_LENGTH(unicode) = 0;
1109 }
1110
Victor Stinnerfe226c02011-10-03 03:52:20 +02001111 data = (PyObject *)PyObject_REALLOC(data, new_size);
1112 if (data == NULL) {
1113 PyErr_NoMemory();
1114 return -1;
1115 }
1116 _PyUnicode_DATA_ANY(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +02001117 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001118 _PyUnicode_WSTR(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +02001119 _PyUnicode_WSTR_LENGTH(unicode) = length;
1120 }
1121 if (share_utf8) {
Victor Stinner1c8d0c72011-10-03 12:11:00 +02001122 _PyUnicode_UTF8(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +02001123 _PyUnicode_UTF8_LENGTH(unicode) = length;
1124 }
Victor Stinnerfe226c02011-10-03 03:52:20 +02001125 _PyUnicode_LENGTH(unicode) = length;
1126 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
Victor Stinnerafffce42012-10-03 23:03:17 +02001127#ifdef Py_DEBUG
1128 unicode_fill_invalid(unicode, old_length);
1129#endif
Victor Stinner95663112011-10-04 01:03:50 +02001130 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001131 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001132 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001133 }
Victor Stinnerfe226c02011-10-03 03:52:20 +02001134 }
Victor Stinner95663112011-10-04 01:03:50 +02001135 assert(_PyUnicode_WSTR(unicode) != NULL);
1136
1137 /* check for integer overflow */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -07001138 if (length > PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) - 1) {
Victor Stinner95663112011-10-04 01:03:50 +02001139 PyErr_NoMemory();
1140 return -1;
1141 }
Victor Stinner7a9105a2011-12-12 00:13:42 +01001142 new_size = sizeof(wchar_t) * (length + 1);
Victor Stinner95663112011-10-04 01:03:50 +02001143 wstr = _PyUnicode_WSTR(unicode);
Victor Stinner7a9105a2011-12-12 00:13:42 +01001144 wstr = PyObject_REALLOC(wstr, new_size);
Victor Stinner95663112011-10-04 01:03:50 +02001145 if (!wstr) {
1146 PyErr_NoMemory();
1147 return -1;
1148 }
1149 _PyUnicode_WSTR(unicode) = wstr;
1150 _PyUnicode_WSTR(unicode)[length] = 0;
1151 _PyUnicode_WSTR_LENGTH(unicode) = length;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001152 assert(_PyUnicode_CheckConsistency(unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001153 return 0;
1154}
1155
Victor Stinnerfe226c02011-10-03 03:52:20 +02001156static PyObject*
1157resize_copy(PyObject *unicode, Py_ssize_t length)
1158{
1159 Py_ssize_t copy_length;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001160 if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001161 PyObject *copy;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001162
Serhiy Storchaka2e58f1a2016-10-09 23:44:48 +03001163 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001164
1165 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
1166 if (copy == NULL)
1167 return NULL;
1168
1169 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
Victor Stinnerd3f08822012-05-29 12:57:52 +02001170 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, copy_length);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001171 return copy;
Victor Stinner8cfcbed2011-10-03 23:19:21 +02001172 }
1173 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001174 PyObject *w;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001175
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001176 w = (PyObject*)_PyUnicode_New(length);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001177 if (w == NULL)
1178 return NULL;
1179 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
1180 copy_length = Py_MIN(copy_length, length);
Christian Heimesf051e432016-09-13 20:22:02 +02001181 memcpy(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
Victor Stinnerc6cf1ba2012-10-23 02:54:47 +02001182 copy_length * sizeof(wchar_t));
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001183 return w;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001184 }
1185}
1186
Guido van Rossumd57fd912000-03-10 22:53:23 +00001187/* We allocate one more byte to make sure the string is
Martin v. Löwis47383402007-08-15 07:32:56 +00001188 Ux0000 terminated; some code (e.g. new_identifier)
1189 relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +00001190
1191 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +00001192 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +00001193
1194*/
1195
Alexander Belopolsky40018472011-02-26 01:02:56 +00001196static PyUnicodeObject *
1197_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001198{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001199 PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001200 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001201
Thomas Wouters477c8d52006-05-27 19:21:47 +00001202 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001203 if (length == 0 && unicode_empty != NULL) {
1204 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001205 return (PyUnicodeObject*)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001206 }
1207
Neal Norwitz3ce5d922008-08-24 07:08:55 +00001208 /* Ensure we won't overflow the size. */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -07001209 if (length > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00001210 return (PyUnicodeObject *)PyErr_NoMemory();
1211 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001212 if (length < 0) {
1213 PyErr_SetString(PyExc_SystemError,
1214 "Negative size passed to _PyUnicode_New");
1215 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001216 }
1217
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001218 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
1219 if (unicode == NULL)
1220 return NULL;
1221 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
Victor Stinner68b674c2013-10-29 19:31:43 +01001222
1223 _PyUnicode_WSTR_LENGTH(unicode) = length;
1224 _PyUnicode_HASH(unicode) = -1;
1225 _PyUnicode_STATE(unicode).interned = 0;
1226 _PyUnicode_STATE(unicode).kind = 0;
1227 _PyUnicode_STATE(unicode).compact = 0;
1228 _PyUnicode_STATE(unicode).ready = 0;
1229 _PyUnicode_STATE(unicode).ascii = 0;
1230 _PyUnicode_DATA_ANY(unicode) = NULL;
1231 _PyUnicode_LENGTH(unicode) = 0;
1232 _PyUnicode_UTF8(unicode) = NULL;
1233 _PyUnicode_UTF8_LENGTH(unicode) = 0;
1234
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001235 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
1236 if (!_PyUnicode_WSTR(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001237 Py_DECREF(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00001238 PyErr_NoMemory();
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001239 return NULL;
Guido van Rossum3c1bb802000-04-27 20:13:50 +00001240 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001241
Jeremy Hyltond8082792003-09-16 19:41:39 +00001242 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +00001243 * the caller fails before initializing str -- unicode_resize()
1244 * reads str[0], and the Keep-Alive optimization can keep memory
1245 * allocated for str alive across a call to unicode_dealloc(unicode).
1246 * We don't want unicode_resize to read uninitialized memory in
1247 * that case.
1248 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001249 _PyUnicode_WSTR(unicode)[0] = 0;
1250 _PyUnicode_WSTR(unicode)[length] = 0;
Victor Stinner68b674c2013-10-29 19:31:43 +01001251
Victor Stinner7931d9a2011-11-04 00:22:48 +01001252 assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001253 return unicode;
1254}
1255
Victor Stinnerf42dc442011-10-02 23:33:16 +02001256static const char*
1257unicode_kind_name(PyObject *unicode)
1258{
Victor Stinner42dfd712011-10-03 14:41:45 +02001259 /* don't check consistency: unicode_kind_name() is called from
1260 _PyUnicode_Dump() */
Victor Stinnerf42dc442011-10-02 23:33:16 +02001261 if (!PyUnicode_IS_COMPACT(unicode))
1262 {
1263 if (!PyUnicode_IS_READY(unicode))
1264 return "wstr";
Benjamin Petersonead6b532011-12-20 17:23:42 -06001265 switch (PyUnicode_KIND(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001266 {
1267 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001268 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001269 return "legacy ascii";
1270 else
1271 return "legacy latin1";
1272 case PyUnicode_2BYTE_KIND:
1273 return "legacy UCS2";
1274 case PyUnicode_4BYTE_KIND:
1275 return "legacy UCS4";
1276 default:
1277 return "<legacy invalid kind>";
1278 }
1279 }
1280 assert(PyUnicode_IS_READY(unicode));
Benjamin Petersonead6b532011-12-20 17:23:42 -06001281 switch (PyUnicode_KIND(unicode)) {
Victor Stinnerf42dc442011-10-02 23:33:16 +02001282 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001283 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001284 return "ascii";
1285 else
Victor Stinnera3b334d2011-10-03 13:53:37 +02001286 return "latin1";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001287 case PyUnicode_2BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001288 return "UCS2";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001289 case PyUnicode_4BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001290 return "UCS4";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001291 default:
1292 return "<invalid compact kind>";
1293 }
1294}
1295
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001296#ifdef Py_DEBUG
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001297/* Functions wrapping macros for use in debugger */
Victor Stinnera42de742018-11-22 10:25:22 +01001298char *_PyUnicode_utf8(void *unicode_raw){
1299 PyObject *unicode = _PyObject_CAST(unicode_raw);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001300 return PyUnicode_UTF8(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001301}
1302
Victor Stinnera42de742018-11-22 10:25:22 +01001303void *_PyUnicode_compact_data(void *unicode_raw) {
1304 PyObject *unicode = _PyObject_CAST(unicode_raw);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001305 return _PyUnicode_COMPACT_DATA(unicode);
1306}
Victor Stinnera42de742018-11-22 10:25:22 +01001307void *_PyUnicode_data(void *unicode_raw) {
1308 PyObject *unicode = _PyObject_CAST(unicode_raw);
Zackery Spytz1a2252e2019-05-06 10:56:51 -06001309 printf("obj %p\n", (void*)unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001310 printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
1311 printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
1312 printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
1313 printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
1314 printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
1315 return PyUnicode_DATA(unicode);
1316}
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001317
1318void
1319_PyUnicode_Dump(PyObject *op)
1320{
1321 PyASCIIObject *ascii = (PyASCIIObject *)op;
Victor Stinnera849a4b2011-10-03 12:12:11 +02001322 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
1323 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
1324 void *data;
Victor Stinner0d60e872011-10-23 19:47:19 +02001325
Victor Stinnera849a4b2011-10-03 12:12:11 +02001326 if (ascii->state.compact)
Victor Stinner0d60e872011-10-23 19:47:19 +02001327 {
1328 if (ascii->state.ascii)
1329 data = (ascii + 1);
1330 else
1331 data = (compact + 1);
1332 }
Victor Stinnera849a4b2011-10-03 12:12:11 +02001333 else
1334 data = unicode->data.any;
Victor Stinner293f3f52014-07-01 08:57:10 +02001335 printf("%s: len=%" PY_FORMAT_SIZE_T "u, ",
1336 unicode_kind_name(op), ascii->length);
Victor Stinner0d60e872011-10-23 19:47:19 +02001337
Victor Stinnera849a4b2011-10-03 12:12:11 +02001338 if (ascii->wstr == data)
1339 printf("shared ");
Zackery Spytz1a2252e2019-05-06 10:56:51 -06001340 printf("wstr=%p", (void *)ascii->wstr);
Victor Stinner0d60e872011-10-23 19:47:19 +02001341
Victor Stinnera3b334d2011-10-03 13:53:37 +02001342 if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
Victor Stinner293f3f52014-07-01 08:57:10 +02001343 printf(" (%" PY_FORMAT_SIZE_T "u), ", compact->wstr_length);
Victor Stinnera849a4b2011-10-03 12:12:11 +02001344 if (!ascii->state.compact && compact->utf8 == unicode->data.any)
1345 printf("shared ");
Victor Stinner293f3f52014-07-01 08:57:10 +02001346 printf("utf8=%p (%" PY_FORMAT_SIZE_T "u)",
Zackery Spytz1a2252e2019-05-06 10:56:51 -06001347 (void *)compact->utf8, compact->utf8_length);
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001348 }
Victor Stinnera849a4b2011-10-03 12:12:11 +02001349 printf(", data=%p\n", data);
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001350}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001351#endif
1352
1353PyObject *
1354PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
1355{
1356 PyObject *obj;
1357 PyCompactUnicodeObject *unicode;
1358 void *data;
Victor Stinner8f825062012-04-27 13:55:39 +02001359 enum PyUnicode_Kind kind;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001360 int is_sharing, is_ascii;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001361 Py_ssize_t char_size;
1362 Py_ssize_t struct_size;
1363
1364 /* Optimization for empty strings */
1365 if (size == 0 && unicode_empty != NULL) {
1366 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001367 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001368 }
1369
Victor Stinner9e9d6892011-10-04 01:02:02 +02001370 is_ascii = 0;
1371 is_sharing = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001372 struct_size = sizeof(PyCompactUnicodeObject);
1373 if (maxchar < 128) {
Victor Stinner8f825062012-04-27 13:55:39 +02001374 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001375 char_size = 1;
1376 is_ascii = 1;
1377 struct_size = sizeof(PyASCIIObject);
1378 }
1379 else if (maxchar < 256) {
Victor Stinner8f825062012-04-27 13:55:39 +02001380 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001381 char_size = 1;
1382 }
1383 else if (maxchar < 65536) {
Victor Stinner8f825062012-04-27 13:55:39 +02001384 kind = PyUnicode_2BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001385 char_size = 2;
1386 if (sizeof(wchar_t) == 2)
1387 is_sharing = 1;
1388 }
1389 else {
Victor Stinnerc9590ad2012-03-04 01:34:37 +01001390 if (maxchar > MAX_UNICODE) {
1391 PyErr_SetString(PyExc_SystemError,
1392 "invalid maximum character passed to PyUnicode_New");
1393 return NULL;
1394 }
Victor Stinner8f825062012-04-27 13:55:39 +02001395 kind = PyUnicode_4BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001396 char_size = 4;
1397 if (sizeof(wchar_t) == 4)
1398 is_sharing = 1;
1399 }
1400
1401 /* Ensure we won't overflow the size. */
1402 if (size < 0) {
1403 PyErr_SetString(PyExc_SystemError,
1404 "Negative size passed to PyUnicode_New");
1405 return NULL;
1406 }
1407 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1408 return PyErr_NoMemory();
1409
1410 /* Duplicated allocation code from _PyObject_New() instead of a call to
1411 * PyObject_New() so we are able to allocate space for the object and
1412 * it's data buffer.
1413 */
1414 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
1415 if (obj == NULL)
1416 return PyErr_NoMemory();
1417 obj = PyObject_INIT(obj, &PyUnicode_Type);
1418 if (obj == NULL)
1419 return NULL;
1420
1421 unicode = (PyCompactUnicodeObject *)obj;
1422 if (is_ascii)
1423 data = ((PyASCIIObject*)obj) + 1;
1424 else
1425 data = unicode + 1;
1426 _PyUnicode_LENGTH(unicode) = size;
1427 _PyUnicode_HASH(unicode) = -1;
1428 _PyUnicode_STATE(unicode).interned = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001429 _PyUnicode_STATE(unicode).kind = kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001430 _PyUnicode_STATE(unicode).compact = 1;
1431 _PyUnicode_STATE(unicode).ready = 1;
1432 _PyUnicode_STATE(unicode).ascii = is_ascii;
1433 if (is_ascii) {
1434 ((char*)data)[size] = 0;
1435 _PyUnicode_WSTR(unicode) = NULL;
1436 }
Victor Stinner8f825062012-04-27 13:55:39 +02001437 else if (kind == PyUnicode_1BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001438 ((char*)data)[size] = 0;
1439 _PyUnicode_WSTR(unicode) = NULL;
1440 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001441 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001442 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001443 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001444 else {
1445 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001446 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001447 if (kind == PyUnicode_2BYTE_KIND)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001448 ((Py_UCS2*)data)[size] = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001449 else /* kind == PyUnicode_4BYTE_KIND */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001450 ((Py_UCS4*)data)[size] = 0;
1451 if (is_sharing) {
1452 _PyUnicode_WSTR_LENGTH(unicode) = size;
1453 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
1454 }
1455 else {
1456 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1457 _PyUnicode_WSTR(unicode) = NULL;
1458 }
1459 }
Victor Stinner8f825062012-04-27 13:55:39 +02001460#ifdef Py_DEBUG
Victor Stinnerafffce42012-10-03 23:03:17 +02001461 unicode_fill_invalid((PyObject*)unicode, 0);
Victor Stinner8f825062012-04-27 13:55:39 +02001462#endif
Victor Stinner7931d9a2011-11-04 00:22:48 +01001463 assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001464 return obj;
1465}
1466
1467#if SIZEOF_WCHAR_T == 2
/* Helper function to convert a 16-bit wchar_t representation to UCS4.
   It decodes surrogate pairs; the other conversions are implemented as
   macros for efficiency.

   This function assumes that unicode can hold one more code point than wstr
   characters for a terminating null character. */
Victor Stinnerc53be962011-10-02 21:33:54 +02001474static void
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001475unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001476 PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001477{
1478 const wchar_t *iter;
1479 Py_UCS4 *ucs4_out;
1480
Victor Stinner910337b2011-10-03 03:20:16 +02001481 assert(unicode != NULL);
1482 assert(_PyUnicode_CHECK(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001483 assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
1484 ucs4_out = PyUnicode_4BYTE_DATA(unicode);
1485
1486 for (iter = begin; iter < end; ) {
1487 assert(ucs4_out < (PyUnicode_4BYTE_DATA(unicode) +
1488 _PyUnicode_GET_LENGTH(unicode)));
Victor Stinner551ac952011-11-29 22:58:13 +01001489 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1490 && (iter+1) < end
1491 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001492 {
Victor Stinner551ac952011-11-29 22:58:13 +01001493 *ucs4_out++ = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001494 iter += 2;
1495 }
1496 else {
1497 *ucs4_out++ = *iter;
1498 iter++;
1499 }
1500 }
1501 assert(ucs4_out == (PyUnicode_4BYTE_DATA(unicode) +
1502 _PyUnicode_GET_LENGTH(unicode)));
1503
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001504}
1505#endif
1506
Victor Stinnercd9950f2011-10-02 00:34:53 +02001507static int
Victor Stinner488fa492011-12-12 00:01:39 +01001508unicode_check_modifiable(PyObject *unicode)
Victor Stinnercd9950f2011-10-02 00:34:53 +02001509{
Victor Stinner488fa492011-12-12 00:01:39 +01001510 if (!unicode_modifiable(unicode)) {
Victor Stinner01698042011-10-04 00:04:26 +02001511 PyErr_SetString(PyExc_SystemError,
Victor Stinner488fa492011-12-12 00:01:39 +01001512 "Cannot modify a string currently used");
Victor Stinnercd9950f2011-10-02 00:34:53 +02001513 return -1;
1514 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02001515 return 0;
1516}
1517
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001518static int
1519_copy_characters(PyObject *to, Py_ssize_t to_start,
1520 PyObject *from, Py_ssize_t from_start,
1521 Py_ssize_t how_many, int check_maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001522{
Victor Stinnera0702ab2011-09-29 14:14:38 +02001523 unsigned int from_kind, to_kind;
1524 void *from_data, *to_data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001525
Victor Stinneree4544c2012-05-09 22:24:08 +02001526 assert(0 <= how_many);
1527 assert(0 <= from_start);
1528 assert(0 <= to_start);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001529 assert(PyUnicode_Check(from));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001530 assert(PyUnicode_IS_READY(from));
Victor Stinneree4544c2012-05-09 22:24:08 +02001531 assert(from_start + how_many <= PyUnicode_GET_LENGTH(from));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001532
Victor Stinnerd3f08822012-05-29 12:57:52 +02001533 assert(PyUnicode_Check(to));
1534 assert(PyUnicode_IS_READY(to));
1535 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1536
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001537 if (how_many == 0)
1538 return 0;
1539
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001540 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001541 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001542 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001543 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001544
Victor Stinnerf1852262012-06-16 16:38:26 +02001545#ifdef Py_DEBUG
1546 if (!check_maxchar
1547 && PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to))
1548 {
1549 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1550 Py_UCS4 ch;
1551 Py_ssize_t i;
1552 for (i=0; i < how_many; i++) {
1553 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1554 assert(ch <= to_maxchar);
1555 }
1556 }
1557#endif
1558
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001559 if (from_kind == to_kind) {
Victor Stinnerf1852262012-06-16 16:38:26 +02001560 if (check_maxchar
1561 && !PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))
1562 {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001563 /* Writing Latin-1 characters into an ASCII string requires to
1564 check that all written characters are pure ASCII */
Victor Stinnerf1852262012-06-16 16:38:26 +02001565 Py_UCS4 max_char;
1566 max_char = ucs1lib_find_max_char(from_data,
1567 (Py_UCS1*)from_data + how_many);
1568 if (max_char >= 128)
1569 return -1;
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001570 }
Christian Heimesf051e432016-09-13 20:22:02 +02001571 memcpy((char*)to_data + to_kind * to_start,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001572 (char*)from_data + from_kind * from_start,
1573 to_kind * how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001574 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001575 else if (from_kind == PyUnicode_1BYTE_KIND
1576 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001577 {
1578 _PyUnicode_CONVERT_BYTES(
1579 Py_UCS1, Py_UCS2,
1580 PyUnicode_1BYTE_DATA(from) + from_start,
1581 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1582 PyUnicode_2BYTE_DATA(to) + to_start
1583 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001584 }
Victor Stinner157f83f2011-09-28 21:41:31 +02001585 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001586 && to_kind == PyUnicode_4BYTE_KIND)
1587 {
1588 _PyUnicode_CONVERT_BYTES(
1589 Py_UCS1, Py_UCS4,
1590 PyUnicode_1BYTE_DATA(from) + from_start,
1591 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1592 PyUnicode_4BYTE_DATA(to) + to_start
1593 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001594 }
1595 else if (from_kind == PyUnicode_2BYTE_KIND
1596 && to_kind == PyUnicode_4BYTE_KIND)
1597 {
1598 _PyUnicode_CONVERT_BYTES(
1599 Py_UCS2, Py_UCS4,
1600 PyUnicode_2BYTE_DATA(from) + from_start,
1601 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1602 PyUnicode_4BYTE_DATA(to) + to_start
1603 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001604 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001605 else {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001606 assert (PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to));
1607
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001608 if (!check_maxchar) {
1609 if (from_kind == PyUnicode_2BYTE_KIND
1610 && to_kind == PyUnicode_1BYTE_KIND)
1611 {
1612 _PyUnicode_CONVERT_BYTES(
1613 Py_UCS2, Py_UCS1,
1614 PyUnicode_2BYTE_DATA(from) + from_start,
1615 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1616 PyUnicode_1BYTE_DATA(to) + to_start
1617 );
1618 }
1619 else if (from_kind == PyUnicode_4BYTE_KIND
1620 && to_kind == PyUnicode_1BYTE_KIND)
1621 {
1622 _PyUnicode_CONVERT_BYTES(
1623 Py_UCS4, Py_UCS1,
1624 PyUnicode_4BYTE_DATA(from) + from_start,
1625 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1626 PyUnicode_1BYTE_DATA(to) + to_start
1627 );
1628 }
1629 else if (from_kind == PyUnicode_4BYTE_KIND
1630 && to_kind == PyUnicode_2BYTE_KIND)
1631 {
1632 _PyUnicode_CONVERT_BYTES(
1633 Py_UCS4, Py_UCS2,
1634 PyUnicode_4BYTE_DATA(from) + from_start,
1635 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1636 PyUnicode_2BYTE_DATA(to) + to_start
1637 );
1638 }
1639 else {
Barry Warsawb2e57942017-09-14 18:13:16 -07001640 Py_UNREACHABLE();
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001641 }
1642 }
Victor Stinnerf1852262012-06-16 16:38:26 +02001643 else {
Victor Stinnera0702ab2011-09-29 14:14:38 +02001644 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001645 Py_UCS4 ch;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001646 Py_ssize_t i;
1647
Victor Stinnera0702ab2011-09-29 14:14:38 +02001648 for (i=0; i < how_many; i++) {
1649 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001650 if (ch > to_maxchar)
1651 return -1;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001652 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1653 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001654 }
1655 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001656 return 0;
1657}
1658
Victor Stinnerd3f08822012-05-29 12:57:52 +02001659void
1660_PyUnicode_FastCopyCharacters(
1661 PyObject *to, Py_ssize_t to_start,
1662 PyObject *from, Py_ssize_t from_start, Py_ssize_t how_many)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001663{
1664 (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1665}
1666
1667Py_ssize_t
1668PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1669 PyObject *from, Py_ssize_t from_start,
1670 Py_ssize_t how_many)
1671{
1672 int err;
1673
1674 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1675 PyErr_BadInternalCall();
1676 return -1;
1677 }
1678
Benjamin Petersonbac79492012-01-14 13:34:47 -05001679 if (PyUnicode_READY(from) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001680 return -1;
Benjamin Petersonbac79492012-01-14 13:34:47 -05001681 if (PyUnicode_READY(to) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001682 return -1;
1683
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001684 if ((size_t)from_start > (size_t)PyUnicode_GET_LENGTH(from)) {
Victor Stinnerd3f08822012-05-29 12:57:52 +02001685 PyErr_SetString(PyExc_IndexError, "string index out of range");
1686 return -1;
1687 }
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001688 if ((size_t)to_start > (size_t)PyUnicode_GET_LENGTH(to)) {
Victor Stinnerd3f08822012-05-29 12:57:52 +02001689 PyErr_SetString(PyExc_IndexError, "string index out of range");
1690 return -1;
1691 }
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001692 if (how_many < 0) {
1693 PyErr_SetString(PyExc_SystemError, "how_many cannot be negative");
1694 return -1;
1695 }
1696 how_many = Py_MIN(PyUnicode_GET_LENGTH(from)-from_start, how_many);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001697 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1698 PyErr_Format(PyExc_SystemError,
Victor Stinnera33bce02014-07-04 22:47:46 +02001699 "Cannot write %zi characters at %zi "
1700 "in a string of %zi characters",
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001701 how_many, to_start, PyUnicode_GET_LENGTH(to));
1702 return -1;
1703 }
1704
1705 if (how_many == 0)
1706 return 0;
1707
Victor Stinner488fa492011-12-12 00:01:39 +01001708 if (unicode_check_modifiable(to))
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001709 return -1;
1710
1711 err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1712 if (err) {
1713 PyErr_Format(PyExc_SystemError,
1714 "Cannot copy %s characters "
1715 "into a string of %s characters",
1716 unicode_kind_name(from),
1717 unicode_kind_name(to));
1718 return -1;
1719 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001720 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001721}
1722
Victor Stinner17222162011-09-28 22:15:37 +02001723/* Find the maximum code point and count the number of surrogate pairs so a
1724 correct string length can be computed before converting a string to UCS4.
1725 This function counts single surrogates as a character and not as a pair.
1726
1727 Return 0 on success, or -1 on error. */
1728static int
1729find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1730 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001731{
1732 const wchar_t *iter;
Victor Stinner8faf8212011-12-08 22:14:11 +01001733 Py_UCS4 ch;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001734
Victor Stinnerc53be962011-10-02 21:33:54 +02001735 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001736 *num_surrogates = 0;
1737 *maxchar = 0;
1738
1739 for (iter = begin; iter < end; ) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001740#if SIZEOF_WCHAR_T == 2
Victor Stinnercf77da92013-03-06 01:09:24 +01001741 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1742 && (iter+1) < end
1743 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
1744 {
1745 ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
1746 ++(*num_surrogates);
1747 iter += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001748 }
1749 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001750#endif
Victor Stinner8faf8212011-12-08 22:14:11 +01001751 {
1752 ch = *iter;
1753 iter++;
1754 }
1755 if (ch > *maxchar) {
1756 *maxchar = ch;
1757 if (*maxchar > MAX_UNICODE) {
1758 PyErr_Format(PyExc_ValueError,
1759 "character U+%x is not in range [U+0000; U+10ffff]",
1760 ch);
1761 return -1;
1762 }
1763 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001764 }
1765 return 0;
1766}
1767
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001768int
1769_PyUnicode_Ready(PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001770{
1771 wchar_t *end;
1772 Py_UCS4 maxchar = 0;
1773 Py_ssize_t num_surrogates;
1774#if SIZEOF_WCHAR_T == 2
1775 Py_ssize_t length_wo_surrogates;
1776#endif
1777
Georg Brandl7597add2011-10-05 16:36:47 +02001778 /* _PyUnicode_Ready() is only intended for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +02001779 strings were created using _PyObject_New() and where no canonical
1780 representation (the str field) has been set yet aka strings
1781 which are not yet ready. */
Victor Stinner910337b2011-10-03 03:20:16 +02001782 assert(_PyUnicode_CHECK(unicode));
1783 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001784 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerc3c74152011-10-02 20:39:55 +02001785 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001786 assert(_PyUnicode_UTF8(unicode) == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +02001787 /* Actually, it should neither be interned nor be anything else: */
1788 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001789
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001790 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +02001791 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +02001792 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001793 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001794
1795 if (maxchar < 256) {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001796 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1797 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001798 PyErr_NoMemory();
1799 return -1;
1800 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001801 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001802 _PyUnicode_WSTR(unicode), end,
1803 PyUnicode_1BYTE_DATA(unicode));
1804 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1805 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1806 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1807 if (maxchar < 128) {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001808 _PyUnicode_STATE(unicode).ascii = 1;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001809 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001810 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001811 }
1812 else {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001813 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001814 _PyUnicode_UTF8(unicode) = NULL;
1815 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001816 }
1817 PyObject_FREE(_PyUnicode_WSTR(unicode));
1818 _PyUnicode_WSTR(unicode) = NULL;
1819 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1820 }
1821 /* In this case we might have to convert down from 4-byte native
1822 wchar_t to 2-byte unicode. */
1823 else if (maxchar < 65536) {
1824 assert(num_surrogates == 0 &&
1825 "FindMaxCharAndNumSurrogatePairs() messed up");
1826
Victor Stinner506f5922011-09-28 22:34:18 +02001827#if SIZEOF_WCHAR_T == 2
1828 /* We can share representations and are done. */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001829 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Victor Stinner506f5922011-09-28 22:34:18 +02001830 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1831 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1832 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001833 _PyUnicode_UTF8(unicode) = NULL;
1834 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001835#else
1836 /* sizeof(wchar_t) == 4 */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001837 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
Victor Stinner506f5922011-09-28 22:34:18 +02001838 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
Victor Stinnerc3c74152011-10-02 20:39:55 +02001839 if (!_PyUnicode_DATA_ANY(unicode)) {
Victor Stinner506f5922011-09-28 22:34:18 +02001840 PyErr_NoMemory();
1841 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001842 }
Victor Stinner506f5922011-09-28 22:34:18 +02001843 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1844 _PyUnicode_WSTR(unicode), end,
1845 PyUnicode_2BYTE_DATA(unicode));
1846 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1847 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1848 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001849 _PyUnicode_UTF8(unicode) = NULL;
1850 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001851 PyObject_FREE(_PyUnicode_WSTR(unicode));
1852 _PyUnicode_WSTR(unicode) = NULL;
1853 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1854#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001855 }
1856 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
1857 else {
1858#if SIZEOF_WCHAR_T == 2
1859 /* in case the native representation is 2-bytes, we need to allocate a
1860 new normalized 4-byte version. */
1861 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
Serhiy Storchakae55181f2015-02-20 21:34:06 +02001862 if (length_wo_surrogates > PY_SSIZE_T_MAX / 4 - 1) {
1863 PyErr_NoMemory();
1864 return -1;
1865 }
Victor Stinnerc3c74152011-10-02 20:39:55 +02001866 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1867 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001868 PyErr_NoMemory();
1869 return -1;
1870 }
1871 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1872 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001873 _PyUnicode_UTF8(unicode) = NULL;
1874 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner126c5592011-10-03 04:17:10 +02001875 /* unicode_convert_wchar_to_ucs4() requires a ready string */
1876 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerc53be962011-10-02 21:33:54 +02001877 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001878 PyObject_FREE(_PyUnicode_WSTR(unicode));
1879 _PyUnicode_WSTR(unicode) = NULL;
1880 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1881#else
1882 assert(num_surrogates == 0);
1883
Victor Stinnerc3c74152011-10-02 20:39:55 +02001884 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001885 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001886 _PyUnicode_UTF8(unicode) = NULL;
1887 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001888 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1889#endif
1890 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1891 }
1892 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001893 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001894 return 0;
1895}
1896
Alexander Belopolsky40018472011-02-26 01:02:56 +00001897static void
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001898unicode_dealloc(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001899{
Walter Dörwald16807132007-05-25 13:52:07 +00001900 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001901 case SSTATE_NOT_INTERNED:
1902 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001903
Benjamin Peterson29060642009-01-31 22:14:21 +00001904 case SSTATE_INTERNED_MORTAL:
1905 /* revive dead object temporarily for DelItem */
1906 Py_REFCNT(unicode) = 3;
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001907 if (PyDict_DelItem(interned, unicode) != 0) {
1908 _PyErr_WriteUnraisableMsg("deletion of interned string failed",
1909 NULL);
1910 }
Benjamin Peterson29060642009-01-31 22:14:21 +00001911 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001912
Benjamin Peterson29060642009-01-31 22:14:21 +00001913 case SSTATE_INTERNED_IMMORTAL:
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001914 _PyObject_ASSERT_FAILED_MSG(unicode, "Immortal interned string died");
1915 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001916
Benjamin Peterson29060642009-01-31 22:14:21 +00001917 default:
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001918 Py_UNREACHABLE();
Walter Dörwald16807132007-05-25 13:52:07 +00001919 }
1920
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001921 if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001922 PyObject_DEL(_PyUnicode_WSTR(unicode));
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001923 }
1924 if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) {
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001925 PyObject_DEL(_PyUnicode_UTF8(unicode));
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001926 }
1927 if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001928 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001929 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001930
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001931 Py_TYPE(unicode)->tp_free(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001932}
1933
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001934#ifdef Py_DEBUG
1935static int
1936unicode_is_singleton(PyObject *unicode)
1937{
1938 PyASCIIObject *ascii = (PyASCIIObject *)unicode;
1939 if (unicode == unicode_empty)
1940 return 1;
1941 if (ascii->state.kind != PyUnicode_WCHAR_KIND && ascii->length == 1)
1942 {
1943 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0);
1944 if (ch < 256 && unicode_latin1[ch] == unicode)
1945 return 1;
1946 }
1947 return 0;
1948}
1949#endif
1950
Alexander Belopolsky40018472011-02-26 01:02:56 +00001951static int
Victor Stinner488fa492011-12-12 00:01:39 +01001952unicode_modifiable(PyObject *unicode)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001953{
Victor Stinner488fa492011-12-12 00:01:39 +01001954 assert(_PyUnicode_CHECK(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001955 if (Py_REFCNT(unicode) != 1)
1956 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001957 if (_PyUnicode_HASH(unicode) != -1)
1958 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001959 if (PyUnicode_CHECK_INTERNED(unicode))
1960 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001961 if (!PyUnicode_CheckExact(unicode))
1962 return 0;
Victor Stinner77bb47b2011-10-03 20:06:05 +02001963#ifdef Py_DEBUG
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001964 /* singleton refcount is greater than 1 */
1965 assert(!unicode_is_singleton(unicode));
Victor Stinner77bb47b2011-10-03 20:06:05 +02001966#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001967 return 1;
1968}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001969
Victor Stinnerfe226c02011-10-03 03:52:20 +02001970static int
1971unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1972{
1973 PyObject *unicode;
1974 Py_ssize_t old_length;
1975
1976 assert(p_unicode != NULL);
1977 unicode = *p_unicode;
1978
1979 assert(unicode != NULL);
1980 assert(PyUnicode_Check(unicode));
1981 assert(0 <= length);
1982
Victor Stinner910337b2011-10-03 03:20:16 +02001983 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001984 old_length = PyUnicode_WSTR_LENGTH(unicode);
1985 else
1986 old_length = PyUnicode_GET_LENGTH(unicode);
1987 if (old_length == length)
1988 return 0;
1989
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001990 if (length == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +02001991 _Py_INCREF_UNICODE_EMPTY();
1992 if (!unicode_empty)
Benjamin Peterson29060642009-01-31 22:14:21 +00001993 return -1;
Serhiy Storchaka57a01d32016-04-10 18:05:40 +03001994 Py_SETREF(*p_unicode, unicode_empty);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001995 return 0;
1996 }
1997
Victor Stinner488fa492011-12-12 00:01:39 +01001998 if (!unicode_modifiable(unicode)) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001999 PyObject *copy = resize_copy(unicode, length);
2000 if (copy == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002001 return -1;
Serhiy Storchaka57a01d32016-04-10 18:05:40 +03002002 Py_SETREF(*p_unicode, copy);
Benjamin Peterson29060642009-01-31 22:14:21 +00002003 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002004 }
2005
Victor Stinnerfe226c02011-10-03 03:52:20 +02002006 if (PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01002007 PyObject *new_unicode = resize_compact(unicode, length);
2008 if (new_unicode == NULL)
Victor Stinnerfe226c02011-10-03 03:52:20 +02002009 return -1;
Victor Stinnerb0a82a62011-12-12 13:08:33 +01002010 *p_unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +02002011 return 0;
Benjamin Peterson4bfce8f2011-10-03 19:35:07 -04002012 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002013 return resize_inplace(unicode, length);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002014}
2015
Alexander Belopolsky40018472011-02-26 01:02:56 +00002016int
Victor Stinnerfe226c02011-10-03 03:52:20 +02002017PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00002018{
Victor Stinnerfe226c02011-10-03 03:52:20 +02002019 PyObject *unicode;
2020 if (p_unicode == NULL) {
2021 PyErr_BadInternalCall();
2022 return -1;
2023 }
2024 unicode = *p_unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01002025 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
Victor Stinnerfe226c02011-10-03 03:52:20 +02002026 {
2027 PyErr_BadInternalCall();
2028 return -1;
2029 }
2030 return unicode_resize(p_unicode, length);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00002031}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002032
Serhiy Storchakad65c9492015-11-02 14:10:23 +02002033/* Copy an ASCII or latin1 char* string into a Python Unicode string.
Victor Stinnerc5166102012-02-22 13:55:02 +01002034
Victor Stinnerb429d3b2012-02-22 21:22:20 +01002035 WARNING: The function doesn't copy the terminating null character and
2036 doesn't check the maximum character (may write a latin1 character in an
2037 ASCII string). */
Victor Stinner184252a2012-06-16 02:57:41 +02002038static void
2039unicode_write_cstr(PyObject *unicode, Py_ssize_t index,
2040 const char *str, Py_ssize_t len)
Victor Stinnerc5166102012-02-22 13:55:02 +01002041{
2042 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
2043 void *data = PyUnicode_DATA(unicode);
Victor Stinner184252a2012-06-16 02:57:41 +02002044 const char *end = str + len;
Victor Stinnerc5166102012-02-22 13:55:02 +01002045
2046 switch (kind) {
2047 case PyUnicode_1BYTE_KIND: {
Victor Stinnerc5166102012-02-22 13:55:02 +01002048 assert(index + len <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner8c6db452012-10-06 00:40:45 +02002049#ifdef Py_DEBUG
2050 if (PyUnicode_IS_ASCII(unicode)) {
2051 Py_UCS4 maxchar = ucs1lib_find_max_char(
2052 (const Py_UCS1*)str,
2053 (const Py_UCS1*)str + len);
2054 assert(maxchar < 128);
2055 }
2056#endif
Antoine Pitrouba6bafc2012-02-22 16:41:50 +01002057 memcpy((char *) data + index, str, len);
Victor Stinner184252a2012-06-16 02:57:41 +02002058 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01002059 }
2060 case PyUnicode_2BYTE_KIND: {
2061 Py_UCS2 *start = (Py_UCS2 *)data + index;
2062 Py_UCS2 *ucs2 = start;
2063 assert(index <= PyUnicode_GET_LENGTH(unicode));
2064
Victor Stinner184252a2012-06-16 02:57:41 +02002065 for (; str < end; ++ucs2, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01002066 *ucs2 = (Py_UCS2)*str;
2067
2068 assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner184252a2012-06-16 02:57:41 +02002069 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01002070 }
2071 default: {
2072 Py_UCS4 *start = (Py_UCS4 *)data + index;
2073 Py_UCS4 *ucs4 = start;
2074 assert(kind == PyUnicode_4BYTE_KIND);
2075 assert(index <= PyUnicode_GET_LENGTH(unicode));
2076
Victor Stinner184252a2012-06-16 02:57:41 +02002077 for (; str < end; ++ucs4, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01002078 *ucs4 = (Py_UCS4)*str;
2079
2080 assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinnerc5166102012-02-22 13:55:02 +01002081 }
2082 }
2083}
2084
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002085static PyObject*
2086get_latin1_char(unsigned char ch)
2087{
Victor Stinnera464fc12011-10-02 20:39:30 +02002088 PyObject *unicode = unicode_latin1[ch];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002089 if (!unicode) {
Victor Stinnera464fc12011-10-02 20:39:30 +02002090 unicode = PyUnicode_New(1, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002091 if (!unicode)
2092 return NULL;
2093 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002094 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002095 unicode_latin1[ch] = unicode;
2096 }
2097 Py_INCREF(unicode);
Victor Stinnera464fc12011-10-02 20:39:30 +02002098 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002099}
2100
Victor Stinner985a82a2014-01-03 12:53:47 +01002101static PyObject*
2102unicode_char(Py_UCS4 ch)
2103{
2104 PyObject *unicode;
2105
2106 assert(ch <= MAX_UNICODE);
2107
Victor Stinnerf3b46b42014-01-03 13:16:00 +01002108 if (ch < 256)
2109 return get_latin1_char(ch);
2110
Victor Stinner985a82a2014-01-03 12:53:47 +01002111 unicode = PyUnicode_New(1, ch);
2112 if (unicode == NULL)
2113 return NULL;
Serhiy Storchaka2e58f1a2016-10-09 23:44:48 +03002114
2115 assert(PyUnicode_KIND(unicode) != PyUnicode_1BYTE_KIND);
2116 if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
Victor Stinner985a82a2014-01-03 12:53:47 +01002117 PyUnicode_2BYTE_DATA(unicode)[0] = (Py_UCS2)ch;
Serhiy Storchaka2e58f1a2016-10-09 23:44:48 +03002118 } else {
Victor Stinner985a82a2014-01-03 12:53:47 +01002119 assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
2120 PyUnicode_4BYTE_DATA(unicode)[0] = ch;
2121 }
2122 assert(_PyUnicode_CheckConsistency(unicode, 1));
2123 return unicode;
2124}
2125
Alexander Belopolsky40018472011-02-26 01:02:56 +00002126PyObject *
2127PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002128{
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02002129 if (u == NULL)
2130 return (PyObject*)_PyUnicode_New(size);
2131
2132 if (size < 0) {
2133 PyErr_BadInternalCall();
2134 return NULL;
2135 }
2136
2137 return PyUnicode_FromWideChar(u, size);
2138}
2139
2140PyObject *
2141PyUnicode_FromWideChar(const wchar_t *u, Py_ssize_t size)
2142{
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002143 PyObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002144 Py_UCS4 maxchar = 0;
2145 Py_ssize_t num_surrogates;
2146
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02002147 if (u == NULL && size != 0) {
2148 PyErr_BadInternalCall();
2149 return NULL;
2150 }
2151
2152 if (size == -1) {
2153 size = wcslen(u);
2154 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002155
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002156 /* If the Unicode data is known at construction time, we can apply
2157 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002158
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002159 /* Optimization for empty strings */
Serhiy Storchaka678db842013-01-26 12:16:36 +02002160 if (size == 0)
2161 _Py_RETURN_UNICODE_EMPTY();
Tim Petersced69f82003-09-16 20:30:58 +00002162
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002163 /* Single character Unicode objects in the Latin-1 range are
2164 shared when using this constructor */
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002165 if (size == 1 && (Py_UCS4)*u < 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002166 return get_latin1_char((unsigned char)*u);
2167
2168 /* If not empty and not single character, copy the Unicode data
2169 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02002170 if (find_maxchar_surrogates(u, u + size,
2171 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002172 return NULL;
2173
Victor Stinner8faf8212011-12-08 22:14:11 +01002174 unicode = PyUnicode_New(size - num_surrogates, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002175 if (!unicode)
2176 return NULL;
2177
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002178 switch (PyUnicode_KIND(unicode)) {
2179 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02002180 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002181 u, u + size, PyUnicode_1BYTE_DATA(unicode));
2182 break;
2183 case PyUnicode_2BYTE_KIND:
2184#if Py_UNICODE_SIZE == 2
Christian Heimesf051e432016-09-13 20:22:02 +02002185 memcpy(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002186#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02002187 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002188 u, u + size, PyUnicode_2BYTE_DATA(unicode));
2189#endif
2190 break;
2191 case PyUnicode_4BYTE_KIND:
2192#if SIZEOF_WCHAR_T == 2
2193 /* This is the only case which has to process surrogates, thus
2194 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02002195 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002196#else
2197 assert(num_surrogates == 0);
Christian Heimesf051e432016-09-13 20:22:02 +02002198 memcpy(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002199#endif
2200 break;
2201 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07002202 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002203 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002204
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002205 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002206}
2207
Alexander Belopolsky40018472011-02-26 01:02:56 +00002208PyObject *
2209PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00002210{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002211 if (size < 0) {
2212 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00002213 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00002214 return NULL;
2215 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002216 if (u != NULL)
2217 return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
2218 else
2219 return (PyObject *)_PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00002220}
2221
Alexander Belopolsky40018472011-02-26 01:02:56 +00002222PyObject *
2223PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00002224{
2225 size_t size = strlen(u);
2226 if (size > PY_SSIZE_T_MAX) {
2227 PyErr_SetString(PyExc_OverflowError, "input too long");
2228 return NULL;
2229 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002230 return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002231}
2232
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002233PyObject *
2234_PyUnicode_FromId(_Py_Identifier *id)
2235{
2236 if (!id->object) {
Victor Stinnerd1cd99b2012-02-07 23:05:55 +01002237 id->object = PyUnicode_DecodeUTF8Stateful(id->string,
2238 strlen(id->string),
2239 NULL, NULL);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002240 if (!id->object)
2241 return NULL;
2242 PyUnicode_InternInPlace(&id->object);
2243 assert(!id->next);
2244 id->next = static_strings;
2245 static_strings = id;
2246 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002247 return id->object;
2248}
2249
2250void
2251_PyUnicode_ClearStaticStrings()
2252{
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002253 _Py_Identifier *tmp, *s = static_strings;
2254 while (s) {
Serhiy Storchaka505ff752014-02-09 13:33:53 +02002255 Py_CLEAR(s->object);
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002256 tmp = s->next;
2257 s->next = NULL;
2258 s = tmp;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002259 }
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002260 static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002261}
2262
Benjamin Peterson0df54292012-03-26 14:50:32 -04002263/* Internal function, doesn't check maximum character */
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002264
Victor Stinnerd3f08822012-05-29 12:57:52 +02002265PyObject*
2266_PyUnicode_FromASCII(const char *buffer, Py_ssize_t size)
Victor Stinner702c7342011-10-05 13:50:52 +02002267{
Victor Stinnerd3f08822012-05-29 12:57:52 +02002268 const unsigned char *s = (const unsigned char *)buffer;
Victor Stinner785938e2011-12-11 20:09:03 +01002269 PyObject *unicode;
Victor Stinnere6b2d442011-12-11 21:54:30 +01002270 if (size == 1) {
Victor Stinner0617b6e2011-10-05 23:26:01 +02002271#ifdef Py_DEBUG
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002272 assert((unsigned char)s[0] < 128);
Victor Stinner0617b6e2011-10-05 23:26:01 +02002273#endif
Antoine Pitrou7c46da72011-10-06 22:07:51 +02002274 return get_latin1_char(s[0]);
Victor Stinnere6b2d442011-12-11 21:54:30 +01002275 }
Victor Stinner785938e2011-12-11 20:09:03 +01002276 unicode = PyUnicode_New(size, 127);
2277 if (!unicode)
Victor Stinner702c7342011-10-05 13:50:52 +02002278 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01002279 memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
2280 assert(_PyUnicode_CheckConsistency(unicode, 1));
2281 return unicode;
Victor Stinner702c7342011-10-05 13:50:52 +02002282}
2283
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002284static Py_UCS4
2285kind_maxchar_limit(unsigned int kind)
2286{
Benjamin Petersonead6b532011-12-20 17:23:42 -06002287 switch (kind) {
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002288 case PyUnicode_1BYTE_KIND:
2289 return 0x80;
2290 case PyUnicode_2BYTE_KIND:
2291 return 0x100;
2292 case PyUnicode_4BYTE_KIND:
2293 return 0x10000;
2294 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07002295 Py_UNREACHABLE();
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002296 }
2297}
2298
Victor Stinner702c7342011-10-05 13:50:52 +02002299static PyObject*
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002300_PyUnicode_FromUCS1(const Py_UCS1* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00002301{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002302 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002303 unsigned char max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002304
Serhiy Storchaka678db842013-01-26 12:16:36 +02002305 if (size == 0)
2306 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002307 assert(size > 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02002308 if (size == 1)
2309 return get_latin1_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002310
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002311 max_char = ucs1lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002312 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002313 if (!res)
2314 return NULL;
2315 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002316 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002317 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00002318}
2319
Victor Stinnere57b1c02011-09-28 22:20:48 +02002320static PyObject*
2321_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002322{
2323 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002324 Py_UCS2 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002325
Serhiy Storchaka678db842013-01-26 12:16:36 +02002326 if (size == 0)
2327 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002328 assert(size > 0);
Victor Stinner985a82a2014-01-03 12:53:47 +01002329 if (size == 1)
2330 return unicode_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002331
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002332 max_char = ucs2lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002333 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002334 if (!res)
2335 return NULL;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002336 if (max_char >= 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002337 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002338 else {
2339 _PyUnicode_CONVERT_BYTES(
2340 Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
2341 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002342 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002343 return res;
2344}
2345
Victor Stinnere57b1c02011-09-28 22:20:48 +02002346static PyObject*
2347_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002348{
2349 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002350 Py_UCS4 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002351
Serhiy Storchaka678db842013-01-26 12:16:36 +02002352 if (size == 0)
2353 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002354 assert(size > 0);
Victor Stinner985a82a2014-01-03 12:53:47 +01002355 if (size == 1)
2356 return unicode_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002357
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002358 max_char = ucs4lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002359 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002360 if (!res)
2361 return NULL;
Antoine Pitrou950468e2011-10-11 22:45:48 +02002362 if (max_char < 256)
2363 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
2364 PyUnicode_1BYTE_DATA(res));
2365 else if (max_char < 0x10000)
2366 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
2367 PyUnicode_2BYTE_DATA(res));
2368 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002369 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002370 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002371 return res;
2372}
2373
2374PyObject*
2375PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
2376{
Victor Stinnercfed46e2011-11-22 01:29:14 +01002377 if (size < 0) {
2378 PyErr_SetString(PyExc_ValueError, "size must be positive");
2379 return NULL;
2380 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002381 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002382 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002383 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002384 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002385 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002386 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002387 return _PyUnicode_FromUCS4(buffer, size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002388 default:
Victor Stinnerb9275c12011-10-05 14:01:42 +02002389 PyErr_SetString(PyExc_SystemError, "invalid kind");
2390 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002391 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002392}
2393
Victor Stinnerece58de2012-04-23 23:36:38 +02002394Py_UCS4
2395_PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end)
2396{
2397 enum PyUnicode_Kind kind;
2398 void *startptr, *endptr;
2399
2400 assert(PyUnicode_IS_READY(unicode));
2401 assert(0 <= start);
2402 assert(end <= PyUnicode_GET_LENGTH(unicode));
2403 assert(start <= end);
2404
2405 if (start == 0 && end == PyUnicode_GET_LENGTH(unicode))
2406 return PyUnicode_MAX_CHAR_VALUE(unicode);
2407
2408 if (start == end)
2409 return 127;
2410
Victor Stinner94d558b2012-04-27 22:26:58 +02002411 if (PyUnicode_IS_ASCII(unicode))
2412 return 127;
2413
Victor Stinnerece58de2012-04-23 23:36:38 +02002414 kind = PyUnicode_KIND(unicode);
Benjamin Petersonf3b7d862012-04-23 18:07:01 -04002415 startptr = PyUnicode_DATA(unicode);
Benjamin Petersonb9f4c9d2012-04-23 21:45:40 -04002416 endptr = (char *)startptr + end * kind;
2417 startptr = (char *)startptr + start * kind;
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002418 switch(kind) {
2419 case PyUnicode_1BYTE_KIND:
2420 return ucs1lib_find_max_char(startptr, endptr);
2421 case PyUnicode_2BYTE_KIND:
2422 return ucs2lib_find_max_char(startptr, endptr);
2423 case PyUnicode_4BYTE_KIND:
2424 return ucs4lib_find_max_char(startptr, endptr);
Victor Stinnerece58de2012-04-23 23:36:38 +02002425 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07002426 Py_UNREACHABLE();
Victor Stinnerece58de2012-04-23 23:36:38 +02002427 }
2428}
2429
Victor Stinner25a4b292011-10-06 12:31:55 +02002430/* Ensure that a string uses the most efficient storage, if it is not the
2431 case: create a new string with of the right kind. Write NULL into *p_unicode
2432 on error. */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02002433static void
Victor Stinner25a4b292011-10-06 12:31:55 +02002434unicode_adjust_maxchar(PyObject **p_unicode)
2435{
2436 PyObject *unicode, *copy;
2437 Py_UCS4 max_char;
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002438 Py_ssize_t len;
Victor Stinner25a4b292011-10-06 12:31:55 +02002439 unsigned int kind;
2440
2441 assert(p_unicode != NULL);
2442 unicode = *p_unicode;
2443 assert(PyUnicode_IS_READY(unicode));
2444 if (PyUnicode_IS_ASCII(unicode))
2445 return;
2446
2447 len = PyUnicode_GET_LENGTH(unicode);
2448 kind = PyUnicode_KIND(unicode);
2449 if (kind == PyUnicode_1BYTE_KIND) {
2450 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002451 max_char = ucs1lib_find_max_char(u, u + len);
2452 if (max_char >= 128)
2453 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002454 }
2455 else if (kind == PyUnicode_2BYTE_KIND) {
2456 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002457 max_char = ucs2lib_find_max_char(u, u + len);
2458 if (max_char >= 256)
2459 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002460 }
2461 else {
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002462 const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
Victor Stinner25a4b292011-10-06 12:31:55 +02002463 assert(kind == PyUnicode_4BYTE_KIND);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002464 max_char = ucs4lib_find_max_char(u, u + len);
2465 if (max_char >= 0x10000)
2466 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002467 }
Victor Stinner25a4b292011-10-06 12:31:55 +02002468 copy = PyUnicode_New(len, max_char);
Victor Stinnerca439ee2012-06-16 03:17:34 +02002469 if (copy != NULL)
2470 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, len);
Victor Stinner25a4b292011-10-06 12:31:55 +02002471 Py_DECREF(unicode);
2472 *p_unicode = copy;
2473}
2474
Victor Stinner034f6cf2011-09-30 02:26:44 +02002475PyObject*
Victor Stinnerbf6e5602011-12-12 01:53:47 +01002476_PyUnicode_Copy(PyObject *unicode)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002477{
Victor Stinner87af4f22011-11-21 23:03:47 +01002478 Py_ssize_t length;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002479 PyObject *copy;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002480
Victor Stinner034f6cf2011-09-30 02:26:44 +02002481 if (!PyUnicode_Check(unicode)) {
2482 PyErr_BadInternalCall();
2483 return NULL;
2484 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05002485 if (PyUnicode_READY(unicode) == -1)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002486 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002487
Victor Stinner87af4f22011-11-21 23:03:47 +01002488 length = PyUnicode_GET_LENGTH(unicode);
2489 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002490 if (!copy)
2491 return NULL;
2492 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
2493
Christian Heimesf051e432016-09-13 20:22:02 +02002494 memcpy(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
Victor Stinner87af4f22011-11-21 23:03:47 +01002495 length * PyUnicode_KIND(unicode));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002496 assert(_PyUnicode_CheckConsistency(copy, 1));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002497 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02002498}
2499
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002500
Victor Stinnerbc603d12011-10-02 01:00:40 +02002501/* Widen Unicode objects to larger buffers. Don't write terminating null
2502 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002503
2504void*
2505_PyUnicode_AsKind(PyObject *s, unsigned int kind)
2506{
Victor Stinnerbc603d12011-10-02 01:00:40 +02002507 Py_ssize_t len;
2508 void *result;
2509 unsigned int skind;
2510
Benjamin Petersonbac79492012-01-14 13:34:47 -05002511 if (PyUnicode_READY(s) == -1)
Victor Stinnerbc603d12011-10-02 01:00:40 +02002512 return NULL;
2513
2514 len = PyUnicode_GET_LENGTH(s);
2515 skind = PyUnicode_KIND(s);
2516 if (skind >= kind) {
Victor Stinner01698042011-10-04 00:04:26 +02002517 PyErr_SetString(PyExc_SystemError, "invalid widening attempt");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002518 return NULL;
2519 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002520 switch (kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02002521 case PyUnicode_2BYTE_KIND:
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002522 result = PyMem_New(Py_UCS2, len);
Victor Stinnerbc603d12011-10-02 01:00:40 +02002523 if (!result)
2524 return PyErr_NoMemory();
2525 assert(skind == PyUnicode_1BYTE_KIND);
2526 _PyUnicode_CONVERT_BYTES(
2527 Py_UCS1, Py_UCS2,
2528 PyUnicode_1BYTE_DATA(s),
2529 PyUnicode_1BYTE_DATA(s) + len,
2530 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002531 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002532 case PyUnicode_4BYTE_KIND:
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002533 result = PyMem_New(Py_UCS4, len);
Victor Stinnerbc603d12011-10-02 01:00:40 +02002534 if (!result)
2535 return PyErr_NoMemory();
2536 if (skind == PyUnicode_2BYTE_KIND) {
2537 _PyUnicode_CONVERT_BYTES(
2538 Py_UCS2, Py_UCS4,
2539 PyUnicode_2BYTE_DATA(s),
2540 PyUnicode_2BYTE_DATA(s) + len,
2541 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002542 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02002543 else {
2544 assert(skind == PyUnicode_1BYTE_KIND);
2545 _PyUnicode_CONVERT_BYTES(
2546 Py_UCS1, Py_UCS4,
2547 PyUnicode_1BYTE_DATA(s),
2548 PyUnicode_1BYTE_DATA(s) + len,
2549 result);
2550 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002551 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002552 default:
2553 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002554 }
Victor Stinner01698042011-10-04 00:04:26 +02002555 PyErr_SetString(PyExc_SystemError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002556 return NULL;
2557}
2558
2559static Py_UCS4*
2560as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2561 int copy_null)
2562{
2563 int kind;
2564 void *data;
2565 Py_ssize_t len, targetlen;
2566 if (PyUnicode_READY(string) == -1)
2567 return NULL;
2568 kind = PyUnicode_KIND(string);
2569 data = PyUnicode_DATA(string);
2570 len = PyUnicode_GET_LENGTH(string);
2571 targetlen = len;
2572 if (copy_null)
2573 targetlen++;
2574 if (!target) {
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002575 target = PyMem_New(Py_UCS4, targetlen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002576 if (!target) {
2577 PyErr_NoMemory();
2578 return NULL;
2579 }
2580 }
2581 else {
2582 if (targetsize < targetlen) {
2583 PyErr_Format(PyExc_SystemError,
2584 "string is longer than the buffer");
2585 if (copy_null && 0 < targetsize)
2586 target[0] = 0;
2587 return NULL;
2588 }
2589 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002590 if (kind == PyUnicode_1BYTE_KIND) {
2591 Py_UCS1 *start = (Py_UCS1 *) data;
2592 _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002593 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002594 else if (kind == PyUnicode_2BYTE_KIND) {
2595 Py_UCS2 *start = (Py_UCS2 *) data;
2596 _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2597 }
2598 else {
2599 assert(kind == PyUnicode_4BYTE_KIND);
Christian Heimesf051e432016-09-13 20:22:02 +02002600 memcpy(target, data, len * sizeof(Py_UCS4));
Antoine Pitrou950468e2011-10-11 22:45:48 +02002601 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002602 if (copy_null)
2603 target[len] = 0;
2604 return target;
2605}
2606
2607Py_UCS4*
2608PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2609 int copy_null)
2610{
Antoine Pitroude20b0b2011-11-10 21:47:38 +01002611 if (target == NULL || targetsize < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002612 PyErr_BadInternalCall();
2613 return NULL;
2614 }
2615 return as_ucs4(string, target, targetsize, copy_null);
2616}
2617
2618Py_UCS4*
2619PyUnicode_AsUCS4Copy(PyObject *string)
2620{
2621 return as_ucs4(string, NULL, 0, 1);
2622}
2623
/* Upper bound on the number of characters produced when formatting with
   %lld or %p.
   A long long needs at most ceil(log10(256) * SIZEOF_LONG_LONG) digits,
   plus 1 for the sign.  53/22 is used as an upper bound for log10(256). */
#define MAX_LONG_LONG_CHARS (((SIZEOF_LONG_LONG * 53 - 1) / 22) + 2)
Victor Stinner96865452011-03-01 23:44:09 +00002628
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002629static int
2630unicode_fromformat_write_str(_PyUnicodeWriter *writer, PyObject *str,
2631 Py_ssize_t width, Py_ssize_t precision)
2632{
2633 Py_ssize_t length, fill, arglen;
2634 Py_UCS4 maxchar;
2635
2636 if (PyUnicode_READY(str) == -1)
2637 return -1;
2638
2639 length = PyUnicode_GET_LENGTH(str);
2640 if ((precision == -1 || precision >= length)
2641 && width <= length)
2642 return _PyUnicodeWriter_WriteStr(writer, str);
2643
2644 if (precision != -1)
2645 length = Py_MIN(precision, length);
2646
2647 arglen = Py_MAX(length, width);
2648 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
2649 maxchar = _PyUnicode_FindMaxChar(str, 0, length);
2650 else
2651 maxchar = writer->maxchar;
2652
2653 if (_PyUnicodeWriter_Prepare(writer, arglen, maxchar) == -1)
2654 return -1;
2655
2656 if (width > length) {
2657 fill = width - length;
2658 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, ' ') == -1)
2659 return -1;
2660 writer->pos += fill;
2661 }
2662
2663 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
2664 str, 0, length);
2665 writer->pos += length;
2666 return 0;
2667}
2668
2669static int
Victor Stinner998b8062018-09-12 00:23:25 +02002670unicode_fromformat_write_cstr(_PyUnicodeWriter *writer, const char *str,
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002671 Py_ssize_t width, Py_ssize_t precision)
2672{
2673 /* UTF-8 */
2674 Py_ssize_t length;
2675 PyObject *unicode;
2676 int res;
2677
Serhiy Storchakad586ccb2019-01-12 10:30:35 +02002678 if (precision == -1) {
2679 length = strlen(str);
2680 }
2681 else {
2682 length = 0;
2683 while (length < precision && str[length]) {
2684 length++;
2685 }
2686 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002687 unicode = PyUnicode_DecodeUTF8Stateful(str, length, "replace", NULL);
2688 if (unicode == NULL)
2689 return -1;
2690
2691 res = unicode_fromformat_write_str(writer, unicode, width, -1);
2692 Py_DECREF(unicode);
2693 return res;
2694}
2695
Victor Stinner96865452011-03-01 23:44:09 +00002696static const char*
Victor Stinnere215d962012-10-06 23:03:36 +02002697unicode_fromformat_arg(_PyUnicodeWriter *writer,
2698 const char *f, va_list *vargs)
Victor Stinner96865452011-03-01 23:44:09 +00002699{
Victor Stinnere215d962012-10-06 23:03:36 +02002700 const char *p;
2701 Py_ssize_t len;
2702 int zeropad;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002703 Py_ssize_t width;
2704 Py_ssize_t precision;
Victor Stinnere215d962012-10-06 23:03:36 +02002705 int longflag;
2706 int longlongflag;
2707 int size_tflag;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002708 Py_ssize_t fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002709
2710 p = f;
2711 f++;
Victor Stinner4c63a972012-10-06 23:55:33 +02002712 zeropad = 0;
2713 if (*f == '0') {
2714 zeropad = 1;
2715 f++;
2716 }
Victor Stinner96865452011-03-01 23:44:09 +00002717
2718 /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002719 width = -1;
2720 if (Py_ISDIGIT((unsigned)*f)) {
2721 width = *f - '0';
Victor Stinner96865452011-03-01 23:44:09 +00002722 f++;
Victor Stinnere215d962012-10-06 23:03:36 +02002723 while (Py_ISDIGIT((unsigned)*f)) {
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002724 if (width > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
Victor Stinner3921e902012-10-06 23:05:00 +02002725 PyErr_SetString(PyExc_ValueError,
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002726 "width too big");
Victor Stinner3921e902012-10-06 23:05:00 +02002727 return NULL;
2728 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002729 width = (width * 10) + (*f - '0');
Victor Stinnere215d962012-10-06 23:03:36 +02002730 f++;
2731 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002732 }
2733 precision = -1;
2734 if (*f == '.') {
2735 f++;
2736 if (Py_ISDIGIT((unsigned)*f)) {
2737 precision = (*f - '0');
2738 f++;
2739 while (Py_ISDIGIT((unsigned)*f)) {
2740 if (precision > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
2741 PyErr_SetString(PyExc_ValueError,
2742 "precision too big");
2743 return NULL;
2744 }
2745 precision = (precision * 10) + (*f - '0');
2746 f++;
2747 }
2748 }
Victor Stinner96865452011-03-01 23:44:09 +00002749 if (*f == '%') {
2750 /* "%.3%s" => f points to "3" */
2751 f--;
2752 }
2753 }
2754 if (*f == '\0') {
Victor Stinnere215d962012-10-06 23:03:36 +02002755 /* bogus format "%.123" => go backward, f points to "3" */
Victor Stinner96865452011-03-01 23:44:09 +00002756 f--;
2757 }
Victor Stinner96865452011-03-01 23:44:09 +00002758
2759 /* Handle %ld, %lu, %lld and %llu. */
2760 longflag = 0;
2761 longlongflag = 0;
Victor Stinnere7faec12011-03-02 00:01:53 +00002762 size_tflag = 0;
Victor Stinner96865452011-03-01 23:44:09 +00002763 if (*f == 'l') {
Victor Stinner6d970f42011-03-02 00:04:25 +00002764 if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
Victor Stinner96865452011-03-01 23:44:09 +00002765 longflag = 1;
2766 ++f;
2767 }
Victor Stinner96865452011-03-01 23:44:09 +00002768 else if (f[1] == 'l' &&
Victor Stinner6d970f42011-03-02 00:04:25 +00002769 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002770 longlongflag = 1;
2771 f += 2;
2772 }
Victor Stinner96865452011-03-01 23:44:09 +00002773 }
2774 /* handle the size_t flag. */
Victor Stinner6d970f42011-03-02 00:04:25 +00002775 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002776 size_tflag = 1;
2777 ++f;
2778 }
Victor Stinnere215d962012-10-06 23:03:36 +02002779
2780 if (f[1] == '\0')
2781 writer->overallocate = 0;
2782
2783 switch (*f) {
2784 case 'c':
2785 {
2786 int ordinal = va_arg(*vargs, int);
Victor Stinnerff5a8482012-10-06 23:05:45 +02002787 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Serhiy Storchakac89533f2013-06-23 20:21:16 +03002788 PyErr_SetString(PyExc_OverflowError,
Victor Stinnerff5a8482012-10-06 23:05:45 +02002789 "character argument not in range(0x110000)");
2790 return NULL;
2791 }
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02002792 if (_PyUnicodeWriter_WriteCharInline(writer, ordinal) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002793 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002794 break;
2795 }
2796
2797 case 'i':
2798 case 'd':
2799 case 'u':
2800 case 'x':
2801 {
2802 /* used by sprintf */
Victor Stinner15a11362012-10-06 23:48:20 +02002803 char buffer[MAX_LONG_LONG_CHARS];
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002804 Py_ssize_t arglen;
Victor Stinnere215d962012-10-06 23:03:36 +02002805
2806 if (*f == 'u') {
Victor Stinnere215d962012-10-06 23:03:36 +02002807 if (longflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002808 len = sprintf(buffer, "%lu",
Victor Stinnere215d962012-10-06 23:03:36 +02002809 va_arg(*vargs, unsigned long));
Victor Stinnere215d962012-10-06 23:03:36 +02002810 else if (longlongflag)
Benjamin Peterson47ff0732016-09-08 09:15:54 -07002811 len = sprintf(buffer, "%llu",
Benjamin Petersonaf580df2016-09-06 10:46:49 -07002812 va_arg(*vargs, unsigned long long));
Victor Stinnere215d962012-10-06 23:03:36 +02002813 else if (size_tflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002814 len = sprintf(buffer, "%" PY_FORMAT_SIZE_T "u",
Victor Stinnere215d962012-10-06 23:03:36 +02002815 va_arg(*vargs, size_t));
2816 else
Victor Stinner3aa979e2014-11-18 21:40:51 +01002817 len = sprintf(buffer, "%u",
Victor Stinnere215d962012-10-06 23:03:36 +02002818 va_arg(*vargs, unsigned int));
2819 }
2820 else if (*f == 'x') {
Victor Stinner3aa979e2014-11-18 21:40:51 +01002821 len = sprintf(buffer, "%x", va_arg(*vargs, int));
Victor Stinnere215d962012-10-06 23:03:36 +02002822 }
2823 else {
Victor Stinnere215d962012-10-06 23:03:36 +02002824 if (longflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002825 len = sprintf(buffer, "%li",
Victor Stinnere215d962012-10-06 23:03:36 +02002826 va_arg(*vargs, long));
Victor Stinnere215d962012-10-06 23:03:36 +02002827 else if (longlongflag)
Benjamin Peterson47ff0732016-09-08 09:15:54 -07002828 len = sprintf(buffer, "%lli",
Benjamin Petersonaf580df2016-09-06 10:46:49 -07002829 va_arg(*vargs, long long));
Victor Stinnere215d962012-10-06 23:03:36 +02002830 else if (size_tflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002831 len = sprintf(buffer, "%" PY_FORMAT_SIZE_T "i",
Victor Stinnere215d962012-10-06 23:03:36 +02002832 va_arg(*vargs, Py_ssize_t));
2833 else
Victor Stinner3aa979e2014-11-18 21:40:51 +01002834 len = sprintf(buffer, "%i",
Victor Stinnere215d962012-10-06 23:03:36 +02002835 va_arg(*vargs, int));
2836 }
2837 assert(len >= 0);
2838
Victor Stinnere215d962012-10-06 23:03:36 +02002839 if (precision < len)
2840 precision = len;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002841
2842 arglen = Py_MAX(precision, width);
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002843 if (_PyUnicodeWriter_Prepare(writer, arglen, 127) == -1)
2844 return NULL;
2845
Victor Stinnere215d962012-10-06 23:03:36 +02002846 if (width > precision) {
2847 Py_UCS4 fillchar;
2848 fill = width - precision;
2849 fillchar = zeropad?'0':' ';
Victor Stinner15a11362012-10-06 23:48:20 +02002850 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, fillchar) == -1)
2851 return NULL;
2852 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002853 }
Victor Stinner15a11362012-10-06 23:48:20 +02002854 if (precision > len) {
Victor Stinnere215d962012-10-06 23:03:36 +02002855 fill = precision - len;
Victor Stinner15a11362012-10-06 23:48:20 +02002856 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, '0') == -1)
2857 return NULL;
2858 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002859 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002860
Victor Stinner4a587072013-11-19 12:54:53 +01002861 if (_PyUnicodeWriter_WriteASCIIString(writer, buffer, len) < 0)
2862 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002863 break;
2864 }
2865
2866 case 'p':
2867 {
2868 char number[MAX_LONG_LONG_CHARS];
2869
2870 len = sprintf(number, "%p", va_arg(*vargs, void*));
2871 assert(len >= 0);
2872
2873 /* %p is ill-defined: ensure leading 0x. */
2874 if (number[1] == 'X')
2875 number[1] = 'x';
2876 else if (number[1] != 'x') {
2877 memmove(number + 2, number,
2878 strlen(number) + 1);
2879 number[0] = '0';
2880 number[1] = 'x';
2881 len += 2;
2882 }
2883
Victor Stinner4a587072013-11-19 12:54:53 +01002884 if (_PyUnicodeWriter_WriteASCIIString(writer, number, len) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002885 return NULL;
2886 break;
2887 }
2888
2889 case 's':
2890 {
2891 /* UTF-8 */
2892 const char *s = va_arg(*vargs, const char*);
Victor Stinner998b8062018-09-12 00:23:25 +02002893 if (unicode_fromformat_write_cstr(writer, s, width, precision) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002894 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002895 break;
2896 }
2897
2898 case 'U':
2899 {
2900 PyObject *obj = va_arg(*vargs, PyObject *);
2901 assert(obj && _PyUnicode_CHECK(obj));
2902
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002903 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002904 return NULL;
2905 break;
2906 }
2907
2908 case 'V':
2909 {
2910 PyObject *obj = va_arg(*vargs, PyObject *);
2911 const char *str = va_arg(*vargs, const char *);
Victor Stinnere215d962012-10-06 23:03:36 +02002912 if (obj) {
2913 assert(_PyUnicode_CHECK(obj));
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002914 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002915 return NULL;
2916 }
2917 else {
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002918 assert(str != NULL);
Victor Stinner998b8062018-09-12 00:23:25 +02002919 if (unicode_fromformat_write_cstr(writer, str, width, precision) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002920 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002921 }
2922 break;
2923 }
2924
2925 case 'S':
2926 {
2927 PyObject *obj = va_arg(*vargs, PyObject *);
2928 PyObject *str;
2929 assert(obj);
2930 str = PyObject_Str(obj);
2931 if (!str)
2932 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002933 if (unicode_fromformat_write_str(writer, str, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002934 Py_DECREF(str);
2935 return NULL;
2936 }
2937 Py_DECREF(str);
2938 break;
2939 }
2940
2941 case 'R':
2942 {
2943 PyObject *obj = va_arg(*vargs, PyObject *);
2944 PyObject *repr;
2945 assert(obj);
2946 repr = PyObject_Repr(obj);
2947 if (!repr)
2948 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002949 if (unicode_fromformat_write_str(writer, repr, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002950 Py_DECREF(repr);
2951 return NULL;
2952 }
2953 Py_DECREF(repr);
2954 break;
2955 }
2956
2957 case 'A':
2958 {
2959 PyObject *obj = va_arg(*vargs, PyObject *);
2960 PyObject *ascii;
2961 assert(obj);
2962 ascii = PyObject_ASCII(obj);
2963 if (!ascii)
2964 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002965 if (unicode_fromformat_write_str(writer, ascii, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002966 Py_DECREF(ascii);
2967 return NULL;
2968 }
2969 Py_DECREF(ascii);
2970 break;
2971 }
2972
2973 case '%':
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02002974 if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002975 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002976 break;
2977
2978 default:
2979 /* if we stumble upon an unknown formatting code, copy the rest
2980 of the format string to the output string. (we cannot just
2981 skip the code, since there's no way to know what's in the
2982 argument list) */
2983 len = strlen(p);
Victor Stinner4a587072013-11-19 12:54:53 +01002984 if (_PyUnicodeWriter_WriteLatin1String(writer, p, len) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002985 return NULL;
2986 f = p+len;
2987 return f;
2988 }
2989
2990 f++;
Victor Stinner96865452011-03-01 23:44:09 +00002991 return f;
2992}
2993
Walter Dörwaldd2034312007-05-18 16:29:38 +00002994PyObject *
2995PyUnicode_FromFormatV(const char *format, va_list vargs)
2996{
Victor Stinnere215d962012-10-06 23:03:36 +02002997 va_list vargs2;
2998 const char *f;
2999 _PyUnicodeWriter writer;
Walter Dörwaldd2034312007-05-18 16:29:38 +00003000
Victor Stinner8f674cc2013-04-17 23:02:17 +02003001 _PyUnicodeWriter_Init(&writer);
3002 writer.min_length = strlen(format) + 100;
3003 writer.overallocate = 1;
Victor Stinnere215d962012-10-06 23:03:36 +02003004
Benjamin Peterson0c212142016-09-20 20:39:33 -07003005 // Copy varags to be able to pass a reference to a subfunction.
3006 va_copy(vargs2, vargs);
Victor Stinnere215d962012-10-06 23:03:36 +02003007
3008 for (f = format; *f; ) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00003009 if (*f == '%') {
Victor Stinnere215d962012-10-06 23:03:36 +02003010 f = unicode_fromformat_arg(&writer, f, &vargs2);
3011 if (f == NULL)
3012 goto fail;
Victor Stinner1205f272010-09-11 00:54:47 +00003013 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003014 else {
Victor Stinnere215d962012-10-06 23:03:36 +02003015 const char *p;
3016 Py_ssize_t len;
Walter Dörwaldd2034312007-05-18 16:29:38 +00003017
Victor Stinnere215d962012-10-06 23:03:36 +02003018 p = f;
3019 do
3020 {
3021 if ((unsigned char)*p > 127) {
3022 PyErr_Format(PyExc_ValueError,
3023 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
3024 "string, got a non-ASCII byte: 0x%02x",
3025 (unsigned char)*p);
Victor Stinner1ddf53d2016-09-21 14:13:14 +02003026 goto fail;
Victor Stinnere215d962012-10-06 23:03:36 +02003027 }
3028 p++;
3029 }
3030 while (*p != '\0' && *p != '%');
3031 len = p - f;
3032
3033 if (*p == '\0')
3034 writer.overallocate = 0;
Victor Stinner4a587072013-11-19 12:54:53 +01003035
3036 if (_PyUnicodeWriter_WriteASCIIString(&writer, f, len) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02003037 goto fail;
Victor Stinnere215d962012-10-06 23:03:36 +02003038
3039 f = p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00003040 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00003041 }
Christian Heimes2f2fee12016-09-21 11:37:27 +02003042 va_end(vargs2);
Victor Stinnere215d962012-10-06 23:03:36 +02003043 return _PyUnicodeWriter_Finish(&writer);
3044
3045 fail:
Christian Heimes2f2fee12016-09-21 11:37:27 +02003046 va_end(vargs2);
Victor Stinnere215d962012-10-06 23:03:36 +02003047 _PyUnicodeWriter_Dealloc(&writer);
Benjamin Peterson14339b62009-01-31 16:36:08 +00003048 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00003049}
3050
Walter Dörwaldd2034312007-05-18 16:29:38 +00003051PyObject *
3052PyUnicode_FromFormat(const char *format, ...)
3053{
Benjamin Peterson14339b62009-01-31 16:36:08 +00003054 PyObject* ret;
3055 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00003056
3057#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00003058 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00003059#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00003060 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00003061#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00003062 ret = PyUnicode_FromFormatV(format, vargs);
3063 va_end(vargs);
3064 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00003065}
3066
Serhiy Storchakac46db922018-10-23 22:58:24 +03003067static Py_ssize_t
3068unicode_get_widechar_size(PyObject *unicode)
3069{
3070 Py_ssize_t res;
3071
3072 assert(unicode != NULL);
3073 assert(_PyUnicode_CHECK(unicode));
3074
3075 if (_PyUnicode_WSTR(unicode) != NULL) {
3076 return PyUnicode_WSTR_LENGTH(unicode);
3077 }
3078 assert(PyUnicode_IS_READY(unicode));
3079
3080 res = _PyUnicode_LENGTH(unicode);
3081#if SIZEOF_WCHAR_T == 2
3082 if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
3083 const Py_UCS4 *s = PyUnicode_4BYTE_DATA(unicode);
3084 const Py_UCS4 *end = s + res;
3085 for (; s < end; ++s) {
3086 if (*s > 0xFFFF) {
3087 ++res;
3088 }
3089 }
3090 }
3091#endif
3092 return res;
3093}
3094
3095static void
3096unicode_copy_as_widechar(PyObject *unicode, wchar_t *w, Py_ssize_t size)
3097{
3098 const wchar_t *wstr;
3099
3100 assert(unicode != NULL);
3101 assert(_PyUnicode_CHECK(unicode));
3102
3103 wstr = _PyUnicode_WSTR(unicode);
3104 if (wstr != NULL) {
3105 memcpy(w, wstr, size * sizeof(wchar_t));
3106 return;
3107 }
3108 assert(PyUnicode_IS_READY(unicode));
3109
3110 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
3111 const Py_UCS1 *s = PyUnicode_1BYTE_DATA(unicode);
3112 for (; size--; ++s, ++w) {
3113 *w = *s;
3114 }
3115 }
3116 else {
3117#if SIZEOF_WCHAR_T == 4
3118 assert(PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND);
3119 const Py_UCS2 *s = PyUnicode_2BYTE_DATA(unicode);
3120 for (; size--; ++s, ++w) {
3121 *w = *s;
3122 }
3123#else
3124 assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
3125 const Py_UCS4 *s = PyUnicode_4BYTE_DATA(unicode);
3126 for (; size--; ++s, ++w) {
3127 Py_UCS4 ch = *s;
3128 if (ch > 0xFFFF) {
3129 assert(ch <= MAX_UNICODE);
3130 /* encode surrogate pair in this case */
3131 *w++ = Py_UNICODE_HIGH_SURROGATE(ch);
3132 if (!size--)
3133 break;
3134 *w = Py_UNICODE_LOW_SURROGATE(ch);
3135 }
3136 else {
3137 *w = ch;
3138 }
3139 }
3140#endif
3141 }
3142}
3143
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003144#ifdef HAVE_WCHAR_H
3145
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003146/* Convert a Unicode object to a wide character string.
Victor Stinner5593d8a2010-10-02 11:11:27 +00003147
Victor Stinnerd88d9832011-09-06 02:00:05 +02003148 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00003149 character) required to convert the unicode object. Ignore size argument.
3150
Victor Stinnerd88d9832011-09-06 02:00:05 +02003151 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00003152 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02003153 the null character). */
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003154Py_ssize_t
3155PyUnicode_AsWideChar(PyObject *unicode,
3156 wchar_t *w,
3157 Py_ssize_t size)
Victor Stinner137c34c2010-09-29 10:25:54 +00003158{
Victor Stinner5593d8a2010-10-02 11:11:27 +00003159 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003160
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003161 if (unicode == NULL) {
3162 PyErr_BadInternalCall();
3163 return -1;
3164 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003165 if (!PyUnicode_Check(unicode)) {
3166 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003167 return -1;
Victor Stinner5593d8a2010-10-02 11:11:27 +00003168 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003169
3170 res = unicode_get_widechar_size(unicode);
3171 if (w == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003172 return res + 1;
Serhiy Storchakac46db922018-10-23 22:58:24 +03003173 }
3174
3175 if (size > res) {
3176 size = res + 1;
3177 }
3178 else {
3179 res = size;
3180 }
3181 unicode_copy_as_widechar(unicode, w, size);
3182 return res;
Victor Stinner137c34c2010-09-29 10:25:54 +00003183}
3184
Victor Stinner137c34c2010-09-29 10:25:54 +00003185wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00003186PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00003187 Py_ssize_t *size)
3188{
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003189 wchar_t *buffer;
Victor Stinner137c34c2010-09-29 10:25:54 +00003190 Py_ssize_t buflen;
3191
3192 if (unicode == NULL) {
3193 PyErr_BadInternalCall();
3194 return NULL;
3195 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003196 if (!PyUnicode_Check(unicode)) {
3197 PyErr_BadArgument();
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003198 return NULL;
3199 }
3200
Serhiy Storchakac46db922018-10-23 22:58:24 +03003201 buflen = unicode_get_widechar_size(unicode);
3202 buffer = (wchar_t *) PyMem_NEW(wchar_t, (buflen + 1));
Victor Stinner137c34c2010-09-29 10:25:54 +00003203 if (buffer == NULL) {
3204 PyErr_NoMemory();
3205 return NULL;
3206 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003207 unicode_copy_as_widechar(unicode, buffer, buflen + 1);
3208 if (size != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00003209 *size = buflen;
Serhiy Storchakac46db922018-10-23 22:58:24 +03003210 }
3211 else if (wcslen(buffer) != (size_t)buflen) {
3212 PyMem_FREE(buffer);
3213 PyErr_SetString(PyExc_ValueError,
3214 "embedded null character");
3215 return NULL;
3216 }
Victor Stinner137c34c2010-09-29 10:25:54 +00003217 return buffer;
3218}
3219
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003220#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003221
Alexander Belopolsky40018472011-02-26 01:02:56 +00003222PyObject *
3223PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003224{
Victor Stinner8faf8212011-12-08 22:14:11 +01003225 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003226 PyErr_SetString(PyExc_ValueError,
3227 "chr() arg not in range(0x110000)");
3228 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003229 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00003230
Victor Stinner985a82a2014-01-03 12:53:47 +01003231 return unicode_char((Py_UCS4)ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003232}
3233
Alexander Belopolsky40018472011-02-26 01:02:56 +00003234PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02003235PyUnicode_FromObject(PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003236{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003237 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00003238 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003239 if (PyUnicode_CheckExact(obj)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05003240 if (PyUnicode_READY(obj) == -1)
Victor Stinnerd3a83d52011-10-01 03:09:33 +02003241 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00003242 Py_INCREF(obj);
3243 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003244 }
3245 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003246 /* For a Unicode subtype that's not a Unicode object,
3247 return a true Unicode object with the same data. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +01003248 return _PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003249 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003250 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003251 "Can't convert '%.100s' object to str implicitly",
3252 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00003253 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003254}
3255
Alexander Belopolsky40018472011-02-26 01:02:56 +00003256PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02003257PyUnicode_FromEncodedObject(PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003258 const char *encoding,
3259 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003260{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003261 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003262 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00003263
Guido van Rossumd57fd912000-03-10 22:53:23 +00003264 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003265 PyErr_BadInternalCall();
3266 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003267 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003268
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003269 /* Decoding bytes objects is the most common case and should be fast */
3270 if (PyBytes_Check(obj)) {
Victor Stinner22eb6892019-06-26 00:51:05 +02003271 if (PyBytes_GET_SIZE(obj) == 0) {
3272 if (unicode_check_encoding_errors(encoding, errors) < 0) {
3273 return NULL;
3274 }
Serhiy Storchaka05997252013-01-26 12:14:02 +02003275 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner22eb6892019-06-26 00:51:05 +02003276 }
3277 return PyUnicode_Decode(
Serhiy Storchaka05997252013-01-26 12:14:02 +02003278 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
3279 encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003280 }
3281
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003282 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003283 PyErr_SetString(PyExc_TypeError,
3284 "decoding str is not supported");
3285 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00003286 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003287
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003288 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
3289 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
3290 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003291 "decoding to str: need a bytes-like object, %.80s found",
3292 Py_TYPE(obj)->tp_name);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003293 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00003294 }
Tim Petersced69f82003-09-16 20:30:58 +00003295
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003296 if (buffer.len == 0) {
Serhiy Storchaka05997252013-01-26 12:14:02 +02003297 PyBuffer_Release(&buffer);
Victor Stinner22eb6892019-06-26 00:51:05 +02003298 if (unicode_check_encoding_errors(encoding, errors) < 0) {
3299 return NULL;
3300 }
Serhiy Storchaka05997252013-01-26 12:14:02 +02003301 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +00003302 }
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00003303
Serhiy Storchaka05997252013-01-26 12:14:02 +02003304 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003305 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003306 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003307}
3308
/* Normalize an encoding name into the buffer lower (lower_len bytes).

   Works like encodings.normalize_encoding() and additionally lowercases the
   result: alphanumeric characters and '.' are kept (lowercased), and every
   run of other characters between two kept characters collapses into a
   single '_'.  Leading and trailing runs are dropped.

   Return 1 on success, with lower null-terminated.  Return 0 if the
   normalized name does not fit in lower_len-1 characters. */
int
_Py_normalize_encoding(const char *encoding,
                       char *lower,
                       size_t lower_len)
{
    const char *src;
    char *dst = lower;
    /* Last usable slot: it is reserved for the terminating null byte. */
    char *const dst_end = &lower[lower_len - 1];
    int pending_sep = 0;

    assert(encoding != NULL);

    for (src = encoding; *src != 0; src++) {
        char c = *src;

        if (!(Py_ISALNUM(c) || c == '.')) {
            /* Remember the separator; it is only emitted if another kept
               character follows. */
            pending_sep = 1;
            continue;
        }

        if (pending_sep && dst != lower) {
            if (dst == dst_end) {
                return 0;
            }
            *dst++ = '_';
        }
        pending_sep = 0;

        if (dst == dst_end) {
            return 0;
        }
        *dst++ = Py_TOLOWER(c);
    }

    *dst = '\0';
    return 1;
}
3357
Alexander Belopolsky40018472011-02-26 01:02:56 +00003358PyObject *
3359PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003360 Py_ssize_t size,
3361 const char *encoding,
3362 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00003363{
3364 PyObject *buffer = NULL, *unicode;
3365 Py_buffer info;
Victor Stinner942889a2016-09-05 15:40:10 -07003366 char buflower[11]; /* strlen("iso-8859-1\0") == 11, longest shortcut */
3367
Victor Stinner22eb6892019-06-26 00:51:05 +02003368 if (unicode_check_encoding_errors(encoding, errors) < 0) {
3369 return NULL;
3370 }
3371
Victor Stinnered076ed2019-06-26 01:49:32 +02003372 if (size == 0) {
3373 _Py_RETURN_UNICODE_EMPTY();
3374 }
3375
Victor Stinner942889a2016-09-05 15:40:10 -07003376 if (encoding == NULL) {
3377 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3378 }
Victor Stinner600d3be2010-06-10 12:00:55 +00003379
Fred Drakee4315f52000-05-09 19:53:39 +00003380 /* Shortcuts for common default encodings */
Victor Stinner942889a2016-09-05 15:40:10 -07003381 if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3382 char *lower = buflower;
3383
3384 /* Fast paths */
3385 if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3386 lower += 3;
3387 if (*lower == '_') {
3388 /* Match "utf8" and "utf_8" */
3389 lower++;
3390 }
3391
3392 if (lower[0] == '8' && lower[1] == 0) {
3393 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3394 }
3395 else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3396 return PyUnicode_DecodeUTF16(s, size, errors, 0);
3397 }
3398 else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3399 return PyUnicode_DecodeUTF32(s, size, errors, 0);
3400 }
3401 }
3402 else {
3403 if (strcmp(lower, "ascii") == 0
3404 || strcmp(lower, "us_ascii") == 0) {
3405 return PyUnicode_DecodeASCII(s, size, errors);
3406 }
Steve Dowercc16be82016-09-08 10:35:16 -07003407 #ifdef MS_WINDOWS
Victor Stinner942889a2016-09-05 15:40:10 -07003408 else if (strcmp(lower, "mbcs") == 0) {
3409 return PyUnicode_DecodeMBCS(s, size, errors);
3410 }
3411 #endif
3412 else if (strcmp(lower, "latin1") == 0
3413 || strcmp(lower, "latin_1") == 0
3414 || strcmp(lower, "iso_8859_1") == 0
3415 || strcmp(lower, "iso8859_1") == 0) {
3416 return PyUnicode_DecodeLatin1(s, size, errors);
3417 }
3418 }
Victor Stinner37296e82010-06-10 13:36:23 +00003419 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003420
3421 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003422 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00003423 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003424 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00003425 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003426 if (buffer == NULL)
3427 goto onError;
Nick Coghlanc72e4e62013-11-22 22:39:36 +10003428 unicode = _PyCodec_DecodeText(buffer, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003429 if (unicode == NULL)
3430 goto onError;
3431 if (!PyUnicode_Check(unicode)) {
3432 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003433 "'%.400s' decoder returned '%.400s' instead of 'str'; "
Nick Coghlan8b097b42013-11-13 23:49:21 +10003434 "use codecs.decode() to decode to arbitrary types",
Victor Stinner998b8062018-09-12 00:23:25 +02003435 encoding,
3436 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003437 Py_DECREF(unicode);
3438 goto onError;
3439 }
3440 Py_DECREF(buffer);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003441 return unicode_result(unicode);
Tim Petersced69f82003-09-16 20:30:58 +00003442
Benjamin Peterson29060642009-01-31 22:14:21 +00003443 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003444 Py_XDECREF(buffer);
3445 return NULL;
3446}
3447
Alexander Belopolsky40018472011-02-26 01:02:56 +00003448PyObject *
3449PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003450 const char *encoding,
3451 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003452{
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003453 if (!PyUnicode_Check(unicode)) {
3454 PyErr_BadArgument();
Serhiy Storchaka77eede32016-10-25 10:07:51 +03003455 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003456 }
3457
Serhiy Storchaka00939072016-10-27 21:05:49 +03003458 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3459 "PyUnicode_AsDecodedObject() is deprecated; "
3460 "use PyCodec_Decode() to decode from str", 1) < 0)
3461 return NULL;
3462
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003463 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003464 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003465
3466 /* Decode via the codec registry */
Serhiy Storchaka77eede32016-10-25 10:07:51 +03003467 return PyCodec_Decode(unicode, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003468}
3469
Alexander Belopolsky40018472011-02-26 01:02:56 +00003470PyObject *
3471PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003472 const char *encoding,
3473 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003474{
3475 PyObject *v;
3476
3477 if (!PyUnicode_Check(unicode)) {
3478 PyErr_BadArgument();
3479 goto onError;
3480 }
3481
Serhiy Storchaka00939072016-10-27 21:05:49 +03003482 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3483 "PyUnicode_AsDecodedUnicode() is deprecated; "
3484 "use PyCodec_Decode() to decode from str to str", 1) < 0)
3485 return NULL;
3486
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003487 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003488 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003489
3490 /* Decode via the codec registry */
3491 v = PyCodec_Decode(unicode, encoding, errors);
3492 if (v == NULL)
3493 goto onError;
3494 if (!PyUnicode_Check(v)) {
3495 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003496 "'%.400s' decoder returned '%.400s' instead of 'str'; "
Nick Coghlan8b097b42013-11-13 23:49:21 +10003497 "use codecs.decode() to decode to arbitrary types",
Victor Stinner998b8062018-09-12 00:23:25 +02003498 encoding,
3499 Py_TYPE(unicode)->tp_name);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003500 Py_DECREF(v);
3501 goto onError;
3502 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003503 return unicode_result(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003504
Benjamin Peterson29060642009-01-31 22:14:21 +00003505 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003506 return NULL;
3507}
3508
Alexander Belopolsky40018472011-02-26 01:02:56 +00003509PyObject *
3510PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003511 Py_ssize_t size,
3512 const char *encoding,
3513 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003514{
3515 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00003516
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02003517 unicode = PyUnicode_FromWideChar(s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003518 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003519 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003520 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
3521 Py_DECREF(unicode);
3522 return v;
3523}
3524
Alexander Belopolsky40018472011-02-26 01:02:56 +00003525PyObject *
3526PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003527 const char *encoding,
3528 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003529{
3530 PyObject *v;
3531
3532 if (!PyUnicode_Check(unicode)) {
3533 PyErr_BadArgument();
3534 goto onError;
3535 }
3536
Serhiy Storchaka00939072016-10-27 21:05:49 +03003537 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3538 "PyUnicode_AsEncodedObject() is deprecated; "
3539 "use PyUnicode_AsEncodedString() to encode from str to bytes "
3540 "or PyCodec_Encode() for generic encoding", 1) < 0)
3541 return NULL;
3542
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003543 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003544 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003545
3546 /* Encode via the codec registry */
3547 v = PyCodec_Encode(unicode, encoding, errors);
3548 if (v == NULL)
3549 goto onError;
3550 return v;
3551
Benjamin Peterson29060642009-01-31 22:14:21 +00003552 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003553 return NULL;
3554}
3555
Victor Stinner1b579672011-12-17 05:47:23 +01003556
Victor Stinner2cba6b82018-01-10 22:46:15 +01003557static PyObject *
Victor Stinner709d23d2019-05-02 14:56:30 -04003558unicode_encode_locale(PyObject *unicode, _Py_error_handler error_handler,
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003559 int current_locale)
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003560{
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003561 Py_ssize_t wlen;
3562 wchar_t *wstr = PyUnicode_AsWideCharString(unicode, &wlen);
3563 if (wstr == NULL) {
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003564 return NULL;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003565 }
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003566
Victor Stinnerbde9d6b2018-11-28 10:26:20 +01003567 if ((size_t)wlen != wcslen(wstr)) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003568 PyErr_SetString(PyExc_ValueError, "embedded null character");
Victor Stinnerbde9d6b2018-11-28 10:26:20 +01003569 PyMem_Free(wstr);
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003570 return NULL;
3571 }
3572
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003573 char *str;
3574 size_t error_pos;
3575 const char *reason;
3576 int res = _Py_EncodeLocaleEx(wstr, &str, &error_pos, &reason,
Victor Stinner3d4226a2018-08-29 22:21:32 +02003577 current_locale, error_handler);
Victor Stinnerbde9d6b2018-11-28 10:26:20 +01003578 PyMem_Free(wstr);
3579
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003580 if (res != 0) {
3581 if (res == -2) {
3582 PyObject *exc;
3583 exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnns",
3584 "locale", unicode,
3585 (Py_ssize_t)error_pos,
3586 (Py_ssize_t)(error_pos+1),
3587 reason);
3588 if (exc != NULL) {
3589 PyCodec_StrictErrors(exc);
3590 Py_DECREF(exc);
3591 }
Victor Stinner2cba6b82018-01-10 22:46:15 +01003592 }
Victor Stinner3d4226a2018-08-29 22:21:32 +02003593 else if (res == -3) {
3594 PyErr_SetString(PyExc_ValueError, "unsupported error handler");
3595 }
Victor Stinner2cba6b82018-01-10 22:46:15 +01003596 else {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003597 PyErr_NoMemory();
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003598 }
Victor Stinnerbde9d6b2018-11-28 10:26:20 +01003599 return NULL;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003600 }
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003601
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003602 PyObject *bytes = PyBytes_FromString(str);
3603 PyMem_RawFree(str);
3604 return bytes;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003605}
3606
Victor Stinnerad158722010-10-27 00:25:46 +00003607PyObject *
Victor Stinner2cba6b82018-01-10 22:46:15 +01003608PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
3609{
Victor Stinner709d23d2019-05-02 14:56:30 -04003610 _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
3611 return unicode_encode_locale(unicode, error_handler, 1);
Victor Stinner2cba6b82018-01-10 22:46:15 +01003612}
3613
3614PyObject *
Victor Stinnerad158722010-10-27 00:25:46 +00003615PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00003616{
Victor Stinnercaba55b2018-08-03 15:33:52 +02003617 PyInterpreterState *interp = _PyInterpreterState_GET_UNSAFE();
Victor Stinnerbf305cc2020-02-05 17:39:57 +01003618 if (interp->fs_codec.utf8) {
Victor Stinner709d23d2019-05-02 14:56:30 -04003619 return unicode_encode_utf8(unicode,
3620 interp->fs_codec.error_handler,
3621 interp->fs_codec.errors);
3622 }
Victor Stinnerbf305cc2020-02-05 17:39:57 +01003623#ifndef _Py_FORCE_UTF8_FS_ENCODING
3624 else if (interp->fs_codec.encoding) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00003625 return PyUnicode_AsEncodedString(unicode,
Victor Stinner709d23d2019-05-02 14:56:30 -04003626 interp->fs_codec.encoding,
3627 interp->fs_codec.errors);
Victor Stinnerc39211f2010-09-29 16:35:47 +00003628 }
Victor Stinnerad158722010-10-27 00:25:46 +00003629#endif
Victor Stinnerbf305cc2020-02-05 17:39:57 +01003630 else {
3631 /* Before _PyUnicode_InitEncodings() is called, the Python codec
3632 machinery is not ready and so cannot be used:
3633 use wcstombs() in this case. */
3634 const wchar_t *filesystem_errors = interp->config.filesystem_errors;
3635 assert(filesystem_errors != NULL);
3636 _Py_error_handler errors = get_error_handler_wide(filesystem_errors);
3637 assert(errors != _Py_ERROR_UNKNOWN);
3638#ifdef _Py_FORCE_UTF8_FS_ENCODING
3639 return unicode_encode_utf8(unicode, errors, NULL);
3640#else
3641 return unicode_encode_locale(unicode, errors, 0);
3642#endif
3643 }
Victor Stinnerae6265f2010-05-15 16:27:27 +00003644}
3645
Alexander Belopolsky40018472011-02-26 01:02:56 +00003646PyObject *
3647PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003648 const char *encoding,
3649 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003650{
3651 PyObject *v;
Victor Stinner942889a2016-09-05 15:40:10 -07003652 char buflower[11]; /* strlen("iso_8859_1\0") == 11, longest shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00003653
Guido van Rossumd57fd912000-03-10 22:53:23 +00003654 if (!PyUnicode_Check(unicode)) {
3655 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003656 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003657 }
Fred Drakee4315f52000-05-09 19:53:39 +00003658
Victor Stinner22eb6892019-06-26 00:51:05 +02003659 if (unicode_check_encoding_errors(encoding, errors) < 0) {
3660 return NULL;
3661 }
3662
Victor Stinner942889a2016-09-05 15:40:10 -07003663 if (encoding == NULL) {
3664 return _PyUnicode_AsUTF8String(unicode, errors);
3665 }
3666
Fred Drakee4315f52000-05-09 19:53:39 +00003667 /* Shortcuts for common default encodings */
Victor Stinner942889a2016-09-05 15:40:10 -07003668 if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3669 char *lower = buflower;
3670
3671 /* Fast paths */
3672 if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3673 lower += 3;
3674 if (*lower == '_') {
3675 /* Match "utf8" and "utf_8" */
3676 lower++;
3677 }
3678
3679 if (lower[0] == '8' && lower[1] == 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003680 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinner942889a2016-09-05 15:40:10 -07003681 }
3682 else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3683 return _PyUnicode_EncodeUTF16(unicode, errors, 0);
3684 }
3685 else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3686 return _PyUnicode_EncodeUTF32(unicode, errors, 0);
3687 }
Victor Stinnera5c68c32011-03-02 01:03:14 +00003688 }
Victor Stinner942889a2016-09-05 15:40:10 -07003689 else {
3690 if (strcmp(lower, "ascii") == 0
3691 || strcmp(lower, "us_ascii") == 0) {
3692 return _PyUnicode_AsASCIIString(unicode, errors);
3693 }
Steve Dowercc16be82016-09-08 10:35:16 -07003694#ifdef MS_WINDOWS
Victor Stinner942889a2016-09-05 15:40:10 -07003695 else if (strcmp(lower, "mbcs") == 0) {
3696 return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
3697 }
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003698#endif
Victor Stinner942889a2016-09-05 15:40:10 -07003699 else if (strcmp(lower, "latin1") == 0 ||
3700 strcmp(lower, "latin_1") == 0 ||
3701 strcmp(lower, "iso_8859_1") == 0 ||
3702 strcmp(lower, "iso8859_1") == 0) {
3703 return _PyUnicode_AsLatin1String(unicode, errors);
3704 }
3705 }
Victor Stinner37296e82010-06-10 13:36:23 +00003706 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003707
3708 /* Encode via the codec registry */
Nick Coghlanc72e4e62013-11-22 22:39:36 +10003709 v = _PyCodec_EncodeText(unicode, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003710 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003711 return NULL;
3712
3713 /* The normal path */
3714 if (PyBytes_Check(v))
3715 return v;
3716
3717 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003718 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003719 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003720 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003721
3722 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003723 "encoder %s returned bytearray instead of bytes; "
3724 "use codecs.encode() to encode to arbitrary types",
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003725 encoding);
3726 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003727 Py_DECREF(v);
3728 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003729 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003730
Serhiy Storchakafff9a312017-03-21 08:53:25 +02003731 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v),
3732 PyByteArray_GET_SIZE(v));
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003733 Py_DECREF(v);
3734 return b;
3735 }
3736
3737 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003738 "'%.400s' encoder returned '%.400s' instead of 'bytes'; "
Nick Coghlan8b097b42013-11-13 23:49:21 +10003739 "use codecs.encode() to encode to arbitrary types",
Victor Stinner998b8062018-09-12 00:23:25 +02003740 encoding,
3741 Py_TYPE(v)->tp_name);
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003742 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003743 return NULL;
3744}
3745
Alexander Belopolsky40018472011-02-26 01:02:56 +00003746PyObject *
3747PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003748 const char *encoding,
3749 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003750{
3751 PyObject *v;
3752
3753 if (!PyUnicode_Check(unicode)) {
3754 PyErr_BadArgument();
3755 goto onError;
3756 }
3757
Serhiy Storchaka00939072016-10-27 21:05:49 +03003758 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3759 "PyUnicode_AsEncodedUnicode() is deprecated; "
3760 "use PyCodec_Encode() to encode from str to str", 1) < 0)
3761 return NULL;
3762
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003763 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003764 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003765
3766 /* Encode via the codec registry */
3767 v = PyCodec_Encode(unicode, encoding, errors);
3768 if (v == NULL)
3769 goto onError;
3770 if (!PyUnicode_Check(v)) {
3771 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003772 "'%.400s' encoder returned '%.400s' instead of 'str'; "
Nick Coghlan8b097b42013-11-13 23:49:21 +10003773 "use codecs.encode() to encode to arbitrary types",
Victor Stinner998b8062018-09-12 00:23:25 +02003774 encoding,
3775 Py_TYPE(v)->tp_name);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003776 Py_DECREF(v);
3777 goto onError;
3778 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003779 return v;
Tim Petersced69f82003-09-16 20:30:58 +00003780
Benjamin Peterson29060642009-01-31 22:14:21 +00003781 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003782 return NULL;
3783}
3784
Victor Stinner2cba6b82018-01-10 22:46:15 +01003785static PyObject*
Victor Stinner709d23d2019-05-02 14:56:30 -04003786unicode_decode_locale(const char *str, Py_ssize_t len,
3787 _Py_error_handler errors, int current_locale)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003788{
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003789 if (str[len] != '\0' || (size_t)len != strlen(str)) {
3790 PyErr_SetString(PyExc_ValueError, "embedded null byte");
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003791 return NULL;
3792 }
3793
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003794 wchar_t *wstr;
3795 size_t wlen;
3796 const char *reason;
3797 int res = _Py_DecodeLocaleEx(str, &wstr, &wlen, &reason,
Victor Stinner709d23d2019-05-02 14:56:30 -04003798 current_locale, errors);
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003799 if (res != 0) {
3800 if (res == -2) {
3801 PyObject *exc;
3802 exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nns",
3803 "locale", str, len,
3804 (Py_ssize_t)wlen,
3805 (Py_ssize_t)(wlen + 1),
3806 reason);
3807 if (exc != NULL) {
3808 PyCodec_StrictErrors(exc);
3809 Py_DECREF(exc);
3810 }
Victor Stinner2cba6b82018-01-10 22:46:15 +01003811 }
Victor Stinner3d4226a2018-08-29 22:21:32 +02003812 else if (res == -3) {
3813 PyErr_SetString(PyExc_ValueError, "unsupported error handler");
3814 }
Victor Stinner2cba6b82018-01-10 22:46:15 +01003815 else {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003816 PyErr_NoMemory();
Victor Stinner2cba6b82018-01-10 22:46:15 +01003817 }
Victor Stinner2f197072011-12-17 07:08:30 +01003818 return NULL;
Victor Stinner2f197072011-12-17 07:08:30 +01003819 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003820
3821 PyObject *unicode = PyUnicode_FromWideChar(wstr, wlen);
3822 PyMem_RawFree(wstr);
3823 return unicode;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003824}
3825
3826PyObject*
Victor Stinner2cba6b82018-01-10 22:46:15 +01003827PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
3828 const char *errors)
3829{
Victor Stinner709d23d2019-05-02 14:56:30 -04003830 _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
3831 return unicode_decode_locale(str, len, error_handler, 1);
Victor Stinner2cba6b82018-01-10 22:46:15 +01003832}
3833
3834PyObject*
Victor Stinner1b579672011-12-17 05:47:23 +01003835PyUnicode_DecodeLocale(const char *str, const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003836{
3837 Py_ssize_t size = (Py_ssize_t)strlen(str);
Victor Stinner709d23d2019-05-02 14:56:30 -04003838 _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
3839 return unicode_decode_locale(str, size, error_handler, 1);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003840}
3841
3842
3843PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00003844PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003845 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00003846 return PyUnicode_DecodeFSDefaultAndSize(s, size);
3847}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003848
Christian Heimes5894ba72007-11-04 11:43:14 +00003849PyObject*
3850PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3851{
Victor Stinnercaba55b2018-08-03 15:33:52 +02003852 PyInterpreterState *interp = _PyInterpreterState_GET_UNSAFE();
Victor Stinnerbf305cc2020-02-05 17:39:57 +01003853 if (interp->fs_codec.utf8) {
Victor Stinner709d23d2019-05-02 14:56:30 -04003854 return unicode_decode_utf8(s, size,
3855 interp->fs_codec.error_handler,
3856 interp->fs_codec.errors,
3857 NULL);
3858 }
Victor Stinnerbf305cc2020-02-05 17:39:57 +01003859#ifndef _Py_FORCE_UTF8_FS_ENCODING
3860 else if (interp->fs_codec.encoding) {
Steve Dower78057b42016-11-06 19:35:08 -08003861 return PyUnicode_Decode(s, size,
Victor Stinner709d23d2019-05-02 14:56:30 -04003862 interp->fs_codec.encoding,
3863 interp->fs_codec.errors);
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003864 }
Victor Stinnerad158722010-10-27 00:25:46 +00003865#endif
Victor Stinnerbf305cc2020-02-05 17:39:57 +01003866 else {
3867 /* Before _PyUnicode_InitEncodings() is called, the Python codec
3868 machinery is not ready and so cannot be used:
3869 use mbstowcs() in this case. */
3870 const wchar_t *filesystem_errors = interp->config.filesystem_errors;
3871 assert(filesystem_errors != NULL);
3872 _Py_error_handler errors = get_error_handler_wide(filesystem_errors);
3873 assert(errors != _Py_ERROR_UNKNOWN);
3874#ifdef _Py_FORCE_UTF8_FS_ENCODING
3875 return unicode_decode_utf8(s, size, errors, NULL, NULL);
3876#else
3877 return unicode_decode_locale(s, size, errors, 0);
3878#endif
3879 }
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003880}
3881
Martin v. Löwis011e8422009-05-05 04:43:17 +00003882
3883int
3884PyUnicode_FSConverter(PyObject* arg, void* addr)
3885{
Brett Cannonec6ce872016-09-06 15:50:29 -07003886 PyObject *path = NULL;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003887 PyObject *output = NULL;
3888 Py_ssize_t size;
3889 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003890 if (arg == NULL) {
3891 Py_DECREF(*(PyObject**)addr);
Benjamin Petersona4d33b32015-11-15 21:57:39 -08003892 *(PyObject**)addr = NULL;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003893 return 1;
3894 }
Brett Cannonec6ce872016-09-06 15:50:29 -07003895 path = PyOS_FSPath(arg);
3896 if (path == NULL) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03003897 return 0;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003898 }
Brett Cannonec6ce872016-09-06 15:50:29 -07003899 if (PyBytes_Check(path)) {
3900 output = path;
3901 }
3902 else { // PyOS_FSPath() guarantees its returned value is bytes or str.
3903 output = PyUnicode_EncodeFSDefault(path);
3904 Py_DECREF(path);
3905 if (!output) {
3906 return 0;
3907 }
3908 assert(PyBytes_Check(output));
3909 }
3910
Victor Stinner0ea2a462010-04-30 00:22:08 +00003911 size = PyBytes_GET_SIZE(output);
3912 data = PyBytes_AS_STRING(output);
Victor Stinner12174a52014-08-15 23:17:38 +02003913 if ((size_t)size != strlen(data)) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003914 PyErr_SetString(PyExc_ValueError, "embedded null byte");
Martin v. Löwis011e8422009-05-05 04:43:17 +00003915 Py_DECREF(output);
3916 return 0;
3917 }
3918 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003919 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003920}
3921
3922
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003923int
3924PyUnicode_FSDecoder(PyObject* arg, void* addr)
3925{
Brett Cannona5711202016-09-06 19:36:01 -07003926 int is_buffer = 0;
3927 PyObject *path = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003928 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003929 if (arg == NULL) {
3930 Py_DECREF(*(PyObject**)addr);
Serhiy Storchaka40db90c2017-04-20 21:19:31 +03003931 *(PyObject**)addr = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003932 return 1;
3933 }
Brett Cannona5711202016-09-06 19:36:01 -07003934
3935 is_buffer = PyObject_CheckBuffer(arg);
3936 if (!is_buffer) {
3937 path = PyOS_FSPath(arg);
3938 if (path == NULL) {
Serhiy Storchakafebc3322016-08-06 23:29:29 +03003939 return 0;
3940 }
Brett Cannona5711202016-09-06 19:36:01 -07003941 }
3942 else {
3943 path = arg;
3944 Py_INCREF(arg);
3945 }
3946
3947 if (PyUnicode_Check(path)) {
Brett Cannona5711202016-09-06 19:36:01 -07003948 output = path;
3949 }
3950 else if (PyBytes_Check(path) || is_buffer) {
3951 PyObject *path_bytes = NULL;
3952
3953 if (!PyBytes_Check(path) &&
3954 PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
Victor Stinner998b8062018-09-12 00:23:25 +02003955 "path should be string, bytes, or os.PathLike, not %.200s",
3956 Py_TYPE(arg)->tp_name)) {
3957 Py_DECREF(path);
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003958 return 0;
Brett Cannona5711202016-09-06 19:36:01 -07003959 }
3960 path_bytes = PyBytes_FromObject(path);
3961 Py_DECREF(path);
3962 if (!path_bytes) {
3963 return 0;
3964 }
3965 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(path_bytes),
3966 PyBytes_GET_SIZE(path_bytes));
3967 Py_DECREF(path_bytes);
3968 if (!output) {
3969 return 0;
3970 }
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003971 }
Serhiy Storchaka9305d832016-06-18 13:53:36 +03003972 else {
3973 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003974 "path should be string, bytes, or os.PathLike, not %.200s",
3975 Py_TYPE(arg)->tp_name);
Brett Cannona5711202016-09-06 19:36:01 -07003976 Py_DECREF(path);
Serhiy Storchaka9305d832016-06-18 13:53:36 +03003977 return 0;
3978 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05003979 if (PyUnicode_READY(output) == -1) {
Victor Stinner065836e2011-10-27 01:56:33 +02003980 Py_DECREF(output);
3981 return 0;
3982 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003983 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02003984 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003985 PyErr_SetString(PyExc_ValueError, "embedded null character");
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003986 Py_DECREF(output);
3987 return 0;
3988 }
3989 *(PyObject**)addr = output;
3990 return Py_CLEANUP_SUPPORTED;
3991}
3992
3993
Serhiy Storchaka2a404b62017-01-22 23:07:07 +02003994const char *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003995PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003996{
Christian Heimesf3863112007-11-22 07:46:41 +00003997 PyObject *bytes;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003998
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00003999 if (!PyUnicode_Check(unicode)) {
4000 PyErr_BadArgument();
4001 return NULL;
4002 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004003 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00004004 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004005
Victor Stinnere90fe6a2011-10-01 16:48:13 +02004006 if (PyUnicode_UTF8(unicode) == NULL) {
4007 assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03004008 bytes = _PyUnicode_AsUTF8String(unicode, NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004009 if (bytes == NULL)
4010 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004011 _PyUnicode_UTF8(unicode) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
4012 if (_PyUnicode_UTF8(unicode) == NULL) {
Victor Stinnera5afb582013-10-29 01:28:23 +01004013 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004014 Py_DECREF(bytes);
4015 return NULL;
4016 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004017 _PyUnicode_UTF8_LENGTH(unicode) = PyBytes_GET_SIZE(bytes);
Christian Heimesf051e432016-09-13 20:22:02 +02004018 memcpy(_PyUnicode_UTF8(unicode),
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004019 PyBytes_AS_STRING(bytes),
4020 _PyUnicode_UTF8_LENGTH(unicode) + 1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004021 Py_DECREF(bytes);
4022 }
4023
4024 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02004025 *psize = PyUnicode_UTF8_LENGTH(unicode);
4026 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00004027}
4028
Serhiy Storchaka2a404b62017-01-22 23:07:07 +02004029const char *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004030PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00004031{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004032 return PyUnicode_AsUTF8AndSize(unicode, NULL);
4033}
4034
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004035Py_UNICODE *
4036PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
4037{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004038 if (!PyUnicode_Check(unicode)) {
4039 PyErr_BadArgument();
4040 return NULL;
4041 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03004042 Py_UNICODE *w = _PyUnicode_WSTR(unicode);
4043 if (w == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004044 /* Non-ASCII compact unicode object */
Serhiy Storchakac46db922018-10-23 22:58:24 +03004045 assert(_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND);
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004046 assert(PyUnicode_IS_READY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004047
Serhiy Storchakac46db922018-10-23 22:58:24 +03004048 Py_ssize_t wlen = unicode_get_widechar_size(unicode);
4049 if ((size_t)wlen > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
4050 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004051 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004052 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03004053 w = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) * (wlen + 1));
4054 if (w == NULL) {
4055 PyErr_NoMemory();
4056 return NULL;
4057 }
4058 unicode_copy_as_widechar(unicode, w, wlen + 1);
4059 _PyUnicode_WSTR(unicode) = w;
4060 if (!PyUnicode_IS_COMPACT_ASCII(unicode)) {
4061 _PyUnicode_WSTR_LENGTH(unicode) = wlen;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004062 }
4063 }
4064 if (size != NULL)
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004065 *size = PyUnicode_WSTR_LENGTH(unicode);
Serhiy Storchakac46db922018-10-23 22:58:24 +03004066 return w;
Martin v. Löwis5b222132007-06-10 09:51:05 +00004067}
4068
Alexander Belopolsky40018472011-02-26 01:02:56 +00004069Py_UNICODE *
4070PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004071{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004072 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004073}
4074
Serhiy Storchakaf7eae0a2017-06-28 08:30:06 +03004075const Py_UNICODE *
4076_PyUnicode_AsUnicode(PyObject *unicode)
4077{
4078 Py_ssize_t size;
4079 const Py_UNICODE *wstr;
4080
4081 wstr = PyUnicode_AsUnicodeAndSize(unicode, &size);
4082 if (wstr && wcslen(wstr) != (size_t)size) {
4083 PyErr_SetString(PyExc_ValueError, "embedded null character");
4084 return NULL;
4085 }
4086 return wstr;
4087}
4088
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004089
Alexander Belopolsky40018472011-02-26 01:02:56 +00004090Py_ssize_t
4091PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004092{
4093 if (!PyUnicode_Check(unicode)) {
4094 PyErr_BadArgument();
4095 goto onError;
4096 }
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02004097 if (_PyUnicode_WSTR(unicode) == NULL) {
4098 if (PyUnicode_AsUnicode(unicode) == NULL)
4099 goto onError;
4100 }
4101 return PyUnicode_WSTR_LENGTH(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004102
Benjamin Peterson29060642009-01-31 22:14:21 +00004103 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004104 return -1;
4105}
4106
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004107Py_ssize_t
4108PyUnicode_GetLength(PyObject *unicode)
4109{
Victor Stinner07621332012-06-16 04:53:46 +02004110 if (!PyUnicode_Check(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004111 PyErr_BadArgument();
4112 return -1;
4113 }
Victor Stinner07621332012-06-16 04:53:46 +02004114 if (PyUnicode_READY(unicode) == -1)
4115 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004116 return PyUnicode_GET_LENGTH(unicode);
4117}
4118
4119Py_UCS4
4120PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
4121{
Victor Stinner69ed0f42013-04-09 21:48:24 +02004122 void *data;
4123 int kind;
4124
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +03004125 if (!PyUnicode_Check(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02004126 PyErr_BadArgument();
4127 return (Py_UCS4)-1;
4128 }
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +03004129 if (PyUnicode_READY(unicode) == -1) {
4130 return (Py_UCS4)-1;
4131 }
Victor Stinnerc4b49542011-12-11 22:44:26 +01004132 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02004133 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004134 return (Py_UCS4)-1;
4135 }
Victor Stinner69ed0f42013-04-09 21:48:24 +02004136 data = PyUnicode_DATA(unicode);
4137 kind = PyUnicode_KIND(unicode);
4138 return PyUnicode_READ(kind, data, index);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004139}
4140
4141int
4142PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
4143{
4144 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02004145 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004146 return -1;
4147 }
Victor Stinner488fa492011-12-12 00:01:39 +01004148 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerc4b49542011-12-11 22:44:26 +01004149 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02004150 PyErr_SetString(PyExc_IndexError, "string index out of range");
4151 return -1;
4152 }
Victor Stinner488fa492011-12-12 00:01:39 +01004153 if (unicode_check_modifiable(unicode))
Victor Stinnercd9950f2011-10-02 00:34:53 +02004154 return -1;
Victor Stinnerc9590ad2012-03-04 01:34:37 +01004155 if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
4156 PyErr_SetString(PyExc_ValueError, "character out of range");
4157 return -1;
4158 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004159 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
4160 index, ch);
4161 return 0;
4162}
4163
/* Return the name of the default encoding, the constant string "utf-8". */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
4169
Victor Stinner554f3f02010-06-16 23:33:54 +00004170/* create or adjust a UnicodeDecodeError */
4171static void
4172make_decode_exception(PyObject **exceptionObject,
4173 const char *encoding,
4174 const char *input, Py_ssize_t length,
4175 Py_ssize_t startpos, Py_ssize_t endpos,
4176 const char *reason)
4177{
4178 if (*exceptionObject == NULL) {
4179 *exceptionObject = PyUnicodeDecodeError_Create(
4180 encoding, input, length, startpos, endpos, reason);
4181 }
4182 else {
4183 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
4184 goto onError;
4185 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
4186 goto onError;
4187 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
4188 goto onError;
4189 }
4190 return;
4191
4192onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02004193 Py_CLEAR(*exceptionObject);
Victor Stinner554f3f02010-06-16 23:33:54 +00004194}
4195
Steve Dowercc16be82016-09-08 10:35:16 -07004196#ifdef MS_WINDOWS
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004197static int
4198widechar_resize(wchar_t **buf, Py_ssize_t *size, Py_ssize_t newsize)
4199{
4200 if (newsize > *size) {
4201 wchar_t *newbuf = *buf;
4202 if (PyMem_Resize(newbuf, wchar_t, newsize) == NULL) {
4203 PyErr_NoMemory();
4204 return -1;
4205 }
4206 *buf = newbuf;
4207 }
4208 *size = newsize;
4209 return 0;
4210}
4211
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004212/* error handling callback helper:
4213 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00004214 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004215 and adjust various state variables.
4216 return 0 on success, -1 on error
4217*/
4218
Alexander Belopolsky40018472011-02-26 01:02:56 +00004219static int
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004220unicode_decode_call_errorhandler_wchar(
4221 const char *errors, PyObject **errorHandler,
4222 const char *encoding, const char *reason,
4223 const char **input, const char **inend, Py_ssize_t *startinpos,
4224 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004225 wchar_t **buf, Py_ssize_t *bufsize, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004226{
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004227 static const char *argparse = "Un;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004228
4229 PyObject *restuple = NULL;
4230 PyObject *repunicode = NULL;
Victor Stinner596a6c42011-11-09 00:02:18 +01004231 Py_ssize_t outsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004232 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004233 Py_ssize_t requiredsize;
4234 Py_ssize_t newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004235 PyObject *inputobj = NULL;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004236 wchar_t *repwstr;
4237 Py_ssize_t repwlen;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004238
4239 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004240 *errorHandler = PyCodec_LookupError(errors);
4241 if (*errorHandler == NULL)
4242 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004243 }
4244
Victor Stinner554f3f02010-06-16 23:33:54 +00004245 make_decode_exception(exceptionObject,
4246 encoding,
4247 *input, *inend - *input,
4248 *startinpos, *endinpos,
4249 reason);
4250 if (*exceptionObject == NULL)
4251 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004252
Jeroen Demeyer196a5302019-07-04 12:31:34 +02004253 restuple = _PyObject_CallOneArg(*errorHandler, *exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004254 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004255 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004256 if (!PyTuple_Check(restuple)) {
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004257 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00004258 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004259 }
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004260 if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00004261 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004262
4263 /* Copy back the bytes variables, which might have been modified by the
4264 callback */
4265 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4266 if (!inputobj)
4267 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004268 *input = PyBytes_AS_STRING(inputobj);
4269 insize = PyBytes_GET_SIZE(inputobj);
4270 *inend = *input + insize;
4271 /* we can DECREF safely, as the exception has another reference,
4272 so the object won't go away. */
4273 Py_DECREF(inputobj);
4274
4275 if (newpos<0)
4276 newpos = insize+newpos;
4277 if (newpos<0 || newpos>insize) {
Victor Stinnera33bce02014-07-04 22:47:46 +02004278 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004279 goto onError;
4280 }
4281
4282 repwstr = PyUnicode_AsUnicodeAndSize(repunicode, &repwlen);
4283 if (repwstr == NULL)
4284 goto onError;
4285 /* need more space? (at least enough for what we
4286 have+the replacement+the rest of the string (starting
4287 at the new input position), so we won't have to check space
4288 when there are no errors in the rest of the string) */
Benjamin Peterson2b76ce62014-09-29 18:50:06 -04004289 requiredsize = *outpos;
4290 if (requiredsize > PY_SSIZE_T_MAX - repwlen)
4291 goto overflow;
4292 requiredsize += repwlen;
4293 if (requiredsize > PY_SSIZE_T_MAX - (insize - newpos))
4294 goto overflow;
4295 requiredsize += insize - newpos;
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004296 outsize = *bufsize;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004297 if (requiredsize > outsize) {
Benjamin Peterson2b76ce62014-09-29 18:50:06 -04004298 if (outsize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*outsize)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004299 requiredsize = 2*outsize;
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004300 if (widechar_resize(buf, bufsize, requiredsize) < 0) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004301 goto onError;
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004302 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004303 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004304 wcsncpy(*buf + *outpos, repwstr, repwlen);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004305 *outpos += repwlen;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004306 *endinpos = newpos;
4307 *inptr = *input + newpos;
4308
4309 /* we made it! */
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004310 Py_DECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004311 return 0;
4312
Benjamin Peterson2b76ce62014-09-29 18:50:06 -04004313 overflow:
4314 PyErr_SetString(PyExc_OverflowError,
4315 "decoded result is too long for a Python string");
4316
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004317 onError:
4318 Py_XDECREF(restuple);
4319 return -1;
4320}
Steve Dowercc16be82016-09-08 10:35:16 -07004321#endif /* MS_WINDOWS */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004322
4323static int
4324unicode_decode_call_errorhandler_writer(
4325 const char *errors, PyObject **errorHandler,
4326 const char *encoding, const char *reason,
4327 const char **input, const char **inend, Py_ssize_t *startinpos,
4328 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
4329 _PyUnicodeWriter *writer /* PyObject **output, Py_ssize_t *outpos */)
4330{
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004331 static const char *argparse = "Un;decoding error handler must return (str, int) tuple";
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004332
4333 PyObject *restuple = NULL;
4334 PyObject *repunicode = NULL;
4335 Py_ssize_t insize;
4336 Py_ssize_t newpos;
Victor Stinner170ca6f2013-04-18 00:25:28 +02004337 Py_ssize_t replen;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004338 Py_ssize_t remain;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004339 PyObject *inputobj = NULL;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004340 int need_to_grow = 0;
4341 const char *new_inptr;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004342
4343 if (*errorHandler == NULL) {
4344 *errorHandler = PyCodec_LookupError(errors);
4345 if (*errorHandler == NULL)
4346 goto onError;
4347 }
4348
4349 make_decode_exception(exceptionObject,
4350 encoding,
4351 *input, *inend - *input,
4352 *startinpos, *endinpos,
4353 reason);
4354 if (*exceptionObject == NULL)
4355 goto onError;
4356
Jeroen Demeyer196a5302019-07-04 12:31:34 +02004357 restuple = _PyObject_CallOneArg(*errorHandler, *exceptionObject);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004358 if (restuple == NULL)
4359 goto onError;
4360 if (!PyTuple_Check(restuple)) {
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004361 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004362 goto onError;
4363 }
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004364 if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004365 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004366
4367 /* Copy back the bytes variables, which might have been modified by the
4368 callback */
4369 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4370 if (!inputobj)
4371 goto onError;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004372 remain = *inend - *input - *endinpos;
Christian Heimes72b710a2008-05-26 13:28:38 +00004373 *input = PyBytes_AS_STRING(inputobj);
4374 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004375 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00004376 /* we can DECREF safely, as the exception has another reference,
4377 so the object won't go away. */
4378 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004379
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004380 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004381 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004382 if (newpos<0 || newpos>insize) {
Victor Stinnera33bce02014-07-04 22:47:46 +02004383 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00004384 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004385 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004386
Victor Stinner170ca6f2013-04-18 00:25:28 +02004387 replen = PyUnicode_GET_LENGTH(repunicode);
Serhiy Storchaka7e4b9052015-01-26 01:22:54 +02004388 if (replen > 1) {
4389 writer->min_length += replen - 1;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004390 need_to_grow = 1;
4391 }
4392 new_inptr = *input + newpos;
4393 if (*inend - new_inptr > remain) {
4394 /* We don't know the decoding algorithm here so we make the worst
4395 assumption that one byte decodes to one unicode character.
4396 If unfortunately one byte could decode to more unicode characters,
4397 the decoder may write out-of-bound then. Is it possible for the
4398 algorithms using this function? */
4399 writer->min_length += *inend - new_inptr - remain;
4400 need_to_grow = 1;
4401 }
4402 if (need_to_grow) {
Victor Stinner8f674cc2013-04-17 23:02:17 +02004403 writer->overallocate = 1;
Serhiy Storchakab7e2d672018-02-13 08:27:33 +02004404 if (_PyUnicodeWriter_Prepare(writer, writer->min_length - writer->pos,
Serhiy Storchaka7e4b9052015-01-26 01:22:54 +02004405 PyUnicode_MAX_CHAR_VALUE(repunicode)) == -1)
4406 goto onError;
4407 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004408 if (_PyUnicodeWriter_WriteStr(writer, repunicode) == -1)
Victor Stinner376cfa12013-04-17 23:58:16 +02004409 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004410
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004411 *endinpos = newpos;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004412 *inptr = new_inptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004413
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004414 /* we made it! */
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004415 Py_DECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004416 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004417
Benjamin Peterson29060642009-01-31 22:14:21 +00004418 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004419 Py_XDECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004420 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004421}
4422
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004423/* --- UTF-7 Codec -------------------------------------------------------- */
4424
/* See RFC2152 for details.  We encode conservatively and decode liberally. */

/* Three simple macros defining base-64.
   NOTE: IS_BASE64 and FROM_BASE64 expand their argument several times, so
   the argument must not have side effects. */

/* Is c a base-64 character?  True for A-Z, a-z, 0-9, '+' and '/'. */

#define IS_BASE64(c) \
    (((c) >= 'A' && (c) <= 'Z') ||     \
     ((c) >= 'a' && (c) <= 'z') ||     \
     ((c) >= '0' && (c) <= '9') ||     \
     (c) == '+' || (c) == '/')

/* given that c is a base-64 character, what is its base-64 value?
   A-Z map to 0-25, a-z to 26-51, 0-9 to 52-61, '+' to 62, and anything
   else (i.e. '/', given the precondition) to 63. */

#define FROM_BASE64(c)                                                  \
    (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :                           \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :                      \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :                      \
     (c) == '+' ? 62 : 63)

/* What is the base-64 character of the bottom 6 bits of n?
   Higher bits of n are masked off before the table lookup. */

#define TO_BASE64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
 * decoded as itself.  We are permissive on decoding; the only ASCII
 * byte not decoding to itself is the + which begins a base64
 * string.  The argument is evaluated twice. */

#define DECODE_DIRECT(c)                                \
    ((c) <= 127 && (c) != '+')
4457
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is indexed by the ASCII code (0..127) and is read-only, hence
 * const.
 */

static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * The range test (0 < c < 128) comes first so the table is never indexed
 * out of bounds.  The argument c is evaluated several times. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      (directWS && (utf7_category[(c)] == 2)) ||        \
      (directO && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004503
Alexander Belopolsky40018472011-02-26 01:02:56 +00004504PyObject *
4505PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004506 Py_ssize_t size,
4507 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004508{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004509 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
4510}
4511
/* The decoder.  The only state we preserve is our read position,
 * i.e. how many characters we have consumed.  So if we end in the
 * middle of a shift sequence we have to back off the read position
 * and the output to the beginning of the sequence, otherwise we lose
 * all the shift state (seen bits, number of bits seen, high
 * surrogate).
 *
 * Parameters:
 *   s, size   input bytes and their count
 *   errors    name of the error handler, passed through to
 *             unicode_decode_call_errorhandler_writer()
 *   consumed  if non-NULL, receives the number of input bytes consumed.
 *             When the input ends inside a shift sequence, this is the
 *             position of the '+' that opened it, and the output produced
 *             since then is dropped.  If NULL, ending inside a shift
 *             sequence with pending state is reported as an
 *             "unterminated shift sequence" error.
 *
 * Returns a new str object, or NULL with an exception set. */

PyObject *
PyUnicode_DecodeUTF7Stateful(const char *s,
                             Py_ssize_t size,
                             const char *errors,
                             Py_ssize_t *consumed)
{
    const char *starts = s;
    Py_ssize_t startinpos;
    Py_ssize_t endinpos;
    const char *e;
    _PyUnicodeWriter writer;
    const char *errmsg = "";
    int inShift = 0;                 /* inside a '+...' base-64 section? */
    Py_ssize_t shiftOutStart;        /* writer.pos when the section began */
    unsigned int base64bits = 0;     /* number of valid bits in base64buffer */
    unsigned long base64buffer = 0;  /* bits read but not yet emitted */
    Py_UCS4 surrogate = 0;           /* pending high surrogate, 0 if none */
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;

    if (size == 0) {
        if (consumed)
            *consumed = 0;
        _Py_RETURN_UNICODE_EMPTY();
    }

    /* Start off assuming it's all ASCII. Widen later as necessary. */
    _PyUnicodeWriter_Init(&writer);
    writer.min_length = size;

    shiftOutStart = 0;
    e = s + size;

    while (s < e) {
        Py_UCS4 ch;
      restart:
        /* Also entered by goto from the end-of-string error handling below
           when the error handler leaves input to process. */
        ch = (unsigned char) *s;

        if (inShift) { /* in a base-64 section */
            if (IS_BASE64(ch)) { /* consume a base-64 character */
                base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
                base64bits += 6;
                s++;
                if (base64bits >= 16) {
                    /* we have enough bits for a UTF-16 value */
                    Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
                    base64bits -= 16;
                    base64buffer &= (1 << base64bits) - 1; /* clear high bits */
                    assert(outCh <= 0xffff);
                    if (surrogate) {
                        /* expecting a second surrogate */
                        if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
                            Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
                            if (_PyUnicodeWriter_WriteCharInline(&writer, ch2) < 0)
                                goto onError;
                            surrogate = 0;
                            continue;
                        }
                        else {
                            /* unpaired high surrogate: emit it as is, then
                               handle outCh normally below */
                            if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
                                goto onError;
                            surrogate = 0;
                        }
                    }
                    if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
                        /* first surrogate */
                        surrogate = outCh;
                    }
                    else {
                        if (_PyUnicodeWriter_WriteCharInline(&writer, outCh) < 0)
                            goto onError;
                    }
                }
            }
            else { /* now leaving a base-64 section */
                inShift = 0;
                if (base64bits > 0) { /* left-over bits */
                    if (base64bits >= 6) {
                        /* We've seen at least one base-64 character */
                        s++;
                        errmsg = "partial character in shift sequence";
                        goto utf7Error;
                    }
                    else {
                        /* Some bits remain; they should be zero */
                        if (base64buffer != 0) {
                            s++;
                            errmsg = "non-zero padding bits in shift sequence";
                            goto utf7Error;
                        }
                    }
                }
                /* A pending high surrogate is written out only when the
                   terminating character decodes as itself; otherwise it is
                   discarded. */
                if (surrogate && DECODE_DIRECT(ch)) {
                    if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
                        goto onError;
                }
                surrogate = 0;
                if (ch == '-') {
                    /* '-' is absorbed; other terminating
                       characters are preserved */
                    s++;
                }
            }
        }
        else if ( ch == '+' ) {
            /* remember where the sequence starts, for error reporting and
               for backing off when the input ends mid-sequence */
            startinpos = s-starts;
            s++; /* consume '+' */
            if (s < e && *s == '-') { /* '+-' encodes '+' */
                s++;
                if (_PyUnicodeWriter_WriteCharInline(&writer, '+') < 0)
                    goto onError;
            }
            else if (s < e && !IS_BASE64(*s)) {
                s++;
                errmsg = "ill-formed sequence";
                goto utf7Error;
            }
            else { /* begin base64-encoded section */
                inShift = 1;
                surrogate = 0;
                shiftOutStart = writer.pos;
                base64bits = 0;
                base64buffer = 0;
            }
        }
        else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
            s++;
            if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
                goto onError;
        }
        else {
            startinpos = s-starts;
            s++;
            errmsg = "unexpected special character";
            goto utf7Error;
        }
        continue;
utf7Error:
        /* The handler may replace the input; starts, e and s are updated
           through the pointers passed here. */
        endinpos = s-starts;
        if (unicode_decode_call_errorhandler_writer(
                errors, &errorHandler,
                "utf7", errmsg,
                &starts, &e, &startinpos, &endinpos, &exc, &s,
                &writer))
            goto onError;
    }

    /* end of string */

    if (inShift && !consumed) { /* in shift sequence, no more to follow */
        /* if we're in an inconsistent state, that's an error */
        inShift = 0;
        if (surrogate ||
                (base64bits >= 6) ||
                (base64bits > 0 && base64buffer != 0)) {
            endinpos = size;
            if (unicode_decode_call_errorhandler_writer(
                    errors, &errorHandler,
                    "utf7", "unterminated shift sequence",
                    &starts, &e, &startinpos, &endinpos, &exc, &s,
                    &writer))
                goto onError;
            if (s < e)
                goto restart;
        }
    }

    /* return state */
    if (consumed) {
        if (inShift) {
            *consumed = startinpos;
            /* Output was produced inside the unfinished sequence and the
               writer holds non-ASCII data: build the result directly from
               the first shiftOutStart characters instead of finishing the
               writer.  NOTE(review): presumably this avoids a result whose
               storage kind was widened only by the dropped characters --
               confirm. */
            if (writer.pos != shiftOutStart && writer.maxchar > 127) {
                PyObject *result = PyUnicode_FromKindAndData(
                        writer.kind, writer.data, shiftOutStart);
                Py_XDECREF(errorHandler);
                Py_XDECREF(exc);
                _PyUnicodeWriter_Dealloc(&writer);
                return result;
            }
            writer.pos = shiftOutStart; /* back off output */
        }
        else {
            *consumed = s-starts;
        }
    }

    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return _PyUnicodeWriter_Finish(&writer);

  onError:
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    _PyUnicodeWriter_Dealloc(&writer);
    return NULL;
}
4715
4716
Alexander Belopolsky40018472011-02-26 01:02:56 +00004717PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004718_PyUnicode_EncodeUTF7(PyObject *str,
4719 int base64SetO,
4720 int base64WhiteSpace,
4721 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004722{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004723 int kind;
4724 void *data;
4725 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004726 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004727 int inShift = 0;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004728 Py_ssize_t i;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004729 unsigned int base64bits = 0;
4730 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004731 char * out;
4732 char * start;
4733
Benjamin Petersonbac79492012-01-14 13:34:47 -05004734 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004735 return NULL;
4736 kind = PyUnicode_KIND(str);
4737 data = PyUnicode_DATA(str);
4738 len = PyUnicode_GET_LENGTH(str);
4739
4740 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004741 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004742
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004743 /* It might be possible to tighten this worst case */
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004744 if (len > PY_SSIZE_T_MAX / 8)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004745 return PyErr_NoMemory();
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004746 v = PyBytes_FromStringAndSize(NULL, len * 8);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004747 if (v == NULL)
4748 return NULL;
4749
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004750 start = out = PyBytes_AS_STRING(v);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004751 for (i = 0; i < len; ++i) {
Victor Stinner0e368262011-11-10 20:12:49 +01004752 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004753
Antoine Pitrou244651a2009-05-04 18:56:13 +00004754 if (inShift) {
4755 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4756 /* shifting out */
4757 if (base64bits) { /* output remaining bits */
4758 *out++ = TO_BASE64(base64buffer << (6-base64bits));
4759 base64buffer = 0;
4760 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004761 }
4762 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004763 /* Characters not in the BASE64 set implicitly unshift the sequence
4764 so no '-' is required, except if the character is itself a '-' */
4765 if (IS_BASE64(ch) || ch == '-') {
4766 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004767 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004768 *out++ = (char) ch;
4769 }
4770 else {
4771 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00004772 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004773 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004774 else { /* not in a shift sequence */
4775 if (ch == '+') {
4776 *out++ = '+';
4777 *out++ = '-';
4778 }
4779 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4780 *out++ = (char) ch;
4781 }
4782 else {
4783 *out++ = '+';
4784 inShift = 1;
4785 goto encode_char;
4786 }
4787 }
4788 continue;
4789encode_char:
Antoine Pitrou244651a2009-05-04 18:56:13 +00004790 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01004791 assert(ch <= MAX_UNICODE);
Victor Stinner0d3721d2011-11-22 03:27:53 +01004792
Antoine Pitrou244651a2009-05-04 18:56:13 +00004793 /* code first surrogate */
4794 base64bits += 16;
Victor Stinner76df43d2012-10-30 01:42:39 +01004795 base64buffer = (base64buffer << 16) | Py_UNICODE_HIGH_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004796 while (base64bits >= 6) {
4797 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4798 base64bits -= 6;
4799 }
4800 /* prepare second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004801 ch = Py_UNICODE_LOW_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004802 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004803 base64bits += 16;
4804 base64buffer = (base64buffer << 16) | ch;
4805 while (base64bits >= 6) {
4806 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4807 base64bits -= 6;
4808 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00004809 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004810 if (base64bits)
4811 *out++= TO_BASE64(base64buffer << (6-base64bits) );
4812 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004813 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004814 if (_PyBytes_Resize(&v, out - start) < 0)
4815 return NULL;
4816 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004817}
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004818PyObject *
4819PyUnicode_EncodeUTF7(const Py_UNICODE *s,
4820 Py_ssize_t size,
4821 int base64SetO,
4822 int base64WhiteSpace,
4823 const char *errors)
4824{
4825 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02004826 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004827 if (tmp == NULL)
4828 return NULL;
Victor Stinner0e368262011-11-10 20:12:49 +01004829 result = _PyUnicode_EncodeUTF7(tmp, base64SetO,
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004830 base64WhiteSpace, errors);
4831 Py_DECREF(tmp);
4832 return result;
4833}
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004834
Antoine Pitrou244651a2009-05-04 18:56:13 +00004835#undef IS_BASE64
4836#undef FROM_BASE64
4837#undef TO_BASE64
4838#undef DECODE_DIRECT
4839#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004840
Guido van Rossumd57fd912000-03-10 22:53:23 +00004841/* --- UTF-8 Codec -------------------------------------------------------- */
4842
Alexander Belopolsky40018472011-02-26 01:02:56 +00004843PyObject *
4844PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004845 Py_ssize_t size,
4846 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004847{
Walter Dörwald69652032004-09-07 20:24:22 +00004848 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4849}
4850
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004851#include "stringlib/asciilib.h"
4852#include "stringlib/codecs.h"
4853#include "stringlib/undef.h"
4854
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004855#include "stringlib/ucs1lib.h"
4856#include "stringlib/codecs.h"
4857#include "stringlib/undef.h"
4858
4859#include "stringlib/ucs2lib.h"
4860#include "stringlib/codecs.h"
4861#include "stringlib/undef.h"
4862
4863#include "stringlib/ucs4lib.h"
4864#include "stringlib/codecs.h"
4865#include "stringlib/undef.h"
4866
Antoine Pitrouab868312009-01-10 15:40:25 +00004867/* Mask to quickly check whether a C 'long' contains a
4868 non-ASCII, UTF8-encoded char. */
4869#if (SIZEOF_LONG == 8)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004870# define ASCII_CHAR_MASK 0x8080808080808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004871#elif (SIZEOF_LONG == 4)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004872# define ASCII_CHAR_MASK 0x80808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004873#else
4874# error C 'long' size should be either 4 or 8!
4875#endif
4876
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004877static Py_ssize_t
4878ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004879{
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004880 const char *p = start;
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004881 const char *aligned_end = (const char *) _Py_ALIGN_DOWN(end, SIZEOF_LONG);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004882
Antoine Pitrou8b0e9842013-05-11 15:58:34 +02004883 /*
4884 * Issue #17237: m68k is a bit different from most architectures in
4885 * that objects do not use "natural alignment" - for example, int and
4886 * long are only aligned at 2-byte boundaries. Therefore the assert()
4887 * won't work; also, tests have shown that skipping the "optimised
4888 * version" will even speed up m68k.
4889 */
4890#if !defined(__m68k__)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004891#if SIZEOF_LONG <= SIZEOF_VOID_P
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004892 assert(_Py_IS_ALIGNED(dest, SIZEOF_LONG));
4893 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004894 /* Fast path, see in STRINGLIB(utf8_decode) for
4895 an explanation. */
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02004896 /* Help allocation */
4897 const char *_p = p;
4898 Py_UCS1 * q = dest;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004899 while (_p < aligned_end) {
4900 unsigned long value = *(const unsigned long *) _p;
4901 if (value & ASCII_CHAR_MASK)
Benjamin Peterson29060642009-01-31 22:14:21 +00004902 break;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004903 *((unsigned long *)q) = value;
4904 _p += SIZEOF_LONG;
4905 q += SIZEOF_LONG;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004906 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004907 p = _p;
4908 while (p < end) {
4909 if ((unsigned char)*p & 0x80)
4910 break;
4911 *q++ = *p++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004912 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004913 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004914 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004915#endif
Antoine Pitrou8b0e9842013-05-11 15:58:34 +02004916#endif
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004917 while (p < end) {
4918 /* Fast path, see in STRINGLIB(utf8_decode) in stringlib/codecs.h
4919 for an explanation. */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004920 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02004921 /* Help allocation */
4922 const char *_p = p;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004923 while (_p < aligned_end) {
4924 unsigned long value = *(unsigned long *) _p;
4925 if (value & ASCII_CHAR_MASK)
4926 break;
4927 _p += SIZEOF_LONG;
4928 }
4929 p = _p;
4930 if (_p == end)
4931 break;
4932 }
4933 if ((unsigned char)*p & 0x80)
4934 break;
4935 ++p;
4936 }
4937 memcpy(dest, start, p - start);
4938 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004939}
Antoine Pitrouab868312009-01-10 15:40:25 +00004940
Victor Stinner709d23d2019-05-02 14:56:30 -04004941static PyObject *
4942unicode_decode_utf8(const char *s, Py_ssize_t size,
4943 _Py_error_handler error_handler, const char *errors,
4944 Py_ssize_t *consumed)
Victor Stinner785938e2011-12-11 20:09:03 +01004945{
Victor Stinner785938e2011-12-11 20:09:03 +01004946 if (size == 0) {
4947 if (consumed)
4948 *consumed = 0;
Serhiy Storchaka678db842013-01-26 12:16:36 +02004949 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner785938e2011-12-11 20:09:03 +01004950 }
4951
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004952 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
4953 if (size == 1 && (unsigned char)s[0] < 128) {
Victor Stinner785938e2011-12-11 20:09:03 +01004954 if (consumed)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004955 *consumed = 1;
4956 return get_latin1_char((unsigned char)s[0]);
Victor Stinner785938e2011-12-11 20:09:03 +01004957 }
4958
Inada Naoki770847a2019-06-24 12:30:24 +09004959 const char *starts = s;
4960 const char *end = s + size;
Victor Stinner785938e2011-12-11 20:09:03 +01004961
Inada Naoki770847a2019-06-24 12:30:24 +09004962 // fast path: try ASCII string.
4963 PyObject *u = PyUnicode_New(size, 127);
4964 if (u == NULL) {
4965 return NULL;
4966 }
4967 s += ascii_decode(s, end, PyUnicode_DATA(u));
4968 if (s == end) {
4969 return u;
4970 }
4971
4972 // Use _PyUnicodeWriter after fast path is failed.
4973 _PyUnicodeWriter writer;
4974 _PyUnicodeWriter_InitWithBuffer(&writer, u);
4975 writer.pos = s - starts;
4976
4977 Py_ssize_t startinpos, endinpos;
4978 const char *errmsg = "";
4979 PyObject *error_handler_obj = NULL;
4980 PyObject *exc = NULL;
4981
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004982 while (s < end) {
4983 Py_UCS4 ch;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004984 int kind = writer.kind;
Victor Stinner1d65d912015-10-05 13:43:50 +02004985
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004986 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004987 if (PyUnicode_IS_ASCII(writer.buffer))
4988 ch = asciilib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004989 else
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004990 ch = ucs1lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004991 } else if (kind == PyUnicode_2BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004992 ch = ucs2lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004993 } else {
4994 assert(kind == PyUnicode_4BYTE_KIND);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004995 ch = ucs4lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004996 }
4997
4998 switch (ch) {
4999 case 0:
5000 if (s == end || consumed)
5001 goto End;
5002 errmsg = "unexpected end of data";
5003 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02005004 endinpos = end - starts;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005005 break;
5006 case 1:
5007 errmsg = "invalid start byte";
5008 startinpos = s - starts;
5009 endinpos = startinpos + 1;
5010 break;
5011 case 2:
Serhiy Storchaka894263b2019-06-25 11:54:18 +03005012 if (consumed && (unsigned char)s[0] == 0xED && end - s == 2
5013 && (unsigned char)s[1] >= 0xA0 && (unsigned char)s[1] <= 0xBF)
5014 {
5015 /* Truncated surrogate code in range D800-DFFF */
Serhiy Storchaka7a465cb2019-03-30 08:23:38 +02005016 goto End;
5017 }
Serhiy Storchaka894263b2019-06-25 11:54:18 +03005018 /* fall through */
5019 case 3:
5020 case 4:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005021 errmsg = "invalid continuation byte";
5022 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02005023 endinpos = startinpos + ch - 1;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005024 break;
5025 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005026 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005027 goto onError;
5028 continue;
5029 }
5030
Victor Stinner1d65d912015-10-05 13:43:50 +02005031 if (error_handler == _Py_ERROR_UNKNOWN)
Victor Stinner3d4226a2018-08-29 22:21:32 +02005032 error_handler = _Py_GetErrorHandler(errors);
Victor Stinner1d65d912015-10-05 13:43:50 +02005033
5034 switch (error_handler) {
5035 case _Py_ERROR_IGNORE:
5036 s += (endinpos - startinpos);
5037 break;
5038
5039 case _Py_ERROR_REPLACE:
5040 if (_PyUnicodeWriter_WriteCharInline(&writer, 0xfffd) < 0)
5041 goto onError;
5042 s += (endinpos - startinpos);
5043 break;
5044
5045 case _Py_ERROR_SURROGATEESCAPE:
Victor Stinner74e8fac2015-10-05 13:49:26 +02005046 {
5047 Py_ssize_t i;
5048
Victor Stinner1d65d912015-10-05 13:43:50 +02005049 if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
5050 goto onError;
Victor Stinner74e8fac2015-10-05 13:49:26 +02005051 for (i=startinpos; i<endinpos; i++) {
Victor Stinner1d65d912015-10-05 13:43:50 +02005052 ch = (Py_UCS4)(unsigned char)(starts[i]);
5053 PyUnicode_WRITE(writer.kind, writer.data, writer.pos,
5054 ch + 0xdc00);
5055 writer.pos++;
5056 }
5057 s += (endinpos - startinpos);
5058 break;
Victor Stinner74e8fac2015-10-05 13:49:26 +02005059 }
Victor Stinner1d65d912015-10-05 13:43:50 +02005060
5061 default:
5062 if (unicode_decode_call_errorhandler_writer(
5063 errors, &error_handler_obj,
5064 "utf-8", errmsg,
5065 &starts, &end, &startinpos, &endinpos, &exc, &s,
5066 &writer))
5067 goto onError;
5068 }
Victor Stinner785938e2011-12-11 20:09:03 +01005069 }
5070
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005071End:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005072 if (consumed)
5073 *consumed = s - starts;
5074
Victor Stinner1d65d912015-10-05 13:43:50 +02005075 Py_XDECREF(error_handler_obj);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005076 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005077 return _PyUnicodeWriter_Finish(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005078
5079onError:
Victor Stinner1d65d912015-10-05 13:43:50 +02005080 Py_XDECREF(error_handler_obj);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005081 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005082 _PyUnicodeWriter_Dealloc(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005083 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01005084}
5085
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005086
Victor Stinner709d23d2019-05-02 14:56:30 -04005087PyObject *
5088PyUnicode_DecodeUTF8Stateful(const char *s,
5089 Py_ssize_t size,
5090 const char *errors,
5091 Py_ssize_t *consumed)
5092{
5093 return unicode_decode_utf8(s, size, _Py_ERROR_UNKNOWN, errors, consumed);
5094}
5095
5096
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005097/* UTF-8 decoder: use surrogateescape error handler if 'surrogateescape' is
5098 non-zero, use strict error handler otherwise.
Victor Stinner0d92c4f2012-11-12 23:32:21 +01005099
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005100 On success, write a pointer to a newly allocated wide character string into
5101 *wstr (use PyMem_RawFree() to free the memory) and write the output length
5102 (in number of wchar_t units) into *wlen (if wlen is set).
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005103
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005104 On memory allocation failure, return -1.
5105
5106 On decoding error (if surrogateescape is zero), return -2. If wlen is
5107 non-NULL, write the start of the illegal byte sequence into *wlen. If reason
5108 is not NULL, write the decoding error message into *reason. */
5109int
5110_Py_DecodeUTF8Ex(const char *s, Py_ssize_t size, wchar_t **wstr, size_t *wlen,
Victor Stinner3d4226a2018-08-29 22:21:32 +02005111 const char **reason, _Py_error_handler errors)
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005112{
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005113 const char *orig_s = s;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005114 const char *e;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005115 wchar_t *unicode;
5116 Py_ssize_t outpos;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005117
Victor Stinner3d4226a2018-08-29 22:21:32 +02005118 int surrogateescape = 0;
5119 int surrogatepass = 0;
5120 switch (errors)
5121 {
5122 case _Py_ERROR_STRICT:
5123 break;
5124 case _Py_ERROR_SURROGATEESCAPE:
5125 surrogateescape = 1;
5126 break;
5127 case _Py_ERROR_SURROGATEPASS:
5128 surrogatepass = 1;
5129 break;
5130 default:
5131 return -3;
5132 }
5133
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005134 /* Note: size will always be longer than the resulting Unicode
5135 character count */
Victor Stinner91106cd2017-12-13 12:29:09 +01005136 if (PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) < (size + 1)) {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005137 return -1;
Victor Stinner91106cd2017-12-13 12:29:09 +01005138 }
5139
Victor Stinner6f8eeee2013-07-07 22:57:45 +02005140 unicode = PyMem_RawMalloc((size + 1) * sizeof(wchar_t));
Victor Stinner91106cd2017-12-13 12:29:09 +01005141 if (!unicode) {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005142 return -1;
Victor Stinner91106cd2017-12-13 12:29:09 +01005143 }
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005144
5145 /* Unpack UTF-8 encoded data */
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005146 e = s + size;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005147 outpos = 0;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005148 while (s < e) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005149 Py_UCS4 ch;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005150#if SIZEOF_WCHAR_T == 4
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005151 ch = ucs4lib_utf8_decode(&s, e, (Py_UCS4 *)unicode, &outpos);
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005152#else
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005153 ch = ucs2lib_utf8_decode(&s, e, (Py_UCS2 *)unicode, &outpos);
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005154#endif
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005155 if (ch > 0xFF) {
5156#if SIZEOF_WCHAR_T == 4
Barry Warsawb2e57942017-09-14 18:13:16 -07005157 Py_UNREACHABLE();
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005158#else
Serhiy Storchakab6266432016-11-12 14:28:06 +02005159 assert(ch > 0xFFFF && ch <= MAX_UNICODE);
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005160 /* write a surrogate pair */
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005161 unicode[outpos++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
5162 unicode[outpos++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
5163#endif
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005164 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005165 else {
Victor Stinner3d4226a2018-08-29 22:21:32 +02005166 if (!ch && s == e) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005167 break;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005168 }
Victor Stinner3d4226a2018-08-29 22:21:32 +02005169
5170 if (surrogateescape) {
5171 unicode[outpos++] = 0xDC00 + (unsigned char)*s++;
5172 }
5173 else {
5174 /* Is it a valid three-byte code? */
5175 if (surrogatepass
5176 && (e - s) >= 3
5177 && (s[0] & 0xf0) == 0xe0
5178 && (s[1] & 0xc0) == 0x80
5179 && (s[2] & 0xc0) == 0x80)
5180 {
5181 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
5182 s += 3;
5183 unicode[outpos++] = ch;
5184 }
5185 else {
5186 PyMem_RawFree(unicode );
5187 if (reason != NULL) {
5188 switch (ch) {
5189 case 0:
5190 *reason = "unexpected end of data";
5191 break;
5192 case 1:
5193 *reason = "invalid start byte";
5194 break;
5195 /* 2, 3, 4 */
5196 default:
5197 *reason = "invalid continuation byte";
5198 break;
5199 }
5200 }
5201 if (wlen != NULL) {
5202 *wlen = s - orig_s;
5203 }
5204 return -2;
5205 }
5206 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005207 }
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005208 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005209 unicode[outpos] = L'\0';
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005210 if (wlen) {
5211 *wlen = outpos;
Victor Stinner91106cd2017-12-13 12:29:09 +01005212 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005213 *wstr = unicode;
5214 return 0;
5215}
5216
Victor Stinner5f9cf232019-03-19 01:46:25 +01005217
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005218wchar_t*
Victor Stinner5f9cf232019-03-19 01:46:25 +01005219_Py_DecodeUTF8_surrogateescape(const char *arg, Py_ssize_t arglen,
5220 size_t *wlen)
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005221{
5222 wchar_t *wstr;
Victor Stinner5f9cf232019-03-19 01:46:25 +01005223 int res = _Py_DecodeUTF8Ex(arg, arglen,
5224 &wstr, wlen,
5225 NULL, _Py_ERROR_SURROGATEESCAPE);
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005226 if (res != 0) {
Victor Stinner5f9cf232019-03-19 01:46:25 +01005227 /* _Py_DecodeUTF8Ex() must support _Py_ERROR_SURROGATEESCAPE */
5228 assert(res != -3);
5229 if (wlen) {
5230 *wlen = (size_t)res;
5231 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005232 return NULL;
5233 }
5234 return wstr;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005235}
5236
Antoine Pitrouab868312009-01-10 15:40:25 +00005237
Victor Stinnere47e6982017-12-21 15:45:16 +01005238/* UTF-8 encoder using the surrogateescape error handler .
5239
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005240 On success, return 0 and write the newly allocated character string (use
5241 PyMem_Free() to free the memory) into *str.
Victor Stinnere47e6982017-12-21 15:45:16 +01005242
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005243 On encoding failure, return -2 and write the position of the invalid
5244 surrogate character into *error_pos (if error_pos is set) and the decoding
5245 error message into *reason (if reason is set).
Victor Stinnere47e6982017-12-21 15:45:16 +01005246
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005247 On memory allocation failure, return -1. */
5248int
5249_Py_EncodeUTF8Ex(const wchar_t *text, char **str, size_t *error_pos,
Victor Stinner3d4226a2018-08-29 22:21:32 +02005250 const char **reason, int raw_malloc, _Py_error_handler errors)
Victor Stinnere47e6982017-12-21 15:45:16 +01005251{
5252 const Py_ssize_t max_char_size = 4;
5253 Py_ssize_t len = wcslen(text);
5254
5255 assert(len >= 0);
5256
Victor Stinner3d4226a2018-08-29 22:21:32 +02005257 int surrogateescape = 0;
5258 int surrogatepass = 0;
5259 switch (errors)
5260 {
5261 case _Py_ERROR_STRICT:
5262 break;
5263 case _Py_ERROR_SURROGATEESCAPE:
5264 surrogateescape = 1;
5265 break;
5266 case _Py_ERROR_SURROGATEPASS:
5267 surrogatepass = 1;
5268 break;
5269 default:
5270 return -3;
5271 }
5272
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005273 if (len > PY_SSIZE_T_MAX / max_char_size - 1) {
5274 return -1;
5275 }
Victor Stinnere47e6982017-12-21 15:45:16 +01005276 char *bytes;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005277 if (raw_malloc) {
5278 bytes = PyMem_RawMalloc((len + 1) * max_char_size);
Victor Stinnere47e6982017-12-21 15:45:16 +01005279 }
5280 else {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005281 bytes = PyMem_Malloc((len + 1) * max_char_size);
Victor Stinnere47e6982017-12-21 15:45:16 +01005282 }
5283 if (bytes == NULL) {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005284 return -1;
Victor Stinnere47e6982017-12-21 15:45:16 +01005285 }
5286
5287 char *p = bytes;
5288 Py_ssize_t i;
Victor Stinner3d4226a2018-08-29 22:21:32 +02005289 for (i = 0; i < len; ) {
5290 Py_ssize_t ch_pos = i;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005291 Py_UCS4 ch = text[i];
Victor Stinner3d4226a2018-08-29 22:21:32 +02005292 i++;
5293#if Py_UNICODE_SIZE == 2
5294 if (Py_UNICODE_IS_HIGH_SURROGATE(ch)
5295 && i < len
5296 && Py_UNICODE_IS_LOW_SURROGATE(text[i]))
5297 {
5298 ch = Py_UNICODE_JOIN_SURROGATES(ch, text[i]);
5299 i++;
5300 }
5301#endif
Victor Stinnere47e6982017-12-21 15:45:16 +01005302
5303 if (ch < 0x80) {
5304 /* Encode ASCII */
5305 *p++ = (char) ch;
5306
5307 }
5308 else if (ch < 0x0800) {
5309 /* Encode Latin-1 */
5310 *p++ = (char)(0xc0 | (ch >> 6));
5311 *p++ = (char)(0x80 | (ch & 0x3f));
5312 }
Victor Stinner3d4226a2018-08-29 22:21:32 +02005313 else if (Py_UNICODE_IS_SURROGATE(ch) && !surrogatepass) {
Victor Stinnere47e6982017-12-21 15:45:16 +01005314 /* surrogateescape error handler */
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005315 if (!surrogateescape || !(0xDC80 <= ch && ch <= 0xDCFF)) {
Victor Stinnere47e6982017-12-21 15:45:16 +01005316 if (error_pos != NULL) {
Victor Stinner3d4226a2018-08-29 22:21:32 +02005317 *error_pos = (size_t)ch_pos;
Victor Stinnere47e6982017-12-21 15:45:16 +01005318 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005319 if (reason != NULL) {
5320 *reason = "encoding error";
5321 }
5322 if (raw_malloc) {
5323 PyMem_RawFree(bytes);
5324 }
5325 else {
5326 PyMem_Free(bytes);
5327 }
5328 return -2;
Victor Stinnere47e6982017-12-21 15:45:16 +01005329 }
5330 *p++ = (char)(ch & 0xff);
5331 }
5332 else if (ch < 0x10000) {
5333 *p++ = (char)(0xe0 | (ch >> 12));
5334 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
5335 *p++ = (char)(0x80 | (ch & 0x3f));
5336 }
5337 else { /* ch >= 0x10000 */
5338 assert(ch <= MAX_UNICODE);
5339 /* Encode UCS4 Unicode ordinals */
5340 *p++ = (char)(0xf0 | (ch >> 18));
5341 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
5342 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
5343 *p++ = (char)(0x80 | (ch & 0x3f));
5344 }
5345 }
5346 *p++ = '\0';
5347
5348 size_t final_size = (p - bytes);
Victor Stinner9dd76202017-12-21 16:20:32 +01005349 char *bytes2;
5350 if (raw_malloc) {
5351 bytes2 = PyMem_RawRealloc(bytes, final_size);
5352 }
5353 else {
5354 bytes2 = PyMem_Realloc(bytes, final_size);
5355 }
Victor Stinnere47e6982017-12-21 15:45:16 +01005356 if (bytes2 == NULL) {
5357 if (error_pos != NULL) {
5358 *error_pos = (size_t)-1;
5359 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005360 if (raw_malloc) {
5361 PyMem_RawFree(bytes);
5362 }
5363 else {
5364 PyMem_Free(bytes);
5365 }
5366 return -1;
Victor Stinnere47e6982017-12-21 15:45:16 +01005367 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005368 *str = bytes2;
5369 return 0;
Victor Stinnere47e6982017-12-21 15:45:16 +01005370}
5371
5372
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005373/* Primary internal function which creates utf8 encoded bytes objects.
5374
5375 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00005376 and allocate exactly as much space needed at the end. Else allocate the
5377 maximum possible needed (4 result bytes per Unicode character), and return
5378 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00005379*/
Victor Stinner709d23d2019-05-02 14:56:30 -04005380static PyObject *
5381unicode_encode_utf8(PyObject *unicode, _Py_error_handler error_handler,
5382 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005383{
Victor Stinner6099a032011-12-18 14:22:26 +01005384 enum PyUnicode_Kind kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005385 void *data;
5386 Py_ssize_t size;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00005387
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005388 if (!PyUnicode_Check(unicode)) {
5389 PyErr_BadArgument();
5390 return NULL;
5391 }
5392
5393 if (PyUnicode_READY(unicode) == -1)
5394 return NULL;
5395
Victor Stinnere90fe6a2011-10-01 16:48:13 +02005396 if (PyUnicode_UTF8(unicode))
5397 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
5398 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005399
5400 kind = PyUnicode_KIND(unicode);
5401 data = PyUnicode_DATA(unicode);
5402 size = PyUnicode_GET_LENGTH(unicode);
5403
Benjamin Petersonead6b532011-12-20 17:23:42 -06005404 switch (kind) {
Victor Stinner6099a032011-12-18 14:22:26 +01005405 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07005406 Py_UNREACHABLE();
Victor Stinner6099a032011-12-18 14:22:26 +01005407 case PyUnicode_1BYTE_KIND:
5408 /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
5409 assert(!PyUnicode_IS_ASCII(unicode));
Victor Stinner709d23d2019-05-02 14:56:30 -04005410 return ucs1lib_utf8_encoder(unicode, data, size, error_handler, errors);
Victor Stinner6099a032011-12-18 14:22:26 +01005411 case PyUnicode_2BYTE_KIND:
Victor Stinner709d23d2019-05-02 14:56:30 -04005412 return ucs2lib_utf8_encoder(unicode, data, size, error_handler, errors);
Victor Stinner6099a032011-12-18 14:22:26 +01005413 case PyUnicode_4BYTE_KIND:
Victor Stinner709d23d2019-05-02 14:56:30 -04005414 return ucs4lib_utf8_encoder(unicode, data, size, error_handler, errors);
Tim Peters602f7402002-04-27 18:03:26 +00005415 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005416}
5417
Alexander Belopolsky40018472011-02-26 01:02:56 +00005418PyObject *
Victor Stinner709d23d2019-05-02 14:56:30 -04005419_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
5420{
5421 return unicode_encode_utf8(unicode, _Py_ERROR_UNKNOWN, errors);
5422}
5423
5424
5425PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005426PyUnicode_EncodeUTF8(const Py_UNICODE *s,
5427 Py_ssize_t size,
5428 const char *errors)
5429{
5430 PyObject *v, *unicode;
5431
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02005432 unicode = PyUnicode_FromWideChar(s, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005433 if (unicode == NULL)
5434 return NULL;
5435 v = _PyUnicode_AsUTF8String(unicode, errors);
5436 Py_DECREF(unicode);
5437 return v;
5438}
5439
5440PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005441PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005442{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005443 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005444}
5445
Walter Dörwald41980ca2007-08-16 21:55:45 +00005446/* --- UTF-32 Codec ------------------------------------------------------- */
5447
5448PyObject *
5449PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005450 Py_ssize_t size,
5451 const char *errors,
5452 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005453{
5454 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
5455}
5456
5457PyObject *
5458PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005459 Py_ssize_t size,
5460 const char *errors,
5461 int *byteorder,
5462 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005463{
5464 const char *starts = s;
5465 Py_ssize_t startinpos;
5466 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005467 _PyUnicodeWriter writer;
Mark Dickinson7db923c2010-06-12 09:10:14 +00005468 const unsigned char *q, *e;
Victor Stinnere64322e2012-10-30 23:12:47 +01005469 int le, bo = 0; /* assume native ordering by default */
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005470 const char *encoding;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005471 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00005472 PyObject *errorHandler = NULL;
5473 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00005474
Walter Dörwald41980ca2007-08-16 21:55:45 +00005475 q = (unsigned char *)s;
5476 e = q + size;
5477
5478 if (byteorder)
5479 bo = *byteorder;
5480
5481 /* Check for BOM marks (U+FEFF) in the input and adjust current
5482 byte order setting accordingly. In native mode, the leading BOM
5483 mark is skipped, in all other modes, it is copied to the output
5484 stream as-is (giving a ZWNBSP character). */
Victor Stinnere64322e2012-10-30 23:12:47 +01005485 if (bo == 0 && size >= 4) {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005486 Py_UCS4 bom = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
Victor Stinnere64322e2012-10-30 23:12:47 +01005487 if (bom == 0x0000FEFF) {
5488 bo = -1;
5489 q += 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00005490 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005491 else if (bom == 0xFFFE0000) {
5492 bo = 1;
5493 q += 4;
5494 }
5495 if (byteorder)
5496 *byteorder = bo;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005497 }
5498
Victor Stinnere64322e2012-10-30 23:12:47 +01005499 if (q == e) {
5500 if (consumed)
5501 *consumed = size;
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02005502 _Py_RETURN_UNICODE_EMPTY();
Walter Dörwald41980ca2007-08-16 21:55:45 +00005503 }
5504
Victor Stinnere64322e2012-10-30 23:12:47 +01005505#ifdef WORDS_BIGENDIAN
5506 le = bo < 0;
5507#else
5508 le = bo <= 0;
5509#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005510 encoding = le ? "utf-32-le" : "utf-32-be";
Victor Stinnere64322e2012-10-30 23:12:47 +01005511
Victor Stinner8f674cc2013-04-17 23:02:17 +02005512 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02005513 writer.min_length = (e - q + 3) / 4;
5514 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005515 goto onError;
Victor Stinnere64322e2012-10-30 23:12:47 +01005516
Victor Stinnere64322e2012-10-30 23:12:47 +01005517 while (1) {
5518 Py_UCS4 ch = 0;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005519 Py_UCS4 maxch = PyUnicode_MAX_CHAR_VALUE(writer.buffer);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005520
Victor Stinnere64322e2012-10-30 23:12:47 +01005521 if (e - q >= 4) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005522 enum PyUnicode_Kind kind = writer.kind;
5523 void *data = writer.data;
Victor Stinnere64322e2012-10-30 23:12:47 +01005524 const unsigned char *last = e - 4;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005525 Py_ssize_t pos = writer.pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01005526 if (le) {
5527 do {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005528 ch = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
Victor Stinnere64322e2012-10-30 23:12:47 +01005529 if (ch > maxch)
5530 break;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005531 if (kind != PyUnicode_1BYTE_KIND &&
5532 Py_UNICODE_IS_SURROGATE(ch))
5533 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005534 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01005535 q += 4;
5536 } while (q <= last);
5537 }
5538 else {
5539 do {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005540 ch = ((unsigned int)q[0] << 24) | (q[1] << 16) | (q[2] << 8) | q[3];
Victor Stinnere64322e2012-10-30 23:12:47 +01005541 if (ch > maxch)
5542 break;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005543 if (kind != PyUnicode_1BYTE_KIND &&
5544 Py_UNICODE_IS_SURROGATE(ch))
5545 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005546 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01005547 q += 4;
5548 } while (q <= last);
5549 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005550 writer.pos = pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01005551 }
5552
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005553 if (Py_UNICODE_IS_SURROGATE(ch)) {
Serhiy Storchakad3faf432015-01-18 11:28:37 +02005554 errmsg = "code point in surrogate code point range(0xd800, 0xe000)";
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005555 startinpos = ((const char *)q) - starts;
5556 endinpos = startinpos + 4;
5557 }
5558 else if (ch <= maxch) {
Victor Stinnere64322e2012-10-30 23:12:47 +01005559 if (q == e || consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005560 break;
Victor Stinnere64322e2012-10-30 23:12:47 +01005561 /* remaining bytes at the end? (size should be divisible by 4) */
Benjamin Peterson29060642009-01-31 22:14:21 +00005562 errmsg = "truncated data";
Victor Stinnere64322e2012-10-30 23:12:47 +01005563 startinpos = ((const char *)q) - starts;
5564 endinpos = ((const char *)e) - starts;
Benjamin Peterson29060642009-01-31 22:14:21 +00005565 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005566 else {
5567 if (ch < 0x110000) {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005568 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinnere64322e2012-10-30 23:12:47 +01005569 goto onError;
5570 q += 4;
5571 continue;
5572 }
Serhiy Storchakad3faf432015-01-18 11:28:37 +02005573 errmsg = "code point not in range(0x110000)";
Victor Stinnere64322e2012-10-30 23:12:47 +01005574 startinpos = ((const char *)q) - starts;
5575 endinpos = startinpos + 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00005576 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005577
5578 /* The remaining input chars are ignored if the callback
5579 chooses to skip the input */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005580 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00005581 errors, &errorHandler,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005582 encoding, errmsg,
Benjamin Peterson29060642009-01-31 22:14:21 +00005583 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005584 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005585 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005586 }
5587
Walter Dörwald41980ca2007-08-16 21:55:45 +00005588 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005589 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005590
Walter Dörwald41980ca2007-08-16 21:55:45 +00005591 Py_XDECREF(errorHandler);
5592 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005593 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005594
Benjamin Peterson29060642009-01-31 22:14:21 +00005595 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005596 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005597 Py_XDECREF(errorHandler);
5598 Py_XDECREF(exc);
5599 return NULL;
5600}
5601
5602PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005603_PyUnicode_EncodeUTF32(PyObject *str,
5604 const char *errors,
5605 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005606{
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005607 enum PyUnicode_Kind kind;
5608 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005609 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005610 PyObject *v;
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005611 uint32_t *out;
Christian Heimes743e0cd2012-10-17 23:52:17 +02005612#if PY_LITTLE_ENDIAN
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005613 int native_ordering = byteorder <= 0;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005614#else
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005615 int native_ordering = byteorder >= 0;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005616#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005617 const char *encoding;
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005618 Py_ssize_t nsize, pos;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005619 PyObject *errorHandler = NULL;
5620 PyObject *exc = NULL;
5621 PyObject *rep = NULL;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005622
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005623 if (!PyUnicode_Check(str)) {
5624 PyErr_BadArgument();
5625 return NULL;
5626 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005627 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005628 return NULL;
5629 kind = PyUnicode_KIND(str);
5630 data = PyUnicode_DATA(str);
5631 len = PyUnicode_GET_LENGTH(str);
5632
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005633 if (len > PY_SSIZE_T_MAX / 4 - (byteorder == 0))
Serhiy Storchaka30793282014-01-04 22:44:01 +02005634 return PyErr_NoMemory();
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005635 nsize = len + (byteorder == 0);
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01005636 v = PyBytes_FromStringAndSize(NULL, nsize * 4);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005637 if (v == NULL)
5638 return NULL;
5639
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005640 /* output buffer is 4-bytes aligned */
5641 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 4));
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005642 out = (uint32_t *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005643 if (byteorder == 0)
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005644 *out++ = 0xFEFF;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005645 if (len == 0)
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005646 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005647
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005648 if (byteorder == -1)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005649 encoding = "utf-32-le";
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005650 else if (byteorder == 1)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005651 encoding = "utf-32-be";
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005652 else
5653 encoding = "utf-32";
5654
5655 if (kind == PyUnicode_1BYTE_KIND) {
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005656 ucs1lib_utf32_encode((const Py_UCS1 *)data, len, &out, native_ordering);
5657 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005658 }
5659
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005660 pos = 0;
5661 while (pos < len) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005662 Py_ssize_t repsize, moreunits;
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005663
5664 if (kind == PyUnicode_2BYTE_KIND) {
5665 pos += ucs2lib_utf32_encode((const Py_UCS2 *)data + pos, len - pos,
5666 &out, native_ordering);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005667 }
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005668 else {
5669 assert(kind == PyUnicode_4BYTE_KIND);
5670 pos += ucs4lib_utf32_encode((const Py_UCS4 *)data + pos, len - pos,
5671 &out, native_ordering);
5672 }
5673 if (pos == len)
5674 break;
Guido van Rossum98297ee2007-11-06 21:34:58 +00005675
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005676 rep = unicode_encode_call_errorhandler(
5677 errors, &errorHandler,
5678 encoding, "surrogates not allowed",
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005679 str, &exc, pos, pos + 1, &pos);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005680 if (!rep)
5681 goto error;
5682
5683 if (PyBytes_Check(rep)) {
5684 repsize = PyBytes_GET_SIZE(rep);
5685 if (repsize & 3) {
5686 raise_encode_exception(&exc, encoding,
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005687 str, pos - 1, pos,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005688 "surrogates not allowed");
5689 goto error;
5690 }
5691 moreunits = repsize / 4;
5692 }
5693 else {
5694 assert(PyUnicode_Check(rep));
5695 if (PyUnicode_READY(rep) < 0)
5696 goto error;
5697 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5698 if (!PyUnicode_IS_ASCII(rep)) {
5699 raise_encode_exception(&exc, encoding,
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005700 str, pos - 1, pos,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005701 "surrogates not allowed");
5702 goto error;
5703 }
5704 }
5705
5706 /* four bytes are reserved for each surrogate */
5707 if (moreunits > 1) {
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005708 Py_ssize_t outpos = out - (uint32_t*) PyBytes_AS_STRING(v);
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03005709 if (moreunits >= (PY_SSIZE_T_MAX - PyBytes_GET_SIZE(v)) / 4) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005710 /* integer overflow */
5711 PyErr_NoMemory();
5712 goto error;
5713 }
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03005714 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + 4 * (moreunits - 1)) < 0)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005715 goto error;
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005716 out = (uint32_t*) PyBytes_AS_STRING(v) + outpos;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005717 }
5718
5719 if (PyBytes_Check(rep)) {
Christian Heimesf051e432016-09-13 20:22:02 +02005720 memcpy(out, PyBytes_AS_STRING(rep), repsize);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005721 out += moreunits;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005722 } else /* rep is unicode */ {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005723 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005724 ucs1lib_utf32_encode(PyUnicode_1BYTE_DATA(rep), repsize,
5725 &out, native_ordering);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005726 }
5727
5728 Py_CLEAR(rep);
5729 }
5730
5731 /* Cut back to size actually needed. This is necessary for, for example,
5732 encoding of a string containing isolated surrogates and the 'ignore'
5733 handler is used. */
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005734 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005735 if (nsize != PyBytes_GET_SIZE(v))
5736 _PyBytes_Resize(&v, nsize);
5737 Py_XDECREF(errorHandler);
5738 Py_XDECREF(exc);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005739 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005740 return v;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005741 error:
5742 Py_XDECREF(rep);
5743 Py_XDECREF(errorHandler);
5744 Py_XDECREF(exc);
5745 Py_XDECREF(v);
5746 return NULL;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005747}
5748
Alexander Belopolsky40018472011-02-26 01:02:56 +00005749PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005750PyUnicode_EncodeUTF32(const Py_UNICODE *s,
5751 Py_ssize_t size,
5752 const char *errors,
5753 int byteorder)
5754{
5755 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02005756 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005757 if (tmp == NULL)
5758 return NULL;
5759 result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder);
5760 Py_DECREF(tmp);
5761 return result;
5762}
5763
5764PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005765PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005766{
Victor Stinnerb960b342011-11-20 19:12:52 +01005767 return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005768}
5769
Guido van Rossumd57fd912000-03-10 22:53:23 +00005770/* --- UTF-16 Codec ------------------------------------------------------- */
5771
Tim Peters772747b2001-08-09 22:21:55 +00005772PyObject *
5773PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005774 Py_ssize_t size,
5775 const char *errors,
5776 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005777{
Walter Dörwald69652032004-09-07 20:24:22 +00005778 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5779}
5780
5781PyObject *
5782PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005783 Py_ssize_t size,
5784 const char *errors,
5785 int *byteorder,
5786 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00005787{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005788 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005789 Py_ssize_t startinpos;
5790 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005791 _PyUnicodeWriter writer;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005792 const unsigned char *q, *e;
Tim Peters772747b2001-08-09 22:21:55 +00005793 int bo = 0; /* assume native ordering by default */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005794 int native_ordering;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00005795 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005796 PyObject *errorHandler = NULL;
5797 PyObject *exc = NULL;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005798 const char *encoding;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005799
Tim Peters772747b2001-08-09 22:21:55 +00005800 q = (unsigned char *)s;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005801 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005802
5803 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00005804 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005805
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005806 /* Check for BOM marks (U+FEFF) in the input and adjust current
5807 byte order setting accordingly. In native mode, the leading BOM
5808 mark is skipped, in all other modes, it is copied to the output
5809 stream as-is (giving a ZWNBSP character). */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005810 if (bo == 0 && size >= 2) {
5811 const Py_UCS4 bom = (q[1] << 8) | q[0];
5812 if (bom == 0xFEFF) {
5813 q += 2;
5814 bo = -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00005815 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005816 else if (bom == 0xFFFE) {
5817 q += 2;
5818 bo = 1;
5819 }
5820 if (byteorder)
5821 *byteorder = bo;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005822 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005823
Antoine Pitrou63065d72012-05-15 23:48:04 +02005824 if (q == e) {
5825 if (consumed)
5826 *consumed = size;
Serhiy Storchaka678db842013-01-26 12:16:36 +02005827 _Py_RETURN_UNICODE_EMPTY();
Tim Peters772747b2001-08-09 22:21:55 +00005828 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005829
Christian Heimes743e0cd2012-10-17 23:52:17 +02005830#if PY_LITTLE_ENDIAN
Antoine Pitrou63065d72012-05-15 23:48:04 +02005831 native_ordering = bo <= 0;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005832 encoding = bo <= 0 ? "utf-16-le" : "utf-16-be";
Antoine Pitrouab868312009-01-10 15:40:25 +00005833#else
Antoine Pitrou63065d72012-05-15 23:48:04 +02005834 native_ordering = bo >= 0;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005835 encoding = bo >= 0 ? "utf-16-be" : "utf-16-le";
Antoine Pitrouab868312009-01-10 15:40:25 +00005836#endif
Tim Peters772747b2001-08-09 22:21:55 +00005837
Antoine Pitrou63065d72012-05-15 23:48:04 +02005838 /* Note: size will always be longer than the resulting Unicode
Xiang Zhang2c7fd462018-01-31 20:48:05 +08005839 character count normally. Error handler will take care of
5840 resizing when needed. */
Victor Stinner8f674cc2013-04-17 23:02:17 +02005841 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02005842 writer.min_length = (e - q + 1) / 2;
5843 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005844 goto onError;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005845
Antoine Pitrou63065d72012-05-15 23:48:04 +02005846 while (1) {
5847 Py_UCS4 ch = 0;
5848 if (e - q >= 2) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005849 int kind = writer.kind;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005850 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005851 if (PyUnicode_IS_ASCII(writer.buffer))
Antoine Pitrou63065d72012-05-15 23:48:04 +02005852 ch = asciilib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005853 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005854 native_ordering);
5855 else
5856 ch = ucs1lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005857 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005858 native_ordering);
5859 } else if (kind == PyUnicode_2BYTE_KIND) {
5860 ch = ucs2lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005861 (Py_UCS2*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005862 native_ordering);
5863 } else {
5864 assert(kind == PyUnicode_4BYTE_KIND);
5865 ch = ucs4lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005866 (Py_UCS4*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005867 native_ordering);
Antoine Pitrouab868312009-01-10 15:40:25 +00005868 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005869 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005870
Antoine Pitrou63065d72012-05-15 23:48:04 +02005871 switch (ch)
5872 {
5873 case 0:
5874 /* remaining byte at the end? (size should be even) */
5875 if (q == e || consumed)
5876 goto End;
5877 errmsg = "truncated data";
5878 startinpos = ((const char *)q) - starts;
5879 endinpos = ((const char *)e) - starts;
5880 break;
5881 /* The remaining input chars are ignored if the callback
5882 chooses to skip the input */
5883 case 1:
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02005884 q -= 2;
5885 if (consumed)
Serhiy Storchakaae3b32a2013-01-08 23:40:52 +02005886 goto End;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005887 errmsg = "unexpected end of data";
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02005888 startinpos = ((const char *)q) - starts;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005889 endinpos = ((const char *)e) - starts;
5890 break;
5891 case 2:
5892 errmsg = "illegal encoding";
5893 startinpos = ((const char *)q) - 2 - starts;
5894 endinpos = startinpos + 2;
5895 break;
5896 case 3:
5897 errmsg = "illegal UTF-16 surrogate";
5898 startinpos = ((const char *)q) - 4 - starts;
5899 endinpos = startinpos + 2;
5900 break;
5901 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005902 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005903 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005904 continue;
5905 }
5906
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005907 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrouab868312009-01-10 15:40:25 +00005908 errors,
5909 &errorHandler,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005910 encoding, errmsg,
Antoine Pitrouab868312009-01-10 15:40:25 +00005911 &starts,
5912 (const char **)&e,
5913 &startinpos,
5914 &endinpos,
5915 &exc,
5916 (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005917 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005918 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005919 }
5920
Antoine Pitrou63065d72012-05-15 23:48:04 +02005921End:
Walter Dörwald69652032004-09-07 20:24:22 +00005922 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005923 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00005924
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005925 Py_XDECREF(errorHandler);
5926 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005927 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005928
Benjamin Peterson29060642009-01-31 22:14:21 +00005929 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005930 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005931 Py_XDECREF(errorHandler);
5932 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005933 return NULL;
5934}
5935
Tim Peters772747b2001-08-09 22:21:55 +00005936PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005937_PyUnicode_EncodeUTF16(PyObject *str,
5938 const char *errors,
5939 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005940{
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005941 enum PyUnicode_Kind kind;
5942 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005943 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005944 PyObject *v;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005945 unsigned short *out;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005946 Py_ssize_t pairs;
Christian Heimes743e0cd2012-10-17 23:52:17 +02005947#if PY_BIG_ENDIAN
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005948 int native_ordering = byteorder >= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005949#else
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005950 int native_ordering = byteorder <= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005951#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005952 const char *encoding;
5953 Py_ssize_t nsize, pos;
5954 PyObject *errorHandler = NULL;
5955 PyObject *exc = NULL;
5956 PyObject *rep = NULL;
Tim Peters772747b2001-08-09 22:21:55 +00005957
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005958 if (!PyUnicode_Check(str)) {
5959 PyErr_BadArgument();
5960 return NULL;
5961 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005962 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005963 return NULL;
5964 kind = PyUnicode_KIND(str);
5965 data = PyUnicode_DATA(str);
5966 len = PyUnicode_GET_LENGTH(str);
Victor Stinner0e368262011-11-10 20:12:49 +01005967
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005968 pairs = 0;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005969 if (kind == PyUnicode_4BYTE_KIND) {
5970 const Py_UCS4 *in = (const Py_UCS4 *)data;
5971 const Py_UCS4 *end = in + len;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005972 while (in < end) {
5973 if (*in++ >= 0x10000) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005974 pairs++;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005975 }
5976 }
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005977 }
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005978 if (len > PY_SSIZE_T_MAX / 2 - pairs - (byteorder == 0)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005979 return PyErr_NoMemory();
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005980 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005981 nsize = len + pairs + (byteorder == 0);
5982 v = PyBytes_FromStringAndSize(NULL, nsize * 2);
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005983 if (v == NULL) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005984 return NULL;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005985 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005986
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005987 /* output buffer is 2-bytes aligned */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02005988 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 2));
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005989 out = (unsigned short *)PyBytes_AS_STRING(v);
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005990 if (byteorder == 0) {
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005991 *out++ = 0xFEFF;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005992 }
5993 if (len == 0) {
Guido van Rossum98297ee2007-11-06 21:34:58 +00005994 goto done;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005995 }
Tim Peters772747b2001-08-09 22:21:55 +00005996
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005997 if (kind == PyUnicode_1BYTE_KIND) {
5998 ucs1lib_utf16_encode((const Py_UCS1 *)data, len, &out, native_ordering);
5999 goto done;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00006000 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006001
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006002 if (byteorder < 0) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006003 encoding = "utf-16-le";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006004 }
6005 else if (byteorder > 0) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006006 encoding = "utf-16-be";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006007 }
6008 else {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006009 encoding = "utf-16";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006010 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006011
6012 pos = 0;
6013 while (pos < len) {
6014 Py_ssize_t repsize, moreunits;
6015
6016 if (kind == PyUnicode_2BYTE_KIND) {
6017 pos += ucs2lib_utf16_encode((const Py_UCS2 *)data + pos, len - pos,
6018 &out, native_ordering);
6019 }
6020 else {
6021 assert(kind == PyUnicode_4BYTE_KIND);
6022 pos += ucs4lib_utf16_encode((const Py_UCS4 *)data + pos, len - pos,
6023 &out, native_ordering);
6024 }
6025 if (pos == len)
6026 break;
6027
6028 rep = unicode_encode_call_errorhandler(
6029 errors, &errorHandler,
6030 encoding, "surrogates not allowed",
6031 str, &exc, pos, pos + 1, &pos);
6032 if (!rep)
6033 goto error;
6034
6035 if (PyBytes_Check(rep)) {
6036 repsize = PyBytes_GET_SIZE(rep);
6037 if (repsize & 1) {
6038 raise_encode_exception(&exc, encoding,
6039 str, pos - 1, pos,
6040 "surrogates not allowed");
6041 goto error;
6042 }
6043 moreunits = repsize / 2;
6044 }
6045 else {
6046 assert(PyUnicode_Check(rep));
6047 if (PyUnicode_READY(rep) < 0)
6048 goto error;
6049 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
6050 if (!PyUnicode_IS_ASCII(rep)) {
6051 raise_encode_exception(&exc, encoding,
6052 str, pos - 1, pos,
6053 "surrogates not allowed");
6054 goto error;
6055 }
6056 }
6057
6058 /* two bytes are reserved for each surrogate */
6059 if (moreunits > 1) {
6060 Py_ssize_t outpos = out - (unsigned short*) PyBytes_AS_STRING(v);
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03006061 if (moreunits >= (PY_SSIZE_T_MAX - PyBytes_GET_SIZE(v)) / 2) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006062 /* integer overflow */
6063 PyErr_NoMemory();
6064 goto error;
6065 }
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03006066 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + 2 * (moreunits - 1)) < 0)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006067 goto error;
6068 out = (unsigned short*) PyBytes_AS_STRING(v) + outpos;
6069 }
6070
6071 if (PyBytes_Check(rep)) {
Christian Heimesf051e432016-09-13 20:22:02 +02006072 memcpy(out, PyBytes_AS_STRING(rep), repsize);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006073 out += moreunits;
6074 } else /* rep is unicode */ {
6075 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
6076 ucs1lib_utf16_encode(PyUnicode_1BYTE_DATA(rep), repsize,
6077 &out, native_ordering);
6078 }
6079
6080 Py_CLEAR(rep);
6081 }
6082
6083 /* Cut back to size actually needed. This is necessary for, for example,
6084 encoding of a string containing isolated surrogates and the 'ignore' handler
6085 is used. */
6086 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
6087 if (nsize != PyBytes_GET_SIZE(v))
6088 _PyBytes_Resize(&v, nsize);
6089 Py_XDECREF(errorHandler);
6090 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00006091 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006092 return v;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006093 error:
6094 Py_XDECREF(rep);
6095 Py_XDECREF(errorHandler);
6096 Py_XDECREF(exc);
6097 Py_XDECREF(v);
6098 return NULL;
6099#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00006100}
6101
Alexander Belopolsky40018472011-02-26 01:02:56 +00006102PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006103PyUnicode_EncodeUTF16(const Py_UNICODE *s,
6104 Py_ssize_t size,
6105 const char *errors,
6106 int byteorder)
6107{
6108 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02006109 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006110 if (tmp == NULL)
6111 return NULL;
6112 result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder);
6113 Py_DECREF(tmp);
6114 return result;
6115}
6116
6117PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00006118PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006119{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006120 return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006121}
6122
6123/* --- Unicode Escape Codec ----------------------------------------------- */
6124
/* File-scope pointer to a _PyUnicode_Name_CAPI table; starts out NULL.
   NOTE(review): presumably filled in on first use by the unicode-escape
   decoder for name lookups -- the assignment is not visible here, confirm. */
Fredrik Lundh06d12682001-01-24 07:59:11 +00006125static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00006126
Alexander Belopolsky40018472011-02-26 01:02:56 +00006127PyObject *
Eric V. Smith42454af2016-10-31 09:22:08 -04006128_PyUnicode_DecodeUnicodeEscape(const char *s,
6129 Py_ssize_t size,
6130 const char *errors,
6131 const char **first_invalid_escape)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006132{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006133 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006134 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006135 const char *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006136 PyObject *errorHandler = NULL;
6137 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006138
Eric V. Smith42454af2016-10-31 09:22:08 -04006139 // so we can remember if we've seen an invalid escape char or not
6140 *first_invalid_escape = NULL;
6141
Victor Stinner62ec3312016-09-06 17:04:34 -07006142 if (size == 0) {
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02006143 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner62ec3312016-09-06 17:04:34 -07006144 }
6145 /* Escaped strings will always be longer than the resulting
6146 Unicode string, so we start with size here and then reduce the
6147 length after conversion to the true value.
6148 (but if the error callback returns a long replacement string
6149 we'll have to allocate more space) */
Victor Stinner8f674cc2013-04-17 23:02:17 +02006150 _PyUnicodeWriter_Init(&writer);
Victor Stinner62ec3312016-09-06 17:04:34 -07006151 writer.min_length = size;
6152 if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
6153 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006154 }
6155
Guido van Rossumd57fd912000-03-10 22:53:23 +00006156 end = s + size;
6157 while (s < end) {
Victor Stinner62ec3312016-09-06 17:04:34 -07006158 unsigned char c = (unsigned char) *s++;
6159 Py_UCS4 ch;
6160 int count;
6161 Py_ssize_t startinpos;
6162 Py_ssize_t endinpos;
6163 const char *message;
6164
6165#define WRITE_ASCII_CHAR(ch) \
6166 do { \
6167 assert(ch <= 127); \
6168 assert(writer.pos < writer.size); \
6169 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6170 } while(0)
6171
6172#define WRITE_CHAR(ch) \
6173 do { \
6174 if (ch <= writer.maxchar) { \
6175 assert(writer.pos < writer.size); \
6176 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6177 } \
6178 else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
6179 goto onError; \
6180 } \
6181 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006182
6183 /* Non-escape characters are interpreted as Unicode ordinals */
Victor Stinner62ec3312016-09-06 17:04:34 -07006184 if (c != '\\') {
6185 WRITE_CHAR(c);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006186 continue;
6187 }
6188
Victor Stinner62ec3312016-09-06 17:04:34 -07006189 startinpos = s - starts - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006190 /* \ - Escapes */
Victor Stinner62ec3312016-09-06 17:04:34 -07006191 if (s >= end) {
6192 message = "\\ at end of string";
6193 goto error;
6194 }
6195 c = (unsigned char) *s++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006196
Victor Stinner62ec3312016-09-06 17:04:34 -07006197 assert(writer.pos < writer.size);
Guido van Rossum8ce8a782007-11-01 19:42:39 +00006198 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006199
Benjamin Peterson29060642009-01-31 22:14:21 +00006200 /* \x escapes */
Victor Stinner62ec3312016-09-06 17:04:34 -07006201 case '\n': continue;
6202 case '\\': WRITE_ASCII_CHAR('\\'); continue;
6203 case '\'': WRITE_ASCII_CHAR('\''); continue;
6204 case '\"': WRITE_ASCII_CHAR('\"'); continue;
6205 case 'b': WRITE_ASCII_CHAR('\b'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006206 /* FF */
Victor Stinner62ec3312016-09-06 17:04:34 -07006207 case 'f': WRITE_ASCII_CHAR('\014'); continue;
6208 case 't': WRITE_ASCII_CHAR('\t'); continue;
6209 case 'n': WRITE_ASCII_CHAR('\n'); continue;
6210 case 'r': WRITE_ASCII_CHAR('\r'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006211 /* VT */
Victor Stinner62ec3312016-09-06 17:04:34 -07006212 case 'v': WRITE_ASCII_CHAR('\013'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006213 /* BEL, not classic C */
Victor Stinner62ec3312016-09-06 17:04:34 -07006214 case 'a': WRITE_ASCII_CHAR('\007'); continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006215
Benjamin Peterson29060642009-01-31 22:14:21 +00006216 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006217 case '0': case '1': case '2': case '3':
6218 case '4': case '5': case '6': case '7':
Victor Stinner62ec3312016-09-06 17:04:34 -07006219 ch = c - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00006220 if (s < end && '0' <= *s && *s <= '7') {
Victor Stinner62ec3312016-09-06 17:04:34 -07006221 ch = (ch<<3) + *s++ - '0';
6222 if (s < end && '0' <= *s && *s <= '7') {
6223 ch = (ch<<3) + *s++ - '0';
6224 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006225 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006226 WRITE_CHAR(ch);
6227 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006228
Benjamin Peterson29060642009-01-31 22:14:21 +00006229 /* hex escapes */
6230 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006231 case 'x':
Victor Stinner62ec3312016-09-06 17:04:34 -07006232 count = 2;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006233 message = "truncated \\xXX escape";
6234 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006235
Benjamin Peterson29060642009-01-31 22:14:21 +00006236 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006237 case 'u':
Victor Stinner62ec3312016-09-06 17:04:34 -07006238 count = 4;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006239 message = "truncated \\uXXXX escape";
6240 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006241
Benjamin Peterson29060642009-01-31 22:14:21 +00006242 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00006243 case 'U':
Victor Stinner62ec3312016-09-06 17:04:34 -07006244 count = 8;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006245 message = "truncated \\UXXXXXXXX escape";
6246 hexescape:
Victor Stinner62ec3312016-09-06 17:04:34 -07006247 for (ch = 0; count && s < end; ++s, --count) {
Serhiy Storchakad6793772013-01-29 10:20:44 +02006248 c = (unsigned char)*s;
Victor Stinner62ec3312016-09-06 17:04:34 -07006249 ch <<= 4;
6250 if (c >= '0' && c <= '9') {
6251 ch += c - '0';
6252 }
6253 else if (c >= 'a' && c <= 'f') {
6254 ch += c - ('a' - 10);
6255 }
6256 else if (c >= 'A' && c <= 'F') {
6257 ch += c - ('A' - 10);
6258 }
6259 else {
6260 break;
6261 }
Fredrik Lundhdf846752000-09-03 11:29:49 +00006262 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006263 if (count) {
Serhiy Storchakad6793772013-01-29 10:20:44 +02006264 goto error;
Victor Stinner62ec3312016-09-06 17:04:34 -07006265 }
6266
6267 /* when we get here, ch is a 32-bit unicode character */
6268 if (ch > MAX_UNICODE) {
6269 message = "illegal Unicode character";
6270 goto error;
6271 }
6272
6273 WRITE_CHAR(ch);
6274 continue;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006275
Benjamin Peterson29060642009-01-31 22:14:21 +00006276 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00006277 case 'N':
Fredrik Lundhccc74732001-02-18 22:13:49 +00006278 if (ucnhash_CAPI == NULL) {
6279 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006280 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
6281 PyUnicodeData_CAPSULE_NAME, 1);
Victor Stinner62ec3312016-09-06 17:04:34 -07006282 if (ucnhash_CAPI == NULL) {
6283 PyErr_SetString(
6284 PyExc_UnicodeError,
6285 "\\N escapes not supported (can't load unicodedata module)"
6286 );
6287 goto onError;
6288 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00006289 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006290
6291 message = "malformed \\N character escape";
Gregory P. Smith746b2d32018-11-13 13:16:54 -08006292 if (s < end && *s == '{') {
Victor Stinner62ec3312016-09-06 17:04:34 -07006293 const char *start = ++s;
6294 size_t namelen;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006295 /* look for the closing brace */
Victor Stinner62ec3312016-09-06 17:04:34 -07006296 while (s < end && *s != '}')
Fredrik Lundhccc74732001-02-18 22:13:49 +00006297 s++;
Victor Stinner62ec3312016-09-06 17:04:34 -07006298 namelen = s - start;
6299 if (namelen && s < end) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00006300 /* found a name. look it up in the unicode database */
Fredrik Lundhccc74732001-02-18 22:13:49 +00006301 s++;
Victor Stinner62ec3312016-09-06 17:04:34 -07006302 ch = 0xffffffff; /* in case 'getcode' messes up */
6303 if (namelen <= INT_MAX &&
6304 ucnhash_CAPI->getcode(NULL, start, (int)namelen,
6305 &ch, 0)) {
6306 assert(ch <= MAX_UNICODE);
6307 WRITE_CHAR(ch);
6308 continue;
6309 }
6310 message = "unknown Unicode character name";
Fredrik Lundhccc74732001-02-18 22:13:49 +00006311 }
6312 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02006313 goto error;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006314
6315 default:
Eric V. Smith42454af2016-10-31 09:22:08 -04006316 if (*first_invalid_escape == NULL) {
6317 *first_invalid_escape = s-1; /* Back up one char, since we've
6318 already incremented s. */
6319 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006320 WRITE_ASCII_CHAR('\\');
6321 WRITE_CHAR(c);
6322 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006323 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02006324
6325 error:
6326 endinpos = s-starts;
Victor Stinner62ec3312016-09-06 17:04:34 -07006327 writer.min_length = end - s + writer.pos;
Serhiy Storchaka8fe5a9f2013-01-29 10:37:39 +02006328 if (unicode_decode_call_errorhandler_writer(
Serhiy Storchakad6793772013-01-29 10:20:44 +02006329 errors, &errorHandler,
6330 "unicodeescape", message,
6331 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinner62ec3312016-09-06 17:04:34 -07006332 &writer)) {
Serhiy Storchakad6793772013-01-29 10:20:44 +02006333 goto onError;
Victor Stinner62ec3312016-09-06 17:04:34 -07006334 }
Serhiy Storchakab7e2d672018-02-13 08:27:33 +02006335 assert(end - s <= writer.size - writer.pos);
Victor Stinner62ec3312016-09-06 17:04:34 -07006336
6337#undef WRITE_ASCII_CHAR
6338#undef WRITE_CHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00006339 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006340
Walter Dörwaldd4ade082003-08-15 15:00:26 +00006341 Py_XDECREF(errorHandler);
6342 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006343 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald8c077222002-03-25 11:16:18 +00006344
Benjamin Peterson29060642009-01-31 22:14:21 +00006345 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006346 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006347 Py_XDECREF(errorHandler);
6348 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006349 return NULL;
6350}
6351
Eric V. Smith42454af2016-10-31 09:22:08 -04006352PyObject *
6353PyUnicode_DecodeUnicodeEscape(const char *s,
6354 Py_ssize_t size,
6355 const char *errors)
6356{
6357 const char *first_invalid_escape;
6358 PyObject *result = _PyUnicode_DecodeUnicodeEscape(s, size, errors,
6359 &first_invalid_escape);
6360 if (result == NULL)
6361 return NULL;
6362 if (first_invalid_escape != NULL) {
6363 if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
6364 "invalid escape sequence '\\%c'",
Serhiy Storchaka56cb4652017-10-20 17:08:15 +03006365 (unsigned char)*first_invalid_escape) < 0) {
Eric V. Smith42454af2016-10-31 09:22:08 -04006366 Py_DECREF(result);
6367 return NULL;
6368 }
6369 }
6370 return result;
6371}
6372
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006373/* Return a Unicode-Escape string version of the Unicode object. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006374
Alexander Belopolsky40018472011-02-26 01:02:56 +00006375PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006376PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006377{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006378 Py_ssize_t i, len;
Victor Stinner62ec3312016-09-06 17:04:34 -07006379 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006380 char *p;
Victor Stinner62ec3312016-09-06 17:04:34 -07006381 enum PyUnicode_Kind kind;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006382 void *data;
Victor Stinner62ec3312016-09-06 17:04:34 -07006383 Py_ssize_t expandsize;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006384
Ezio Melottie7f90372012-10-05 03:33:31 +03006385 /* Initial allocation is based on the longest-possible character
Thomas Wouters89f507f2006-12-13 04:49:30 +00006386 escape.
6387
Ezio Melottie7f90372012-10-05 03:33:31 +03006388 For UCS1 strings it's '\xxx', 4 bytes per source character.
6389 For UCS2 strings it's '\uxxxx', 6 bytes per source character.
6390 For UCS4 strings it's '\U00xxxxxx', 10 bytes per source character.
Thomas Wouters89f507f2006-12-13 04:49:30 +00006391 */
6392
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006393 if (!PyUnicode_Check(unicode)) {
6394 PyErr_BadArgument();
6395 return NULL;
6396 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006397 if (PyUnicode_READY(unicode) == -1) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006398 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006399 }
Victor Stinner358af132015-10-12 22:36:57 +02006400
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006401 len = PyUnicode_GET_LENGTH(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006402 if (len == 0) {
6403 return PyBytes_FromStringAndSize(NULL, 0);
6404 }
6405
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006406 kind = PyUnicode_KIND(unicode);
6407 data = PyUnicode_DATA(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006408 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6409 bytes, and 1 byte characters 4. */
6410 expandsize = kind * 2 + 2;
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006411 if (len > PY_SSIZE_T_MAX / expandsize) {
Victor Stinner62ec3312016-09-06 17:04:34 -07006412 return PyErr_NoMemory();
6413 }
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006414 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
Victor Stinner62ec3312016-09-06 17:04:34 -07006415 if (repr == NULL) {
6416 return NULL;
6417 }
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006418
Victor Stinner62ec3312016-09-06 17:04:34 -07006419 p = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006420 for (i = 0; i < len; i++) {
Victor Stinner3326cb62011-11-10 20:15:25 +01006421 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006422
Victor Stinner62ec3312016-09-06 17:04:34 -07006423 /* U+0000-U+00ff range */
6424 if (ch < 0x100) {
6425 if (ch >= ' ' && ch < 127) {
6426 if (ch != '\\') {
6427 /* Copy printable US ASCII as-is */
6428 *p++ = (char) ch;
6429 }
6430 /* Escape backslashes */
6431 else {
6432 *p++ = '\\';
6433 *p++ = '\\';
6434 }
6435 }
Victor Stinner358af132015-10-12 22:36:57 +02006436
Victor Stinner62ec3312016-09-06 17:04:34 -07006437 /* Map special whitespace to '\t', \n', '\r' */
6438 else if (ch == '\t') {
6439 *p++ = '\\';
6440 *p++ = 't';
6441 }
6442 else if (ch == '\n') {
6443 *p++ = '\\';
6444 *p++ = 'n';
6445 }
6446 else if (ch == '\r') {
6447 *p++ = '\\';
6448 *p++ = 'r';
6449 }
6450
6451 /* Map non-printable US ASCII and 8-bit characters to '\xHH' */
6452 else {
6453 *p++ = '\\';
6454 *p++ = 'x';
6455 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6456 *p++ = Py_hexdigits[ch & 0x000F];
6457 }
Tim Petersced69f82003-09-16 20:30:58 +00006458 }
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006459 /* U+0100-U+ffff range: Map 16-bit characters to '\uHHHH' */
Victor Stinner62ec3312016-09-06 17:04:34 -07006460 else if (ch < 0x10000) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006461 *p++ = '\\';
6462 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006463 *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
6464 *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
6465 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6466 *p++ = Py_hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006467 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006468 /* U+010000-U+10ffff range: Map 21-bit characters to '\U00HHHHHH' */
6469 else {
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006470
Victor Stinner62ec3312016-09-06 17:04:34 -07006471 /* Make sure that the first two digits are zero */
6472 assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006473 *p++ = '\\';
Victor Stinner62ec3312016-09-06 17:04:34 -07006474 *p++ = 'U';
6475 *p++ = '0';
6476 *p++ = '0';
6477 *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
6478 *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
6479 *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
6480 *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
6481 *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
6482 *p++ = Py_hexdigits[ch & 0x0000000F];
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006483 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006484 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006485
Victor Stinner62ec3312016-09-06 17:04:34 -07006486 assert(p - PyBytes_AS_STRING(repr) > 0);
6487 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) {
6488 return NULL;
6489 }
6490 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006491}
6492
Alexander Belopolsky40018472011-02-26 01:02:56 +00006493PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006494PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
6495 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006496{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006497 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02006498 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Victor Stinner62ec3312016-09-06 17:04:34 -07006499 if (tmp == NULL) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006500 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006501 }
6502
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006503 result = PyUnicode_AsUnicodeEscapeString(tmp);
6504 Py_DECREF(tmp);
6505 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006506}
6507
6508/* --- Raw Unicode Escape Codec ------------------------------------------- */
6509
Alexander Belopolsky40018472011-02-26 01:02:56 +00006510PyObject *
6511PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006512 Py_ssize_t size,
6513 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006514{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006515 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006516 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006517 const char *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006518 PyObject *errorHandler = NULL;
6519 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006520
Victor Stinner62ec3312016-09-06 17:04:34 -07006521 if (size == 0) {
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02006522 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner62ec3312016-09-06 17:04:34 -07006523 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006524
Guido van Rossumd57fd912000-03-10 22:53:23 +00006525 /* Escaped strings will always be longer than the resulting
6526 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006527 length after conversion to the true value. (But decoding error
6528 handler might have to resize the string) */
Victor Stinner8f674cc2013-04-17 23:02:17 +02006529 _PyUnicodeWriter_Init(&writer);
Inada Naoki770847a2019-06-24 12:30:24 +09006530 writer.min_length = size;
Victor Stinner62ec3312016-09-06 17:04:34 -07006531 if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
6532 goto onError;
6533 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006534
Guido van Rossumd57fd912000-03-10 22:53:23 +00006535 end = s + size;
6536 while (s < end) {
Victor Stinner62ec3312016-09-06 17:04:34 -07006537 unsigned char c = (unsigned char) *s++;
6538 Py_UCS4 ch;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006539 int count;
Victor Stinner62ec3312016-09-06 17:04:34 -07006540 Py_ssize_t startinpos;
6541 Py_ssize_t endinpos;
6542 const char *message;
6543
6544#define WRITE_CHAR(ch) \
6545 do { \
6546 if (ch <= writer.maxchar) { \
6547 assert(writer.pos < writer.size); \
6548 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6549 } \
6550 else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
6551 goto onError; \
6552 } \
6553 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006554
Benjamin Peterson29060642009-01-31 22:14:21 +00006555 /* Non-escape characters are interpreted as Unicode ordinals */
Victor Stinner62ec3312016-09-06 17:04:34 -07006556 if (c != '\\' || s >= end) {
6557 WRITE_CHAR(c);
Benjamin Peterson29060642009-01-31 22:14:21 +00006558 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006559 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006560
Victor Stinner62ec3312016-09-06 17:04:34 -07006561 c = (unsigned char) *s++;
6562 if (c == 'u') {
6563 count = 4;
6564 message = "truncated \\uXXXX escape";
Benjamin Peterson29060642009-01-31 22:14:21 +00006565 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006566 else if (c == 'U') {
6567 count = 8;
6568 message = "truncated \\UXXXXXXXX escape";
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006569 }
6570 else {
Victor Stinner62ec3312016-09-06 17:04:34 -07006571 assert(writer.pos < writer.size);
6572 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, '\\');
6573 WRITE_CHAR(c);
6574 continue;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006575 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006576 startinpos = s - starts - 2;
6577
6578 /* \uHHHH with 4 hex digits, \U00HHHHHH with 8 */
6579 for (ch = 0; count && s < end; ++s, --count) {
6580 c = (unsigned char)*s;
6581 ch <<= 4;
6582 if (c >= '0' && c <= '9') {
6583 ch += c - '0';
6584 }
6585 else if (c >= 'a' && c <= 'f') {
6586 ch += c - ('a' - 10);
6587 }
6588 else if (c >= 'A' && c <= 'F') {
6589 ch += c - ('A' - 10);
6590 }
6591 else {
6592 break;
6593 }
6594 }
6595 if (!count) {
6596 if (ch <= MAX_UNICODE) {
6597 WRITE_CHAR(ch);
6598 continue;
6599 }
6600 message = "\\Uxxxxxxxx out of range";
6601 }
6602
6603 endinpos = s-starts;
6604 writer.min_length = end - s + writer.pos;
6605 if (unicode_decode_call_errorhandler_writer(
6606 errors, &errorHandler,
6607 "rawunicodeescape", message,
6608 &starts, &end, &startinpos, &endinpos, &exc, &s,
6609 &writer)) {
6610 goto onError;
6611 }
Serhiy Storchakab7e2d672018-02-13 08:27:33 +02006612 assert(end - s <= writer.size - writer.pos);
Victor Stinner62ec3312016-09-06 17:04:34 -07006613
6614#undef WRITE_CHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00006615 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006616 Py_XDECREF(errorHandler);
6617 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006618 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00006619
Benjamin Peterson29060642009-01-31 22:14:21 +00006620 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006621 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006622 Py_XDECREF(errorHandler);
6623 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006624 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006625
Guido van Rossumd57fd912000-03-10 22:53:23 +00006626}
6627
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006628
Alexander Belopolsky40018472011-02-26 01:02:56 +00006629PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006630PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006631{
Victor Stinner62ec3312016-09-06 17:04:34 -07006632 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006633 char *p;
Victor Stinner62ec3312016-09-06 17:04:34 -07006634 Py_ssize_t expandsize, pos;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006635 int kind;
6636 void *data;
6637 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006638
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006639 if (!PyUnicode_Check(unicode)) {
6640 PyErr_BadArgument();
6641 return NULL;
6642 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006643 if (PyUnicode_READY(unicode) == -1) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006644 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006645 }
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006646 kind = PyUnicode_KIND(unicode);
6647 data = PyUnicode_DATA(unicode);
6648 len = PyUnicode_GET_LENGTH(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006649 if (kind == PyUnicode_1BYTE_KIND) {
6650 return PyBytes_FromStringAndSize(data, len);
6651 }
Victor Stinner0e368262011-11-10 20:12:49 +01006652
Victor Stinner62ec3312016-09-06 17:04:34 -07006653 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6654 bytes, and 1 byte characters 4. */
6655 expandsize = kind * 2 + 2;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006656
Victor Stinner62ec3312016-09-06 17:04:34 -07006657 if (len > PY_SSIZE_T_MAX / expandsize) {
6658 return PyErr_NoMemory();
6659 }
6660 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
6661 if (repr == NULL) {
6662 return NULL;
6663 }
6664 if (len == 0) {
6665 return repr;
6666 }
6667
6668 p = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006669 for (pos = 0; pos < len; pos++) {
6670 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Victor Stinner358af132015-10-12 22:36:57 +02006671
Victor Stinner62ec3312016-09-06 17:04:34 -07006672 /* U+0000-U+00ff range: Copy 8-bit characters as-is */
6673 if (ch < 0x100) {
6674 *p++ = (char) ch;
Tim Petersced69f82003-09-16 20:30:58 +00006675 }
Xiang Zhang2b77a922018-02-13 18:33:32 +08006676 /* U+0100-U+ffff range: Map 16-bit characters to '\uHHHH' */
Victor Stinner62ec3312016-09-06 17:04:34 -07006677 else if (ch < 0x10000) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006678 *p++ = '\\';
6679 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006680 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6681 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6682 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6683 *p++ = Py_hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006684 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006685 /* U+010000-U+10ffff range: Map 32-bit characters to '\U00HHHHHH' */
6686 else {
6687 assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
6688 *p++ = '\\';
6689 *p++ = 'U';
6690 *p++ = '0';
6691 *p++ = '0';
6692 *p++ = Py_hexdigits[(ch >> 20) & 0xf];
6693 *p++ = Py_hexdigits[(ch >> 16) & 0xf];
6694 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6695 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6696 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6697 *p++ = Py_hexdigits[ch & 15];
6698 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006699 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006700
Victor Stinner62ec3312016-09-06 17:04:34 -07006701 assert(p > PyBytes_AS_STRING(repr));
6702 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) {
6703 return NULL;
6704 }
6705 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006706}
6707
Alexander Belopolsky40018472011-02-26 01:02:56 +00006708PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006709PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
6710 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006711{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006712 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02006713 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006714 if (tmp == NULL)
Walter Dörwald711005d2007-05-12 12:03:26 +00006715 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006716 result = PyUnicode_AsRawUnicodeEscapeString(tmp);
6717 Py_DECREF(tmp);
6718 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006719}
6720
6721/* --- Latin-1 Codec ------------------------------------------------------ */
6722
Alexander Belopolsky40018472011-02-26 01:02:56 +00006723PyObject *
6724PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006725 Py_ssize_t size,
6726 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006727{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006728 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Victor Stinnere57b1c02011-09-28 22:20:48 +02006729 return _PyUnicode_FromUCS1((unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006730}
6731
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006732/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006733static void
6734make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006735 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006736 PyObject *unicode,
6737 Py_ssize_t startpos, Py_ssize_t endpos,
6738 const char *reason)
6739{
6740 if (*exceptionObject == NULL) {
6741 *exceptionObject = PyObject_CallFunction(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006742 PyExc_UnicodeEncodeError, "sOnns",
Martin v. Löwis9e816682011-11-02 12:45:42 +01006743 encoding, unicode, startpos, endpos, reason);
6744 }
6745 else {
6746 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6747 goto onError;
6748 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6749 goto onError;
6750 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6751 goto onError;
6752 return;
6753 onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02006754 Py_CLEAR(*exceptionObject);
Martin v. Löwis9e816682011-11-02 12:45:42 +01006755 }
6756}
6757
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006758/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006759static void
6760raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006761 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006762 PyObject *unicode,
6763 Py_ssize_t startpos, Py_ssize_t endpos,
6764 const char *reason)
6765{
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006766 make_encode_exception(exceptionObject,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006767 encoding, unicode, startpos, endpos, reason);
6768 if (*exceptionObject != NULL)
6769 PyCodec_StrictErrors(*exceptionObject);
6770}
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006771
6772/* error handling callback helper:
6773 build arguments, call the callback and check the arguments,
6774 put the result into newpos and return the replacement string, which
6775 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006776static PyObject *
6777unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006778 PyObject **errorHandler,
6779 const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006780 PyObject *unicode, PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006781 Py_ssize_t startpos, Py_ssize_t endpos,
6782 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006783{
Serhiy Storchaka2d06e842015-12-25 19:53:18 +02006784 static const char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006785 Py_ssize_t len;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006786 PyObject *restuple;
6787 PyObject *resunicode;
6788
6789 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006790 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006791 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006792 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006793 }
6794
Benjamin Petersonbac79492012-01-14 13:34:47 -05006795 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006796 return NULL;
6797 len = PyUnicode_GET_LENGTH(unicode);
6798
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006799 make_encode_exception(exceptionObject,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006800 encoding, unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006801 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006802 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006803
Jeroen Demeyer196a5302019-07-04 12:31:34 +02006804 restuple = _PyObject_CallOneArg(*errorHandler, *exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006805 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006806 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006807 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006808 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00006809 Py_DECREF(restuple);
6810 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006811 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006812 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00006813 &resunicode, newpos)) {
6814 Py_DECREF(restuple);
6815 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006816 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006817 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6818 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6819 Py_DECREF(restuple);
6820 return NULL;
6821 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006822 if (*newpos<0)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006823 *newpos = len + *newpos;
6824 if (*newpos<0 || *newpos>len) {
Victor Stinnera33bce02014-07-04 22:47:46 +02006825 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00006826 Py_DECREF(restuple);
6827 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006828 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006829 Py_INCREF(resunicode);
6830 Py_DECREF(restuple);
6831 return resunicode;
6832}
6833
Alexander Belopolsky40018472011-02-26 01:02:56 +00006834static PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006835unicode_encode_ucs1(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006836 const char *errors,
Victor Stinner0030cd52015-09-24 14:45:00 +02006837 const Py_UCS4 limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006838{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006839 /* input state */
6840 Py_ssize_t pos=0, size;
6841 int kind;
6842 void *data;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006843 /* pointer into the output */
6844 char *str;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006845 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6846 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Victor Stinner50149202015-09-22 00:26:54 +02006847 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006848 PyObject *exc = NULL;
Victor Stinner50149202015-09-22 00:26:54 +02006849 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Victor Stinner6bd525b2015-10-09 13:10:05 +02006850 PyObject *rep = NULL;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006851 /* output object */
6852 _PyBytesWriter writer;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006853
Benjamin Petersonbac79492012-01-14 13:34:47 -05006854 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006855 return NULL;
6856 size = PyUnicode_GET_LENGTH(unicode);
6857 kind = PyUnicode_KIND(unicode);
6858 data = PyUnicode_DATA(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006859 /* allocate enough for a simple encoding without
6860 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00006861 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00006862 return PyBytes_FromStringAndSize(NULL, 0);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006863
6864 _PyBytesWriter_Init(&writer);
6865 str = _PyBytesWriter_Alloc(&writer, size);
6866 if (str == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00006867 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006868
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006869 while (pos < size) {
Victor Stinner0030cd52015-09-24 14:45:00 +02006870 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006871
Benjamin Peterson29060642009-01-31 22:14:21 +00006872 /* can we encode this? */
Victor Stinner0030cd52015-09-24 14:45:00 +02006873 if (ch < limit) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006874 /* no overflow check, because we know that the space is enough */
Victor Stinner0030cd52015-09-24 14:45:00 +02006875 *str++ = (char)ch;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006876 ++pos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006877 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006878 else {
Victor Stinner6bd525b2015-10-09 13:10:05 +02006879 Py_ssize_t newpos, i;
Benjamin Peterson29060642009-01-31 22:14:21 +00006880 /* startpos for collecting unencodable chars */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006881 Py_ssize_t collstart = pos;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006882 Py_ssize_t collend = collstart + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006883 /* find all unecodable characters */
Victor Stinner50149202015-09-22 00:26:54 +02006884
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006885 while ((collend < size) && (PyUnicode_READ(kind, data, collend) >= limit))
Benjamin Peterson29060642009-01-31 22:14:21 +00006886 ++collend;
Victor Stinner50149202015-09-22 00:26:54 +02006887
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006888 /* Only overallocate the buffer if it's not the last write */
6889 writer.overallocate = (collend < size);
6890
Benjamin Peterson29060642009-01-31 22:14:21 +00006891 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
Victor Stinner50149202015-09-22 00:26:54 +02006892 if (error_handler == _Py_ERROR_UNKNOWN)
Victor Stinner3d4226a2018-08-29 22:21:32 +02006893 error_handler = _Py_GetErrorHandler(errors);
Victor Stinner50149202015-09-22 00:26:54 +02006894
6895 switch (error_handler) {
6896 case _Py_ERROR_STRICT:
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006897 raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006898 goto onError;
Victor Stinner50149202015-09-22 00:26:54 +02006899
6900 case _Py_ERROR_REPLACE:
Victor Stinner01ada392015-10-01 21:54:51 +02006901 memset(str, '?', collend - collstart);
6902 str += (collend - collstart);
Stefan Krahf432a322017-08-21 13:09:59 +02006903 /* fall through */
Victor Stinner50149202015-09-22 00:26:54 +02006904 case _Py_ERROR_IGNORE:
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006905 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006906 break;
Victor Stinner50149202015-09-22 00:26:54 +02006907
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006908 case _Py_ERROR_BACKSLASHREPLACE:
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07006909 /* subtract preallocated bytes */
Victor Stinnerad771582015-10-09 12:38:53 +02006910 writer.min_size -= (collend - collstart);
6911 str = backslashreplace(&writer, str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006912 unicode, collstart, collend);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006913 if (str == NULL)
6914 goto onError;
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006915 pos = collend;
6916 break;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006917
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006918 case _Py_ERROR_XMLCHARREFREPLACE:
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07006919 /* subtract preallocated bytes */
Victor Stinnerad771582015-10-09 12:38:53 +02006920 writer.min_size -= (collend - collstart);
6921 str = xmlcharrefreplace(&writer, str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006922 unicode, collstart, collend);
6923 if (str == NULL)
6924 goto onError;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006925 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006926 break;
Victor Stinner50149202015-09-22 00:26:54 +02006927
Victor Stinnerc3713e92015-09-29 12:32:13 +02006928 case _Py_ERROR_SURROGATEESCAPE:
6929 for (i = collstart; i < collend; ++i) {
6930 ch = PyUnicode_READ(kind, data, i);
6931 if (ch < 0xdc80 || 0xdcff < ch) {
6932 /* Not a UTF-8b surrogate */
6933 break;
6934 }
6935 *str++ = (char)(ch - 0xdc00);
6936 ++pos;
6937 }
6938 if (i >= collend)
6939 break;
6940 collstart = pos;
6941 assert(collstart != collend);
Stefan Krahf432a322017-08-21 13:09:59 +02006942 /* fall through */
Victor Stinnerc3713e92015-09-29 12:32:13 +02006943
Benjamin Peterson29060642009-01-31 22:14:21 +00006944 default:
Victor Stinner6bd525b2015-10-09 13:10:05 +02006945 rep = unicode_encode_call_errorhandler(errors, &error_handler_obj,
6946 encoding, reason, unicode, &exc,
6947 collstart, collend, &newpos);
6948 if (rep == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006949 goto onError;
Victor Stinner0030cd52015-09-24 14:45:00 +02006950
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07006951 /* subtract preallocated bytes */
Xiang Zhangd04d8472016-11-23 19:34:01 +08006952 writer.min_size -= newpos - collstart;
Victor Stinnerad771582015-10-09 12:38:53 +02006953
Victor Stinner6bd525b2015-10-09 13:10:05 +02006954 if (PyBytes_Check(rep)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00006955 /* Directly copy bytes result to output. */
Victor Stinnerce179bf2015-10-09 12:57:22 +02006956 str = _PyBytesWriter_WriteBytes(&writer, str,
Victor Stinner6bd525b2015-10-09 13:10:05 +02006957 PyBytes_AS_STRING(rep),
6958 PyBytes_GET_SIZE(rep));
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006959 }
Victor Stinner6bd525b2015-10-09 13:10:05 +02006960 else {
6961 assert(PyUnicode_Check(rep));
Victor Stinner0030cd52015-09-24 14:45:00 +02006962
Victor Stinner6bd525b2015-10-09 13:10:05 +02006963 if (PyUnicode_READY(rep) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006964 goto onError;
Victor Stinner6bd525b2015-10-09 13:10:05 +02006965
Serhiy Storchaka99250d52016-11-23 15:13:00 +02006966 if (limit == 256 ?
6967 PyUnicode_KIND(rep) != PyUnicode_1BYTE_KIND :
6968 !PyUnicode_IS_ASCII(rep))
6969 {
6970 /* Not all characters are smaller than limit */
6971 raise_encode_exception(&exc, encoding, unicode,
6972 collstart, collend, reason);
6973 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00006974 }
Serhiy Storchaka99250d52016-11-23 15:13:00 +02006975 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
6976 str = _PyBytesWriter_WriteBytes(&writer, str,
6977 PyUnicode_DATA(rep),
6978 PyUnicode_GET_LENGTH(rep));
Benjamin Peterson29060642009-01-31 22:14:21 +00006979 }
Alexey Izbyshev74a307d2018-08-19 21:52:04 +03006980 if (str == NULL)
6981 goto onError;
6982
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006983 pos = newpos;
Victor Stinner6bd525b2015-10-09 13:10:05 +02006984 Py_CLEAR(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006985 }
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006986
6987 /* If overallocation was disabled, ensure that it was the last
6988 write. Otherwise, we missed an optimization */
6989 assert(writer.overallocate || pos == size);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006990 }
6991 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006992
Victor Stinner50149202015-09-22 00:26:54 +02006993 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006994 Py_XDECREF(exc);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006995 return _PyBytesWriter_Finish(&writer, str);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006996
6997 onError:
Victor Stinner6bd525b2015-10-09 13:10:05 +02006998 Py_XDECREF(rep);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006999 _PyBytesWriter_Dealloc(&writer);
Victor Stinner50149202015-09-22 00:26:54 +02007000 Py_XDECREF(error_handler_obj);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00007001 Py_XDECREF(exc);
7002 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007003}
7004
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007005/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007006PyObject *
7007PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03007008 Py_ssize_t size,
7009 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007010{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007011 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02007012 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007013 if (unicode == NULL)
7014 return NULL;
7015 result = unicode_encode_ucs1(unicode, errors, 256);
7016 Py_DECREF(unicode);
7017 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007018}
7019
Alexander Belopolsky40018472011-02-26 01:02:56 +00007020PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007021_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007022{
7023 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007024 PyErr_BadArgument();
7025 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007026 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007027 if (PyUnicode_READY(unicode) == -1)
7028 return NULL;
7029 /* Fast path: if it is a one-byte string, construct
7030 bytes object directly. */
7031 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
7032 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
7033 PyUnicode_GET_LENGTH(unicode));
7034 /* Non-Latin-1 characters present. Defer to above function to
7035 raise the exception. */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007036 return unicode_encode_ucs1(unicode, errors, 256);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007037}
7038
7039PyObject*
7040PyUnicode_AsLatin1String(PyObject *unicode)
7041{
7042 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007043}
7044
7045/* --- 7-bit ASCII Codec -------------------------------------------------- */
7046
Alexander Belopolsky40018472011-02-26 01:02:56 +00007047PyObject *
7048PyUnicode_DecodeASCII(const char *s,
7049 Py_ssize_t size,
7050 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007051{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007052 const char *starts = s;
Inada Naoki770847a2019-06-24 12:30:24 +09007053 const char *e = s + size;
Victor Stinnerf96418d2015-09-21 23:06:27 +02007054 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007055 PyObject *exc = NULL;
Victor Stinnerf96418d2015-09-21 23:06:27 +02007056 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Tim Petersced69f82003-09-16 20:30:58 +00007057
Guido van Rossumd57fd912000-03-10 22:53:23 +00007058 if (size == 0)
Serhiy Storchaka678db842013-01-26 12:16:36 +02007059 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01007060
Guido van Rossumd57fd912000-03-10 22:53:23 +00007061 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Victor Stinner702c7342011-10-05 13:50:52 +02007062 if (size == 1 && (unsigned char)s[0] < 128)
7063 return get_latin1_char((unsigned char)s[0]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007064
Inada Naoki770847a2019-06-24 12:30:24 +09007065 // Shortcut for simple case
7066 PyObject *u = PyUnicode_New(size, 127);
7067 if (u == NULL) {
Victor Stinner8f674cc2013-04-17 23:02:17 +02007068 return NULL;
Inada Naoki770847a2019-06-24 12:30:24 +09007069 }
7070 Py_ssize_t outpos = ascii_decode(s, e, PyUnicode_DATA(u));
7071 if (outpos == size) {
7072 return u;
7073 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02007074
Inada Naoki770847a2019-06-24 12:30:24 +09007075 _PyUnicodeWriter writer;
7076 _PyUnicodeWriter_InitWithBuffer(&writer, u);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007077 writer.pos = outpos;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02007078
Inada Naoki770847a2019-06-24 12:30:24 +09007079 s += outpos;
7080 int kind = writer.kind;
7081 void *data = writer.data;
7082 Py_ssize_t startinpos, endinpos;
7083
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007084 while (s < e) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02007085 unsigned char c = (unsigned char)*s;
Benjamin Peterson29060642009-01-31 22:14:21 +00007086 if (c < 128) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007087 PyUnicode_WRITE(kind, data, writer.pos, c);
7088 writer.pos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00007089 ++s;
Victor Stinnerf96418d2015-09-21 23:06:27 +02007090 continue;
Benjamin Peterson29060642009-01-31 22:14:21 +00007091 }
Victor Stinnerf96418d2015-09-21 23:06:27 +02007092
7093 /* byte outsize range 0x00..0x7f: call the error handler */
7094
7095 if (error_handler == _Py_ERROR_UNKNOWN)
Victor Stinner3d4226a2018-08-29 22:21:32 +02007096 error_handler = _Py_GetErrorHandler(errors);
Victor Stinnerf96418d2015-09-21 23:06:27 +02007097
7098 switch (error_handler)
7099 {
7100 case _Py_ERROR_REPLACE:
7101 case _Py_ERROR_SURROGATEESCAPE:
7102 /* Fast-path: the error handler only writes one character,
Victor Stinnerca9381e2015-09-22 00:58:32 +02007103 but we may switch to UCS2 at the first write */
7104 if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
7105 goto onError;
7106 kind = writer.kind;
7107 data = writer.data;
Victor Stinnerf96418d2015-09-21 23:06:27 +02007108
7109 if (error_handler == _Py_ERROR_REPLACE)
7110 PyUnicode_WRITE(kind, data, writer.pos, 0xfffd);
7111 else
7112 PyUnicode_WRITE(kind, data, writer.pos, c + 0xdc00);
7113 writer.pos++;
7114 ++s;
7115 break;
7116
7117 case _Py_ERROR_IGNORE:
7118 ++s;
7119 break;
7120
7121 default:
Benjamin Peterson29060642009-01-31 22:14:21 +00007122 startinpos = s-starts;
7123 endinpos = startinpos + 1;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007124 if (unicode_decode_call_errorhandler_writer(
Victor Stinnerf96418d2015-09-21 23:06:27 +02007125 errors, &error_handler_obj,
Benjamin Peterson29060642009-01-31 22:14:21 +00007126 "ascii", "ordinal not in range(128)",
7127 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007128 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00007129 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007130 kind = writer.kind;
7131 data = writer.data;
Benjamin Peterson29060642009-01-31 22:14:21 +00007132 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007133 }
Victor Stinnerf96418d2015-09-21 23:06:27 +02007134 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007135 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007136 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00007137
Benjamin Peterson29060642009-01-31 22:14:21 +00007138 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007139 _PyUnicodeWriter_Dealloc(&writer);
Victor Stinnerf96418d2015-09-21 23:06:27 +02007140 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007141 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007142 return NULL;
7143}
7144
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007145/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007146PyObject *
7147PyUnicode_EncodeASCII(const Py_UNICODE *p,
7148 Py_ssize_t size,
7149 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007150{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007151 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02007152 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007153 if (unicode == NULL)
7154 return NULL;
7155 result = unicode_encode_ucs1(unicode, errors, 128);
7156 Py_DECREF(unicode);
7157 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007158}
7159
Alexander Belopolsky40018472011-02-26 01:02:56 +00007160PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007161_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007162{
7163 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007164 PyErr_BadArgument();
7165 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007166 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007167 if (PyUnicode_READY(unicode) == -1)
7168 return NULL;
7169 /* Fast path: if it is an ASCII-only string, construct bytes object
7170 directly. Else defer to above function to raise the exception. */
Victor Stinneraf037572013-04-14 18:44:10 +02007171 if (PyUnicode_IS_ASCII(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007172 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
7173 PyUnicode_GET_LENGTH(unicode));
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007174 return unicode_encode_ucs1(unicode, errors, 128);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007175}
7176
7177PyObject *
7178PyUnicode_AsASCIIString(PyObject *unicode)
7179{
7180 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007181}
7182
Steve Dowercc16be82016-09-08 10:35:16 -07007183#ifdef MS_WINDOWS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007184
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007185/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007186
/* NEED_RETRY is defined when int is narrower than size_t.  In that case
   decode_code_page_stateful() splits inputs larger than
   DECODING_CHUNK_SIZE into several chunks. */
#if SIZEOF_INT < SIZEOF_SIZE_T
#define NEED_RETRY
#endif

/* INT_MAX is the theoretical largest chunk (or INT_MAX / 2 when
   transcoding from UTF-16), but INT_MAX / 4 performs better in
   both cases and also avoids partial characters overrunning the
   length limit in MultiByteToWideChar on Windows */
#define DECODING_CHUNK_SIZE (INT_MAX/4)

/* Fallback definition for headers that do not provide this flag. */
#ifndef WC_ERR_INVALID_CHARS
# define WC_ERR_INVALID_CHARS 0x0080
#endif
7200
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007201static const char*
Victor Stinner3a50e702011-10-18 21:21:00 +02007202code_page_name(UINT code_page, PyObject **obj)
7203{
7204 *obj = NULL;
7205 if (code_page == CP_ACP)
7206 return "mbcs";
7207 if (code_page == CP_UTF7)
7208 return "CP_UTF7";
7209 if (code_page == CP_UTF8)
7210 return "CP_UTF8";
7211
7212 *obj = PyBytes_FromFormat("cp%u", code_page);
7213 if (*obj == NULL)
7214 return NULL;
7215 return PyBytes_AS_STRING(*obj);
7216}
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007217
Victor Stinner3a50e702011-10-18 21:21:00 +02007218static DWORD
7219decode_code_page_flags(UINT code_page)
7220{
7221 if (code_page == CP_UTF7) {
7222 /* The CP_UTF7 decoder only supports flags=0 */
7223 return 0;
7224 }
7225 else
7226 return MB_ERR_INVALID_CHARS;
7227}
7228
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007229/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007230 * Decode a byte string from a Windows code page into unicode object in strict
7231 * mode.
7232 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007233 * Returns consumed size if succeed, returns -2 on decode error, or raise an
7234 * OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007235 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007236static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007237decode_code_page_strict(UINT code_page,
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007238 wchar_t **buf,
7239 Py_ssize_t *bufsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02007240 const char *in,
7241 int insize)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007242{
Serhiy Storchakac1e2c282019-03-20 21:45:18 +02007243 DWORD flags = MB_ERR_INVALID_CHARS;
Victor Stinner24729f32011-11-10 20:31:37 +01007244 wchar_t *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007245 DWORD outsize;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007246
7247 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007248 assert(insize > 0);
Serhiy Storchakac1e2c282019-03-20 21:45:18 +02007249 while ((outsize = MultiByteToWideChar(code_page, flags,
7250 in, insize, NULL, 0)) <= 0)
7251 {
7252 if (!flags || GetLastError() != ERROR_INVALID_FLAGS) {
7253 goto error;
7254 }
7255 /* For some code pages (e.g. UTF-7) flags must be set to 0. */
7256 flags = 0;
7257 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007258
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007259 /* Extend a wchar_t* buffer */
7260 Py_ssize_t n = *bufsize; /* Get the current length */
7261 if (widechar_resize(buf, bufsize, n + outsize) < 0) {
7262 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007263 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007264 out = *buf + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007265
7266 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007267 outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
7268 if (outsize <= 0)
7269 goto error;
7270 return insize;
Victor Stinner554f3f02010-06-16 23:33:54 +00007271
Victor Stinner3a50e702011-10-18 21:21:00 +02007272error:
7273 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7274 return -2;
7275 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007276 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007277}
7278
/*
 * Decode a byte string from a code page into a wchar_t buffer, one
 * character at a time, applying an error handler to undecodable input.
 *
 * Parameters:
 *   code_page    Windows code page to decode from.
 *   buf/bufsize  Output buffer and its current length; decoded characters
 *                are appended and *bufsize is updated to the new length.
 *   in/size      Input bytes (size must be > 0).
 *   errors       Error handler name; NULL is treated like "strict".
 *   final        If zero, an undecodable sequence reaching the end of the
 *                input stops decoding instead of invoking the handler.
 *
 * Returns the consumed size on success, or raises an OSError or
 * UnicodeDecodeError exception and returns -1 on error.
 */
static int
decode_code_page_errors(UINT code_page,
                        wchar_t **buf,
                        Py_ssize_t *bufsize,
                        const char *in, const int size,
                        const char *errors, int final)
{
    const char *startin = in;
    const char *endin = in + size;
    DWORD flags = MB_ERR_INVALID_CHARS;
    /* Ideally, we should get reason from FormatMessage. This is the Windows
       2000 English version of the message. */
    const char *reason = "No mapping for the Unicode character exists "
                         "in the target code page.";
    /* each step cannot decode more than 1 character, but a character can be
       represented as a surrogate pair */
    wchar_t buffer[2], *out;
    int insize;
    Py_ssize_t outsize;
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;
    PyObject *encoding_obj = NULL;
    const char *encoding;
    DWORD err;
    /* Stays -1 unless the success path at the bottom is reached */
    int ret = -1;

    assert(size > 0);

    /* encoding_obj (if set) owns the storage behind `encoding` and is
       released at the error label */
    encoding = code_page_name(code_page, &encoding_obj);
    if (encoding == NULL)
        return -1;

    if ((errors == NULL || strcmp(errors, "strict") == 0) && final) {
        /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
           UnicodeDecodeError. */
        make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
        if (exc != NULL) {
            PyCodec_StrictErrors(exc);
            Py_CLEAR(exc);
        }
        goto error;
    }

    /* Extend a wchar_t* buffer */
    Py_ssize_t n = *bufsize;   /* Get the current length */
    /* Guard the size computation below against Py_ssize_t overflow */
    if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
        PyErr_NoMemory();
        goto error;
    }
    /* Reserve room for the worst case of two wchar_t per input byte */
    if (widechar_resize(buf, bufsize, n + size * Py_ARRAY_LENGTH(buffer)) < 0) {
        goto error;
    }
    out = *buf + n;

    /* Decode the byte string character per character */
    while (in < endin)
    {
        /* Decode a character: try 1 byte first, then progressively longer
           sequences until the conversion succeeds */
        insize = 1;
        do
        {
            outsize = MultiByteToWideChar(code_page, flags,
                                          in, insize,
                                          buffer, Py_ARRAY_LENGTH(buffer));
            if (outsize > 0)
                break;
            err = GetLastError();
            if (err == ERROR_INVALID_FLAGS && flags) {
                /* For some code pages (e.g. UTF-7) flags must be set to 0. */
                flags = 0;
                /* Retry with the same insize (continue jumps to the loop
                   condition below) */
                continue;
            }
            if (err != ERROR_NO_UNICODE_TRANSLATION
                && err != ERROR_INSUFFICIENT_BUFFER)
            {
                PyErr_SetFromWindowsErr(0);
                goto error;
            }
            insize++;
        }
        /* 4=maximum length of a UTF-8 sequence */
        while (insize <= 4 && (in + insize) <= endin);

        if (outsize <= 0) {
            /* No prefix of the remaining input could be decoded */
            Py_ssize_t startinpos, endinpos, outpos;

            /* last character in partial decode? */
            if (in + insize >= endin && !final)
                break;

            /* Report a single undecodable byte to the error handler */
            startinpos = in - startin;
            endinpos = startinpos + 1;
            outpos = out - *buf;
            if (unicode_decode_call_errorhandler_wchar(
                    errors, &errorHandler,
                    encoding, reason,
                    &startin, &endin, &startinpos, &endinpos, &exc, &in,
                    buf, bufsize, &outpos))
            {
                goto error;
            }
            /* *buf is passed by address above, so recompute the write
               pointer from the updated output position */
            out = *buf + outpos;
        }
        else {
            in += insize;
            memcpy(out, buffer, outsize * sizeof(wchar_t));
            out += outsize;
        }
    }

    /* Shrink the buffer: record the number of wchar_t actually written */
    assert(out - *buf <= *bufsize);
    *bufsize = out - *buf;
    /* (in - startin) <= size and size is an int */
    ret = Py_SAFE_DOWNCAST(in - startin, Py_ssize_t, int);

    /* Shared cleanup for both the success and the failure paths */
error:
    Py_XDECREF(encoding_obj);
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return ret;
}
7408
Victor Stinner3a50e702011-10-18 21:21:00 +02007409static PyObject *
7410decode_code_page_stateful(int code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007411 const char *s, Py_ssize_t size,
7412 const char *errors, Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007413{
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007414 wchar_t *buf = NULL;
7415 Py_ssize_t bufsize = 0;
Victor Stinner76a31a62011-11-04 00:05:13 +01007416 int chunk_size, final, converted, done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007417
Victor Stinner3a50e702011-10-18 21:21:00 +02007418 if (code_page < 0) {
7419 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7420 return NULL;
7421 }
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03007422 if (size < 0) {
7423 PyErr_BadInternalCall();
7424 return NULL;
7425 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007426
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007427 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00007428 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007429
Victor Stinner76a31a62011-11-04 00:05:13 +01007430 do
7431 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007432#ifdef NEED_RETRY
Steve Dower7ebdda02019-08-21 16:22:33 -07007433 if (size > DECODING_CHUNK_SIZE) {
7434 chunk_size = DECODING_CHUNK_SIZE;
Victor Stinner76a31a62011-11-04 00:05:13 +01007435 final = 0;
7436 done = 0;
7437 }
7438 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007439#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007440 {
7441 chunk_size = (int)size;
7442 final = (consumed == NULL);
7443 done = 1;
7444 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007445
Victor Stinner76a31a62011-11-04 00:05:13 +01007446 if (chunk_size == 0 && done) {
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007447 if (buf != NULL)
Victor Stinner76a31a62011-11-04 00:05:13 +01007448 break;
Serhiy Storchaka678db842013-01-26 12:16:36 +02007449 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner76a31a62011-11-04 00:05:13 +01007450 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007451
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007452 converted = decode_code_page_strict(code_page, &buf, &bufsize,
Victor Stinner76a31a62011-11-04 00:05:13 +01007453 s, chunk_size);
7454 if (converted == -2)
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007455 converted = decode_code_page_errors(code_page, &buf, &bufsize,
Victor Stinner76a31a62011-11-04 00:05:13 +01007456 s, chunk_size,
Victor Stinner7d00cc12014-03-17 23:08:06 +01007457 errors, final);
7458 assert(converted != 0 || done);
Victor Stinner76a31a62011-11-04 00:05:13 +01007459
7460 if (converted < 0) {
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007461 PyMem_Free(buf);
Victor Stinner76a31a62011-11-04 00:05:13 +01007462 return NULL;
7463 }
7464
7465 if (consumed)
7466 *consumed += converted;
7467
7468 s += converted;
7469 size -= converted;
7470 } while (!done);
Victor Stinner3a50e702011-10-18 21:21:00 +02007471
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007472 PyObject *v = PyUnicode_FromWideChar(buf, bufsize);
7473 PyMem_Free(buf);
7474 return v;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007475}
7476
Alexander Belopolsky40018472011-02-26 01:02:56 +00007477PyObject *
Victor Stinner3a50e702011-10-18 21:21:00 +02007478PyUnicode_DecodeCodePageStateful(int code_page,
7479 const char *s,
7480 Py_ssize_t size,
7481 const char *errors,
7482 Py_ssize_t *consumed)
7483{
7484 return decode_code_page_stateful(code_page, s, size, errors, consumed);
7485}
7486
7487PyObject *
7488PyUnicode_DecodeMBCSStateful(const char *s,
7489 Py_ssize_t size,
7490 const char *errors,
7491 Py_ssize_t *consumed)
7492{
7493 return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
7494}
7495
7496PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00007497PyUnicode_DecodeMBCS(const char *s,
7498 Py_ssize_t size,
7499 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007500{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007501 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
7502}
7503
Victor Stinner3a50e702011-10-18 21:21:00 +02007504static DWORD
7505encode_code_page_flags(UINT code_page, const char *errors)
7506{
7507 if (code_page == CP_UTF8) {
Steve Dower3e96f322015-03-02 08:01:10 -08007508 return WC_ERR_INVALID_CHARS;
Victor Stinner3a50e702011-10-18 21:21:00 +02007509 }
7510 else if (code_page == CP_UTF7) {
7511 /* CP_UTF7 only supports flags=0 */
7512 return 0;
7513 }
7514 else {
7515 if (errors != NULL && strcmp(errors, "replace") == 0)
7516 return 0;
7517 else
7518 return WC_NO_BEST_FIT_CHARS;
7519 }
7520}
7521
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007522/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007523 * Encode a Unicode string to a Windows code page into a byte string in strict
7524 * mode.
7525 *
7526 * Returns consumed characters if succeed, returns -2 on encode error, or raise
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007527 * an OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007528 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007529static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007530encode_code_page_strict(UINT code_page, PyObject **outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007531 PyObject *unicode, Py_ssize_t offset, int len,
Victor Stinner3a50e702011-10-18 21:21:00 +02007532 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007533{
Victor Stinner554f3f02010-06-16 23:33:54 +00007534 BOOL usedDefaultChar = FALSE;
Victor Stinner3a50e702011-10-18 21:21:00 +02007535 BOOL *pusedDefaultChar = &usedDefaultChar;
7536 int outsize;
Victor Stinner24729f32011-11-10 20:31:37 +01007537 wchar_t *p;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007538 Py_ssize_t size;
Victor Stinner3a50e702011-10-18 21:21:00 +02007539 const DWORD flags = encode_code_page_flags(code_page, NULL);
7540 char *out;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007541 /* Create a substring so that we can get the UTF-16 representation
7542 of just the slice under consideration. */
7543 PyObject *substring;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007544
Martin v. Löwis3d325192011-11-04 18:23:06 +01007545 assert(len > 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007546
Victor Stinner3a50e702011-10-18 21:21:00 +02007547 if (code_page != CP_UTF8 && code_page != CP_UTF7)
Victor Stinner554f3f02010-06-16 23:33:54 +00007548 pusedDefaultChar = &usedDefaultChar;
Victor Stinner3a50e702011-10-18 21:21:00 +02007549 else
Victor Stinner554f3f02010-06-16 23:33:54 +00007550 pusedDefaultChar = NULL;
Victor Stinner554f3f02010-06-16 23:33:54 +00007551
Victor Stinner2fc507f2011-11-04 20:06:39 +01007552 substring = PyUnicode_Substring(unicode, offset, offset+len);
7553 if (substring == NULL)
7554 return -1;
7555 p = PyUnicode_AsUnicodeAndSize(substring, &size);
7556 if (p == NULL) {
7557 Py_DECREF(substring);
7558 return -1;
7559 }
Victor Stinner9f067f42013-06-05 00:21:31 +02007560 assert(size <= INT_MAX);
Martin v. Löwis3d325192011-11-04 18:23:06 +01007561
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007562 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007563 outsize = WideCharToMultiByte(code_page, flags,
Victor Stinner9f067f42013-06-05 00:21:31 +02007564 p, (int)size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007565 NULL, 0,
7566 NULL, pusedDefaultChar);
7567 if (outsize <= 0)
7568 goto error;
7569 /* If we used a default char, then we failed! */
Victor Stinner2fc507f2011-11-04 20:06:39 +01007570 if (pusedDefaultChar && *pusedDefaultChar) {
7571 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007572 return -2;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007573 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007574
Victor Stinner3a50e702011-10-18 21:21:00 +02007575 if (*outbytes == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007576 /* Create string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007577 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007578 if (*outbytes == NULL) {
7579 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007580 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007581 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007582 out = PyBytes_AS_STRING(*outbytes);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007583 }
7584 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007585 /* Extend string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007586 const Py_ssize_t n = PyBytes_Size(*outbytes);
7587 if (outsize > PY_SSIZE_T_MAX - n) {
7588 PyErr_NoMemory();
Victor Stinner2fc507f2011-11-04 20:06:39 +01007589 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007590 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007591 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007592 if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
7593 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007594 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007595 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007596 out = PyBytes_AS_STRING(*outbytes) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007597 }
7598
7599 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007600 outsize = WideCharToMultiByte(code_page, flags,
Victor Stinner9f067f42013-06-05 00:21:31 +02007601 p, (int)size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007602 out, outsize,
7603 NULL, pusedDefaultChar);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007604 Py_CLEAR(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007605 if (outsize <= 0)
7606 goto error;
7607 if (pusedDefaultChar && *pusedDefaultChar)
7608 return -2;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007609 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00007610
Victor Stinner3a50e702011-10-18 21:21:00 +02007611error:
Victor Stinner2fc507f2011-11-04 20:06:39 +01007612 Py_XDECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007613 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7614 return -2;
7615 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007616 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007617}
7618
Victor Stinner3a50e702011-10-18 21:21:00 +02007619/*
Serhiy Storchakad65c9492015-11-02 14:10:23 +02007620 * Encode a Unicode string to a Windows code page into a byte string using an
Victor Stinner3a50e702011-10-18 21:21:00 +02007621 * error handler.
7622 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007623 * Returns consumed characters if succeed, or raise an OSError and returns
Victor Stinner3a50e702011-10-18 21:21:00 +02007624 * -1 on other error.
7625 */
7626static int
7627encode_code_page_errors(UINT code_page, PyObject **outbytes,
Victor Stinner7581cef2011-11-03 22:32:33 +01007628 PyObject *unicode, Py_ssize_t unicode_offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007629 Py_ssize_t insize, const char* errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007630{
Victor Stinner3a50e702011-10-18 21:21:00 +02007631 const DWORD flags = encode_code_page_flags(code_page, errors);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007632 Py_ssize_t pos = unicode_offset;
7633 Py_ssize_t endin = unicode_offset + insize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007634 /* Ideally, we should get reason from FormatMessage. This is the Windows
7635 2000 English version of the message. */
7636 const char *reason = "invalid character";
7637 /* 4=maximum length of a UTF-8 sequence */
7638 char buffer[4];
7639 BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
7640 Py_ssize_t outsize;
7641 char *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007642 PyObject *errorHandler = NULL;
7643 PyObject *exc = NULL;
7644 PyObject *encoding_obj = NULL;
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007645 const char *encoding;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007646 Py_ssize_t newpos, newoutsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007647 PyObject *rep;
7648 int ret = -1;
7649
7650 assert(insize > 0);
7651
7652 encoding = code_page_name(code_page, &encoding_obj);
7653 if (encoding == NULL)
7654 return -1;
7655
7656 if (errors == NULL || strcmp(errors, "strict") == 0) {
7657 /* The last error was ERROR_NO_UNICODE_TRANSLATION,
7658 then we raise a UnicodeEncodeError. */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007659 make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
Victor Stinner3a50e702011-10-18 21:21:00 +02007660 if (exc != NULL) {
7661 PyCodec_StrictErrors(exc);
7662 Py_DECREF(exc);
7663 }
7664 Py_XDECREF(encoding_obj);
7665 return -1;
7666 }
7667
7668 if (code_page != CP_UTF8 && code_page != CP_UTF7)
7669 pusedDefaultChar = &usedDefaultChar;
7670 else
7671 pusedDefaultChar = NULL;
7672
7673 if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
7674 PyErr_NoMemory();
7675 goto error;
7676 }
7677 outsize = insize * Py_ARRAY_LENGTH(buffer);
7678
7679 if (*outbytes == NULL) {
7680 /* Create string object */
7681 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7682 if (*outbytes == NULL)
7683 goto error;
7684 out = PyBytes_AS_STRING(*outbytes);
7685 }
7686 else {
7687 /* Extend string object */
7688 Py_ssize_t n = PyBytes_Size(*outbytes);
7689 if (n > PY_SSIZE_T_MAX - outsize) {
7690 PyErr_NoMemory();
7691 goto error;
7692 }
7693 if (_PyBytes_Resize(outbytes, n + outsize) < 0)
7694 goto error;
7695 out = PyBytes_AS_STRING(*outbytes) + n;
7696 }
7697
7698 /* Encode the string character per character */
Martin v. Löwis3d325192011-11-04 18:23:06 +01007699 while (pos < endin)
Victor Stinner3a50e702011-10-18 21:21:00 +02007700 {
Victor Stinner2fc507f2011-11-04 20:06:39 +01007701 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
7702 wchar_t chars[2];
7703 int charsize;
7704 if (ch < 0x10000) {
7705 chars[0] = (wchar_t)ch;
7706 charsize = 1;
7707 }
7708 else {
Victor Stinner76df43d2012-10-30 01:42:39 +01007709 chars[0] = Py_UNICODE_HIGH_SURROGATE(ch);
7710 chars[1] = Py_UNICODE_LOW_SURROGATE(ch);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007711 charsize = 2;
7712 }
7713
Victor Stinner3a50e702011-10-18 21:21:00 +02007714 outsize = WideCharToMultiByte(code_page, flags,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007715 chars, charsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02007716 buffer, Py_ARRAY_LENGTH(buffer),
7717 NULL, pusedDefaultChar);
7718 if (outsize > 0) {
7719 if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
7720 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007721 pos++;
Victor Stinner3a50e702011-10-18 21:21:00 +02007722 memcpy(out, buffer, outsize);
7723 out += outsize;
7724 continue;
7725 }
7726 }
7727 else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
7728 PyErr_SetFromWindowsErr(0);
7729 goto error;
7730 }
7731
Victor Stinner3a50e702011-10-18 21:21:00 +02007732 rep = unicode_encode_call_errorhandler(
7733 errors, &errorHandler, encoding, reason,
Victor Stinner7581cef2011-11-03 22:32:33 +01007734 unicode, &exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007735 pos, pos + 1, &newpos);
Victor Stinner3a50e702011-10-18 21:21:00 +02007736 if (rep == NULL)
7737 goto error;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007738 pos = newpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007739
7740 if (PyBytes_Check(rep)) {
7741 outsize = PyBytes_GET_SIZE(rep);
7742 if (outsize != 1) {
7743 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7744 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7745 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7746 Py_DECREF(rep);
7747 goto error;
7748 }
7749 out = PyBytes_AS_STRING(*outbytes) + offset;
7750 }
7751 memcpy(out, PyBytes_AS_STRING(rep), outsize);
7752 out += outsize;
7753 }
7754 else {
7755 Py_ssize_t i;
7756 enum PyUnicode_Kind kind;
7757 void *data;
7758
Benjamin Petersonbac79492012-01-14 13:34:47 -05007759 if (PyUnicode_READY(rep) == -1) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007760 Py_DECREF(rep);
7761 goto error;
7762 }
7763
7764 outsize = PyUnicode_GET_LENGTH(rep);
7765 if (outsize != 1) {
7766 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7767 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7768 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7769 Py_DECREF(rep);
7770 goto error;
7771 }
7772 out = PyBytes_AS_STRING(*outbytes) + offset;
7773 }
7774 kind = PyUnicode_KIND(rep);
7775 data = PyUnicode_DATA(rep);
7776 for (i=0; i < outsize; i++) {
7777 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7778 if (ch > 127) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007779 raise_encode_exception(&exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007780 encoding, unicode,
7781 pos, pos + 1,
Victor Stinner3a50e702011-10-18 21:21:00 +02007782 "unable to encode error handler result to ASCII");
7783 Py_DECREF(rep);
7784 goto error;
7785 }
7786 *out = (unsigned char)ch;
7787 out++;
7788 }
7789 }
7790 Py_DECREF(rep);
7791 }
7792 /* write a NUL byte */
7793 *out = 0;
7794 outsize = out - PyBytes_AS_STRING(*outbytes);
7795 assert(outsize <= PyBytes_GET_SIZE(*outbytes));
7796 if (_PyBytes_Resize(outbytes, outsize) < 0)
7797 goto error;
7798 ret = 0;
7799
7800error:
7801 Py_XDECREF(encoding_obj);
7802 Py_XDECREF(errorHandler);
7803 Py_XDECREF(exc);
7804 return ret;
7805}
7806
Victor Stinner3a50e702011-10-18 21:21:00 +02007807static PyObject *
7808encode_code_page(int code_page,
Victor Stinner7581cef2011-11-03 22:32:33 +01007809 PyObject *unicode,
Victor Stinner3a50e702011-10-18 21:21:00 +02007810 const char *errors)
7811{
Martin v. Löwis3d325192011-11-04 18:23:06 +01007812 Py_ssize_t len;
Victor Stinner3a50e702011-10-18 21:21:00 +02007813 PyObject *outbytes = NULL;
Victor Stinner7581cef2011-11-03 22:32:33 +01007814 Py_ssize_t offset;
Victor Stinner76a31a62011-11-04 00:05:13 +01007815 int chunk_len, ret, done;
Victor Stinner7581cef2011-11-03 22:32:33 +01007816
Victor Stinner29dacf22015-01-26 16:41:32 +01007817 if (!PyUnicode_Check(unicode)) {
7818 PyErr_BadArgument();
7819 return NULL;
7820 }
7821
Benjamin Petersonbac79492012-01-14 13:34:47 -05007822 if (PyUnicode_READY(unicode) == -1)
Victor Stinner2fc507f2011-11-04 20:06:39 +01007823 return NULL;
7824 len = PyUnicode_GET_LENGTH(unicode);
Guido van Rossum03e29f12000-05-04 15:52:20 +00007825
Victor Stinner3a50e702011-10-18 21:21:00 +02007826 if (code_page < 0) {
7827 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7828 return NULL;
7829 }
7830
Martin v. Löwis3d325192011-11-04 18:23:06 +01007831 if (len == 0)
Victor Stinner76a31a62011-11-04 00:05:13 +01007832 return PyBytes_FromStringAndSize(NULL, 0);
7833
Victor Stinner7581cef2011-11-03 22:32:33 +01007834 offset = 0;
7835 do
7836 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007837#ifdef NEED_RETRY
Steve Dower7ebdda02019-08-21 16:22:33 -07007838 if (len > DECODING_CHUNK_SIZE) {
7839 chunk_len = DECODING_CHUNK_SIZE;
Victor Stinner76a31a62011-11-04 00:05:13 +01007840 done = 0;
7841 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007842 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007843#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007844 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007845 chunk_len = (int)len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007846 done = 1;
7847 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007848
Victor Stinner76a31a62011-11-04 00:05:13 +01007849 ret = encode_code_page_strict(code_page, &outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007850 unicode, offset, chunk_len,
Victor Stinner76a31a62011-11-04 00:05:13 +01007851 errors);
7852 if (ret == -2)
7853 ret = encode_code_page_errors(code_page, &outbytes,
7854 unicode, offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007855 chunk_len, errors);
Victor Stinner7581cef2011-11-03 22:32:33 +01007856 if (ret < 0) {
7857 Py_XDECREF(outbytes);
7858 return NULL;
7859 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007860
Victor Stinner7581cef2011-11-03 22:32:33 +01007861 offset += chunk_len;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007862 len -= chunk_len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007863 } while (!done);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007864
Victor Stinner3a50e702011-10-18 21:21:00 +02007865 return outbytes;
7866}
7867
7868PyObject *
7869PyUnicode_EncodeMBCS(const Py_UNICODE *p,
7870 Py_ssize_t size,
7871 const char *errors)
7872{
Victor Stinner7581cef2011-11-03 22:32:33 +01007873 PyObject *unicode, *res;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02007874 unicode = PyUnicode_FromWideChar(p, size);
Victor Stinner7581cef2011-11-03 22:32:33 +01007875 if (unicode == NULL)
7876 return NULL;
7877 res = encode_code_page(CP_ACP, unicode, errors);
7878 Py_DECREF(unicode);
7879 return res;
Victor Stinner3a50e702011-10-18 21:21:00 +02007880}
7881
7882PyObject *
7883PyUnicode_EncodeCodePage(int code_page,
7884 PyObject *unicode,
7885 const char *errors)
7886{
Victor Stinner7581cef2011-11-03 22:32:33 +01007887 return encode_code_page(code_page, unicode, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007888}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007889
Alexander Belopolsky40018472011-02-26 01:02:56 +00007890PyObject *
7891PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007892{
Victor Stinner7581cef2011-11-03 22:32:33 +01007893 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007894}
7895
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007896#undef NEED_RETRY
7897
Steve Dowercc16be82016-09-08 10:35:16 -07007898#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007899
Guido van Rossumd57fd912000-03-10 22:53:23 +00007900/* --- Character Mapping Codec -------------------------------------------- */
7901
Victor Stinnerfb161b12013-04-18 01:44:27 +02007902static int
7903charmap_decode_string(const char *s,
7904 Py_ssize_t size,
7905 PyObject *mapping,
7906 const char *errors,
7907 _PyUnicodeWriter *writer)
7908{
7909 const char *starts = s;
7910 const char *e;
7911 Py_ssize_t startinpos, endinpos;
7912 PyObject *errorHandler = NULL, *exc = NULL;
7913 Py_ssize_t maplen;
7914 enum PyUnicode_Kind mapkind;
7915 void *mapdata;
7916 Py_UCS4 x;
7917 unsigned char ch;
7918
7919 if (PyUnicode_READY(mapping) == -1)
7920 return -1;
7921
7922 maplen = PyUnicode_GET_LENGTH(mapping);
7923 mapdata = PyUnicode_DATA(mapping);
7924 mapkind = PyUnicode_KIND(mapping);
7925
7926 e = s + size;
7927
7928 if (mapkind == PyUnicode_1BYTE_KIND && maplen >= 256) {
7929 /* fast-path for cp037, cp500 and iso8859_1 encodings. iso8859_1
7930 * is disabled in encoding aliases, latin1 is preferred because
7931 * its implementation is faster. */
7932 Py_UCS1 *mapdata_ucs1 = (Py_UCS1 *)mapdata;
7933 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
7934 Py_UCS4 maxchar = writer->maxchar;
7935
7936 assert (writer->kind == PyUnicode_1BYTE_KIND);
7937 while (s < e) {
7938 ch = *s;
7939 x = mapdata_ucs1[ch];
7940 if (x > maxchar) {
7941 if (_PyUnicodeWriter_Prepare(writer, 1, 0xff) == -1)
7942 goto onError;
7943 maxchar = writer->maxchar;
7944 outdata = (Py_UCS1 *)writer->data;
7945 }
7946 outdata[writer->pos] = x;
7947 writer->pos++;
7948 ++s;
7949 }
7950 return 0;
7951 }
7952
7953 while (s < e) {
7954 if (mapkind == PyUnicode_2BYTE_KIND && maplen >= 256) {
7955 enum PyUnicode_Kind outkind = writer->kind;
7956 Py_UCS2 *mapdata_ucs2 = (Py_UCS2 *)mapdata;
7957 if (outkind == PyUnicode_1BYTE_KIND) {
7958 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
7959 Py_UCS4 maxchar = writer->maxchar;
7960 while (s < e) {
7961 ch = *s;
7962 x = mapdata_ucs2[ch];
7963 if (x > maxchar)
7964 goto Error;
7965 outdata[writer->pos] = x;
7966 writer->pos++;
7967 ++s;
7968 }
7969 break;
7970 }
7971 else if (outkind == PyUnicode_2BYTE_KIND) {
7972 Py_UCS2 *outdata = (Py_UCS2 *)writer->data;
7973 while (s < e) {
7974 ch = *s;
7975 x = mapdata_ucs2[ch];
7976 if (x == 0xFFFE)
7977 goto Error;
7978 outdata[writer->pos] = x;
7979 writer->pos++;
7980 ++s;
7981 }
7982 break;
7983 }
7984 }
7985 ch = *s;
7986
7987 if (ch < maplen)
7988 x = PyUnicode_READ(mapkind, mapdata, ch);
7989 else
7990 x = 0xfffe; /* invalid value */
7991Error:
7992 if (x == 0xfffe)
7993 {
7994 /* undefined mapping */
7995 startinpos = s-starts;
7996 endinpos = startinpos+1;
7997 if (unicode_decode_call_errorhandler_writer(
7998 errors, &errorHandler,
7999 "charmap", "character maps to <undefined>",
8000 &starts, &e, &startinpos, &endinpos, &exc, &s,
8001 writer)) {
8002 goto onError;
8003 }
8004 continue;
8005 }
8006
8007 if (_PyUnicodeWriter_WriteCharInline(writer, x) < 0)
8008 goto onError;
8009 ++s;
8010 }
8011 Py_XDECREF(errorHandler);
8012 Py_XDECREF(exc);
8013 return 0;
8014
8015onError:
8016 Py_XDECREF(errorHandler);
8017 Py_XDECREF(exc);
8018 return -1;
8019}
8020
8021static int
8022charmap_decode_mapping(const char *s,
8023 Py_ssize_t size,
8024 PyObject *mapping,
8025 const char *errors,
8026 _PyUnicodeWriter *writer)
8027{
8028 const char *starts = s;
8029 const char *e;
8030 Py_ssize_t startinpos, endinpos;
8031 PyObject *errorHandler = NULL, *exc = NULL;
8032 unsigned char ch;
Victor Stinnerf4f24242013-05-07 01:01:31 +02008033 PyObject *key, *item = NULL;
Victor Stinnerfb161b12013-04-18 01:44:27 +02008034
8035 e = s + size;
8036
8037 while (s < e) {
8038 ch = *s;
8039
8040 /* Get mapping (char ordinal -> integer, Unicode char or None) */
8041 key = PyLong_FromLong((long)ch);
8042 if (key == NULL)
8043 goto onError;
8044
8045 item = PyObject_GetItem(mapping, key);
8046 Py_DECREF(key);
8047 if (item == NULL) {
8048 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8049 /* No mapping found means: mapping is undefined. */
8050 PyErr_Clear();
8051 goto Undefined;
8052 } else
8053 goto onError;
8054 }
8055
8056 /* Apply mapping */
8057 if (item == Py_None)
8058 goto Undefined;
8059 if (PyLong_Check(item)) {
8060 long value = PyLong_AS_LONG(item);
8061 if (value == 0xFFFE)
8062 goto Undefined;
8063 if (value < 0 || value > MAX_UNICODE) {
8064 PyErr_Format(PyExc_TypeError,
8065 "character mapping must be in range(0x%lx)",
8066 (unsigned long)MAX_UNICODE + 1);
8067 goto onError;
8068 }
8069
8070 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
8071 goto onError;
8072 }
8073 else if (PyUnicode_Check(item)) {
8074 if (PyUnicode_READY(item) == -1)
8075 goto onError;
8076 if (PyUnicode_GET_LENGTH(item) == 1) {
8077 Py_UCS4 value = PyUnicode_READ_CHAR(item, 0);
8078 if (value == 0xFFFE)
8079 goto Undefined;
8080 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
8081 goto onError;
8082 }
8083 else {
8084 writer->overallocate = 1;
8085 if (_PyUnicodeWriter_WriteStr(writer, item) == -1)
8086 goto onError;
8087 }
8088 }
8089 else {
8090 /* wrong return value */
8091 PyErr_SetString(PyExc_TypeError,
8092 "character mapping must return integer, None or str");
8093 goto onError;
8094 }
8095 Py_CLEAR(item);
8096 ++s;
8097 continue;
8098
8099Undefined:
8100 /* undefined mapping */
8101 Py_CLEAR(item);
8102 startinpos = s-starts;
8103 endinpos = startinpos+1;
8104 if (unicode_decode_call_errorhandler_writer(
8105 errors, &errorHandler,
8106 "charmap", "character maps to <undefined>",
8107 &starts, &e, &startinpos, &endinpos, &exc, &s,
8108 writer)) {
8109 goto onError;
8110 }
8111 }
8112 Py_XDECREF(errorHandler);
8113 Py_XDECREF(exc);
8114 return 0;
8115
8116onError:
8117 Py_XDECREF(item);
8118 Py_XDECREF(errorHandler);
8119 Py_XDECREF(exc);
8120 return -1;
8121}
8122
Alexander Belopolsky40018472011-02-26 01:02:56 +00008123PyObject *
8124PyUnicode_DecodeCharmap(const char *s,
8125 Py_ssize_t size,
8126 PyObject *mapping,
8127 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008128{
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008129 _PyUnicodeWriter writer;
Tim Petersced69f82003-09-16 20:30:58 +00008130
Guido van Rossumd57fd912000-03-10 22:53:23 +00008131 /* Default to Latin-1 */
8132 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008133 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008134
Guido van Rossumd57fd912000-03-10 22:53:23 +00008135 if (size == 0)
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02008136 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner8f674cc2013-04-17 23:02:17 +02008137 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02008138 writer.min_length = size;
8139 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008140 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008141
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00008142 if (PyUnicode_CheckExact(mapping)) {
Victor Stinnerfb161b12013-04-18 01:44:27 +02008143 if (charmap_decode_string(s, size, mapping, errors, &writer) < 0)
8144 goto onError;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00008145 }
8146 else {
Victor Stinnerfb161b12013-04-18 01:44:27 +02008147 if (charmap_decode_mapping(s, size, mapping, errors, &writer) < 0)
8148 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008149 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008150 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00008151
Benjamin Peterson29060642009-01-31 22:14:21 +00008152 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008153 _PyUnicodeWriter_Dealloc(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008154 return NULL;
8155}
8156
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008157/* Charmap encoding: the lookup table */
8158
Alexander Belopolsky40018472011-02-26 01:02:56 +00008159struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00008160 PyObject_HEAD
8161 unsigned char level1[32];
8162 int count2, count3;
8163 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008164};
8165
8166static PyObject*
8167encoding_map_size(PyObject *obj, PyObject* args)
8168{
8169 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008170 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00008171 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008172}
8173
8174static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008175 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00008176 PyDoc_STR("Return the size (in bytes) of this object") },
8177 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008178};
8179
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008180static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008181 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008182 "EncodingMap", /*tp_name*/
8183 sizeof(struct encoding_map), /*tp_basicsize*/
8184 0, /*tp_itemsize*/
8185 /* methods */
Inada Naoki7d408692019-05-29 17:23:27 +09008186 0, /*tp_dealloc*/
Jeroen Demeyer530f5062019-05-31 04:13:39 +02008187 0, /*tp_vectorcall_offset*/
Benjamin Peterson29060642009-01-31 22:14:21 +00008188 0, /*tp_getattr*/
8189 0, /*tp_setattr*/
Jeroen Demeyer530f5062019-05-31 04:13:39 +02008190 0, /*tp_as_async*/
Benjamin Peterson29060642009-01-31 22:14:21 +00008191 0, /*tp_repr*/
8192 0, /*tp_as_number*/
8193 0, /*tp_as_sequence*/
8194 0, /*tp_as_mapping*/
8195 0, /*tp_hash*/
8196 0, /*tp_call*/
8197 0, /*tp_str*/
8198 0, /*tp_getattro*/
8199 0, /*tp_setattro*/
8200 0, /*tp_as_buffer*/
8201 Py_TPFLAGS_DEFAULT, /*tp_flags*/
8202 0, /*tp_doc*/
8203 0, /*tp_traverse*/
8204 0, /*tp_clear*/
8205 0, /*tp_richcompare*/
8206 0, /*tp_weaklistoffset*/
8207 0, /*tp_iter*/
8208 0, /*tp_iternext*/
8209 encoding_map_methods, /*tp_methods*/
8210 0, /*tp_members*/
8211 0, /*tp_getset*/
8212 0, /*tp_base*/
8213 0, /*tp_dict*/
8214 0, /*tp_descr_get*/
8215 0, /*tp_descr_set*/
8216 0, /*tp_dictoffset*/
8217 0, /*tp_init*/
8218 0, /*tp_alloc*/
8219 0, /*tp_new*/
8220 0, /*tp_free*/
8221 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008222};
8223
8224PyObject*
8225PyUnicode_BuildEncodingMap(PyObject* string)
8226{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008227 PyObject *result;
8228 struct encoding_map *mresult;
8229 int i;
8230 int need_dict = 0;
8231 unsigned char level1[32];
8232 unsigned char level2[512];
8233 unsigned char *mlevel1, *mlevel2, *mlevel3;
8234 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008235 int kind;
8236 void *data;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008237 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008238 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008239
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008240 if (!PyUnicode_Check(string) || !PyUnicode_GET_LENGTH(string)) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008241 PyErr_BadArgument();
8242 return NULL;
8243 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008244 kind = PyUnicode_KIND(string);
8245 data = PyUnicode_DATA(string);
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008246 length = PyUnicode_GET_LENGTH(string);
8247 length = Py_MIN(length, 256);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008248 memset(level1, 0xFF, sizeof level1);
8249 memset(level2, 0xFF, sizeof level2);
8250
8251 /* If there isn't a one-to-one mapping of NULL to \0,
8252 or if there are non-BMP characters, we need to use
8253 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008254 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008255 need_dict = 1;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008256 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008257 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008258 ch = PyUnicode_READ(kind, data, i);
8259 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008260 need_dict = 1;
8261 break;
8262 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008263 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008264 /* unmapped character */
8265 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008266 l1 = ch >> 11;
8267 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008268 if (level1[l1] == 0xFF)
8269 level1[l1] = count2++;
8270 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00008271 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008272 }
8273
8274 if (count2 >= 0xFF || count3 >= 0xFF)
8275 need_dict = 1;
8276
8277 if (need_dict) {
8278 PyObject *result = PyDict_New();
8279 PyObject *key, *value;
8280 if (!result)
8281 return NULL;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008282 for (i = 0; i < length; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008283 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00008284 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008285 if (!key || !value)
8286 goto failed1;
8287 if (PyDict_SetItem(result, key, value) == -1)
8288 goto failed1;
8289 Py_DECREF(key);
8290 Py_DECREF(value);
8291 }
8292 return result;
8293 failed1:
8294 Py_XDECREF(key);
8295 Py_XDECREF(value);
8296 Py_DECREF(result);
8297 return NULL;
8298 }
8299
8300 /* Create a three-level trie */
8301 result = PyObject_MALLOC(sizeof(struct encoding_map) +
8302 16*count2 + 128*count3 - 1);
8303 if (!result)
8304 return PyErr_NoMemory();
8305 PyObject_Init(result, &EncodingMapType);
8306 mresult = (struct encoding_map*)result;
8307 mresult->count2 = count2;
8308 mresult->count3 = count3;
8309 mlevel1 = mresult->level1;
8310 mlevel2 = mresult->level23;
8311 mlevel3 = mresult->level23 + 16*count2;
8312 memcpy(mlevel1, level1, 32);
8313 memset(mlevel2, 0xFF, 16*count2);
8314 memset(mlevel3, 0, 128*count3);
8315 count3 = 0;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008316 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008317 int o1, o2, o3, i2, i3;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008318 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8319 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008320 /* unmapped character */
8321 continue;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008322 o1 = ch>>11;
8323 o2 = (ch>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008324 i2 = 16*mlevel1[o1] + o2;
8325 if (mlevel2[i2] == 0xFF)
8326 mlevel2[i2] = count3++;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008327 o3 = ch & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008328 i3 = 128*mlevel2[i2] + o3;
8329 mlevel3[i3] = i;
8330 }
8331 return result;
8332}
8333
8334static int
Victor Stinner22168992011-11-20 17:09:18 +01008335encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008336{
8337 struct encoding_map *map = (struct encoding_map*)mapping;
8338 int l1 = c>>11;
8339 int l2 = (c>>7) & 0xF;
8340 int l3 = c & 0x7F;
8341 int i;
8342
Victor Stinner22168992011-11-20 17:09:18 +01008343 if (c > 0xFFFF)
Benjamin Peterson29060642009-01-31 22:14:21 +00008344 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008345 if (c == 0)
8346 return 0;
8347 /* level 1*/
8348 i = map->level1[l1];
8349 if (i == 0xFF) {
8350 return -1;
8351 }
8352 /* level 2*/
8353 i = map->level23[16*i+l2];
8354 if (i == 0xFF) {
8355 return -1;
8356 }
8357 /* level 3 */
8358 i = map->level23[16*map->count2 + 128*i + l3];
8359 if (i == 0) {
8360 return -1;
8361 }
8362 return i;
8363}
8364
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008365/* Lookup the character ch in the mapping. If the character
8366 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00008367 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008368static PyObject *
Victor Stinner22168992011-11-20 17:09:18 +01008369charmapencode_lookup(Py_UCS4 c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008370{
Christian Heimes217cfd12007-12-02 14:31:20 +00008371 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008372 PyObject *x;
8373
8374 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008375 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008376 x = PyObject_GetItem(mapping, w);
8377 Py_DECREF(w);
8378 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008379 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8380 /* No mapping found means: mapping is undefined. */
8381 PyErr_Clear();
Serhiy Storchaka228b12e2017-01-23 09:47:21 +02008382 Py_RETURN_NONE;
Benjamin Peterson29060642009-01-31 22:14:21 +00008383 } else
8384 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008385 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00008386 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008387 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00008388 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008389 long value = PyLong_AS_LONG(x);
8390 if (value < 0 || value > 255) {
8391 PyErr_SetString(PyExc_TypeError,
8392 "character mapping must be in range(256)");
8393 Py_DECREF(x);
8394 return NULL;
8395 }
8396 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008397 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008398 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008399 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008400 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008401 /* wrong return value */
8402 PyErr_Format(PyExc_TypeError,
8403 "character mapping must return integer, bytes or None, not %.400s",
8404 x->ob_type->tp_name);
8405 Py_DECREF(x);
8406 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008407 }
8408}
8409
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008410static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00008411charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008412{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008413 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
8414 /* exponentially overallocate to minimize reallocations */
8415 if (requiredsize < 2*outsize)
8416 requiredsize = 2*outsize;
8417 if (_PyBytes_Resize(outobj, requiredsize))
8418 return -1;
8419 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008420}
8421
/* Outcome of trying to encode one character through a character map. */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the encoded byte(s) were written to the output */
    enc_FAILED,     /* the character has no mapping; nothing was written */
    enc_EXCEPTION   /* the lookup or the output resize failed */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008425/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00008426 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008427 space is available. Return a new reference to the object that
8428 was put in the output buffer, or Py_None, if the mapping was undefined
8429 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00008430 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008431static charmapencode_result
Victor Stinner22168992011-11-20 17:09:18 +01008432charmapencode_output(Py_UCS4 c, PyObject *mapping,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008433 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008434{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008435 PyObject *rep;
8436 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00008437 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008438
Christian Heimes90aa7642007-12-19 02:45:37 +00008439 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008440 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008441 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008442 if (res == -1)
8443 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00008444 if (outsize<requiredsize)
8445 if (charmapencode_resize(outobj, outpos, requiredsize))
8446 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00008447 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008448 outstart[(*outpos)++] = (char)res;
8449 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008450 }
8451
8452 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008453 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008454 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008455 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008456 Py_DECREF(rep);
8457 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008458 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008459 if (PyLong_Check(rep)) {
8460 Py_ssize_t requiredsize = *outpos+1;
8461 if (outsize<requiredsize)
8462 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8463 Py_DECREF(rep);
8464 return enc_EXCEPTION;
8465 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008466 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008467 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008468 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008469 else {
8470 const char *repchars = PyBytes_AS_STRING(rep);
8471 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
8472 Py_ssize_t requiredsize = *outpos+repsize;
8473 if (outsize<requiredsize)
8474 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8475 Py_DECREF(rep);
8476 return enc_EXCEPTION;
8477 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008478 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008479 memcpy(outstart + *outpos, repchars, repsize);
8480 *outpos += repsize;
8481 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008482 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008483 Py_DECREF(rep);
8484 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008485}
8486
8487/* handle an error in PyUnicode_EncodeCharmap
8488 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008489static int
8490charmap_encoding_error(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008491 PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008492 PyObject **exceptionObject,
Victor Stinner50149202015-09-22 00:26:54 +02008493 _Py_error_handler *error_handler, PyObject **error_handler_obj, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00008494 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008495{
8496 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008497 Py_ssize_t size, repsize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008498 Py_ssize_t newpos;
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008499 enum PyUnicode_Kind kind;
8500 void *data;
8501 Py_ssize_t index;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008502 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008503 Py_ssize_t collstartpos = *inpos;
8504 Py_ssize_t collendpos = *inpos+1;
8505 Py_ssize_t collpos;
Serhiy Storchakae2f92de2017-11-11 13:06:26 +02008506 const char *encoding = "charmap";
8507 const char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008508 charmapencode_result x;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008509 Py_UCS4 ch;
Brian Curtin2787ea42011-11-02 15:09:37 -05008510 int val;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008511
Benjamin Petersonbac79492012-01-14 13:34:47 -05008512 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008513 return -1;
8514 size = PyUnicode_GET_LENGTH(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008515 /* find all unencodable characters */
8516 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008517 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00008518 if (Py_TYPE(mapping) == &EncodingMapType) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008519 ch = PyUnicode_READ_CHAR(unicode, collendpos);
Brian Curtin2787ea42011-11-02 15:09:37 -05008520 val = encoding_map_lookup(ch, mapping);
8521 if (val != -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008522 break;
8523 ++collendpos;
8524 continue;
8525 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008526
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008527 ch = PyUnicode_READ_CHAR(unicode, collendpos);
8528 rep = charmapencode_lookup(ch, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008529 if (rep==NULL)
8530 return -1;
8531 else if (rep!=Py_None) {
8532 Py_DECREF(rep);
8533 break;
8534 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008535 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00008536 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008537 }
8538 /* cache callback name lookup
8539 * (if not done yet, i.e. it's the first error) */
Victor Stinner50149202015-09-22 00:26:54 +02008540 if (*error_handler == _Py_ERROR_UNKNOWN)
Victor Stinner3d4226a2018-08-29 22:21:32 +02008541 *error_handler = _Py_GetErrorHandler(errors);
Victor Stinner50149202015-09-22 00:26:54 +02008542
8543 switch (*error_handler) {
8544 case _Py_ERROR_STRICT:
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008545 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008546 return -1;
Victor Stinner50149202015-09-22 00:26:54 +02008547
8548 case _Py_ERROR_REPLACE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008549 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008550 x = charmapencode_output('?', mapping, res, respos);
8551 if (x==enc_EXCEPTION) {
8552 return -1;
8553 }
8554 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008555 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008556 return -1;
8557 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008558 }
8559 /* fall through */
Victor Stinner50149202015-09-22 00:26:54 +02008560 case _Py_ERROR_IGNORE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008561 *inpos = collendpos;
8562 break;
Victor Stinner50149202015-09-22 00:26:54 +02008563
8564 case _Py_ERROR_XMLCHARREFREPLACE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008565 /* generate replacement (temporarily (mis)uses p) */
8566 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008567 char buffer[2+29+1+1];
8568 char *cp;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008569 sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
Benjamin Peterson29060642009-01-31 22:14:21 +00008570 for (cp = buffer; *cp; ++cp) {
8571 x = charmapencode_output(*cp, mapping, res, respos);
8572 if (x==enc_EXCEPTION)
8573 return -1;
8574 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008575 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008576 return -1;
8577 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008578 }
8579 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008580 *inpos = collendpos;
8581 break;
Victor Stinner50149202015-09-22 00:26:54 +02008582
Benjamin Peterson14339b62009-01-31 16:36:08 +00008583 default:
Victor Stinner50149202015-09-22 00:26:54 +02008584 repunicode = unicode_encode_call_errorhandler(errors, error_handler_obj,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008585 encoding, reason, unicode, exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00008586 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008587 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008588 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00008589 if (PyBytes_Check(repunicode)) {
8590 /* Directly copy bytes result to output. */
8591 Py_ssize_t outsize = PyBytes_Size(*res);
8592 Py_ssize_t requiredsize;
8593 repsize = PyBytes_Size(repunicode);
8594 requiredsize = *respos + repsize;
8595 if (requiredsize > outsize)
8596 /* Make room for all additional bytes. */
8597 if (charmapencode_resize(res, respos, requiredsize)) {
8598 Py_DECREF(repunicode);
8599 return -1;
8600 }
8601 memcpy(PyBytes_AsString(*res) + *respos,
8602 PyBytes_AsString(repunicode), repsize);
8603 *respos += repsize;
8604 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008605 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00008606 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008607 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008608 /* generate replacement */
Benjamin Petersonbac79492012-01-14 13:34:47 -05008609 if (PyUnicode_READY(repunicode) == -1) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008610 Py_DECREF(repunicode);
8611 return -1;
8612 }
Victor Stinner9e30aa52011-11-21 02:49:52 +01008613 repsize = PyUnicode_GET_LENGTH(repunicode);
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008614 data = PyUnicode_DATA(repunicode);
8615 kind = PyUnicode_KIND(repunicode);
8616 for (index = 0; index < repsize; index++) {
8617 Py_UCS4 repch = PyUnicode_READ(kind, data, index);
8618 x = charmapencode_output(repch, mapping, res, respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008619 if (x==enc_EXCEPTION) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008620 Py_DECREF(repunicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00008621 return -1;
8622 }
8623 else if (x==enc_FAILED) {
8624 Py_DECREF(repunicode);
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008625 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008626 return -1;
8627 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008628 }
8629 *inpos = newpos;
8630 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008631 }
8632 return 0;
8633}
8634
Alexander Belopolsky40018472011-02-26 01:02:56 +00008635PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008636_PyUnicode_EncodeCharmap(PyObject *unicode,
8637 PyObject *mapping,
8638 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008639{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008640 /* output object */
8641 PyObject *res = NULL;
8642 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008643 Py_ssize_t inpos = 0;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008644 Py_ssize_t size;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008645 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008646 Py_ssize_t respos = 0;
Victor Stinner50149202015-09-22 00:26:54 +02008647 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008648 PyObject *exc = NULL;
Victor Stinner50149202015-09-22 00:26:54 +02008649 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Victor Stinner69ed0f42013-04-09 21:48:24 +02008650 void *data;
8651 int kind;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008652
Benjamin Petersonbac79492012-01-14 13:34:47 -05008653 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008654 return NULL;
8655 size = PyUnicode_GET_LENGTH(unicode);
Victor Stinner69ed0f42013-04-09 21:48:24 +02008656 data = PyUnicode_DATA(unicode);
8657 kind = PyUnicode_KIND(unicode);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008658
Guido van Rossumd57fd912000-03-10 22:53:23 +00008659 /* Default to Latin-1 */
8660 if (mapping == NULL)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008661 return unicode_encode_ucs1(unicode, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008662
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008663 /* allocate enough for a simple encoding without
8664 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00008665 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008666 if (res == NULL)
8667 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00008668 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008669 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008670
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008671 while (inpos<size) {
Victor Stinner69ed0f42013-04-09 21:48:24 +02008672 Py_UCS4 ch = PyUnicode_READ(kind, data, inpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008673 /* try to encode it */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008674 charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008675 if (x==enc_EXCEPTION) /* error */
8676 goto onError;
8677 if (x==enc_FAILED) { /* unencodable character */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008678 if (charmap_encoding_error(unicode, &inpos, mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00008679 &exc,
Victor Stinner50149202015-09-22 00:26:54 +02008680 &error_handler, &error_handler_obj, errors,
Benjamin Peterson29060642009-01-31 22:14:21 +00008681 &res, &respos)) {
8682 goto onError;
8683 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008684 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008685 else
8686 /* done with this character => adjust input position */
8687 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008688 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008689
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008690 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00008691 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00008692 if (_PyBytes_Resize(&res, respos) < 0)
8693 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00008694
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008695 Py_XDECREF(exc);
Victor Stinner50149202015-09-22 00:26:54 +02008696 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008697 return res;
8698
Benjamin Peterson29060642009-01-31 22:14:21 +00008699 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008700 Py_XDECREF(res);
8701 Py_XDECREF(exc);
Victor Stinner50149202015-09-22 00:26:54 +02008702 Py_XDECREF(error_handler_obj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008703 return NULL;
8704}
8705
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008706/* Deprecated */
8707PyObject *
8708PyUnicode_EncodeCharmap(const Py_UNICODE *p,
8709 Py_ssize_t size,
8710 PyObject *mapping,
8711 const char *errors)
8712{
8713 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02008714 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008715 if (unicode == NULL)
8716 return NULL;
8717 result = _PyUnicode_EncodeCharmap(unicode, mapping, errors);
8718 Py_DECREF(unicode);
Victor Stinnerfc026c92011-11-04 00:24:51 +01008719 return result;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008720}
8721
Alexander Belopolsky40018472011-02-26 01:02:56 +00008722PyObject *
8723PyUnicode_AsCharmapString(PyObject *unicode,
8724 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008725{
8726 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008727 PyErr_BadArgument();
8728 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008729 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008730 return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008731}
8732
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008733/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008734static void
8735make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008736 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008737 Py_ssize_t startpos, Py_ssize_t endpos,
8738 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008739{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008740 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008741 *exceptionObject = _PyUnicodeTranslateError_Create(
8742 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008743 }
8744 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008745 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
8746 goto onError;
8747 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
8748 goto onError;
8749 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
8750 goto onError;
8751 return;
8752 onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02008753 Py_CLEAR(*exceptionObject);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008754 }
8755}
8756
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008757/* error handling callback helper:
8758 build arguments, call the callback and check the arguments,
8759 put the result into newpos and return the replacement string, which
8760 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008761static PyObject *
8762unicode_translate_call_errorhandler(const char *errors,
8763 PyObject **errorHandler,
8764 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008765 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008766 Py_ssize_t startpos, Py_ssize_t endpos,
8767 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008768{
Serhiy Storchakaf8d7d412016-10-23 15:12:25 +03008769 static const char *argparse = "Un;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008770
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008771 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008772 PyObject *restuple;
8773 PyObject *resunicode;
8774
8775 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008776 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008777 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008778 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008779 }
8780
8781 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008782 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008783 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008784 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008785
Jeroen Demeyer196a5302019-07-04 12:31:34 +02008786 restuple = _PyObject_CallOneArg(*errorHandler, *exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008787 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008788 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008789 if (!PyTuple_Check(restuple)) {
Serhiy Storchakaf8d7d412016-10-23 15:12:25 +03008790 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00008791 Py_DECREF(restuple);
8792 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008793 }
Serhiy Storchakaf8d7d412016-10-23 15:12:25 +03008794 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00008795 &resunicode, &i_newpos)) {
8796 Py_DECREF(restuple);
8797 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008798 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00008799 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008800 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008801 else
8802 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008803 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnera33bce02014-07-04 22:47:46 +02008804 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008805 Py_DECREF(restuple);
8806 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00008807 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008808 Py_INCREF(resunicode);
8809 Py_DECREF(restuple);
8810 return resunicode;
8811}
8812
8813/* Lookup the character ch in the mapping and put the result in result,
8814 which must be decrefed by the caller.
8815 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008816static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008817charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008818{
Christian Heimes217cfd12007-12-02 14:31:20 +00008819 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008820 PyObject *x;
8821
8822 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008823 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008824 x = PyObject_GetItem(mapping, w);
8825 Py_DECREF(w);
8826 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008827 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8828 /* No mapping found means: use 1:1 mapping. */
8829 PyErr_Clear();
8830 *result = NULL;
8831 return 0;
8832 } else
8833 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008834 }
8835 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008836 *result = x;
8837 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008838 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008839 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008840 long value = PyLong_AS_LONG(x);
Victor Stinner4ff33af2014-04-05 11:56:37 +02008841 if (value < 0 || value > MAX_UNICODE) {
8842 PyErr_Format(PyExc_ValueError,
8843 "character mapping must be in range(0x%x)",
8844 MAX_UNICODE+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00008845 Py_DECREF(x);
8846 return -1;
8847 }
8848 *result = x;
8849 return 0;
8850 }
8851 else if (PyUnicode_Check(x)) {
8852 *result = x;
8853 return 0;
8854 }
8855 else {
8856 /* wrong return value */
8857 PyErr_SetString(PyExc_TypeError,
8858 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00008859 Py_DECREF(x);
8860 return -1;
8861 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008862}
Victor Stinner1194ea02014-04-04 19:37:40 +02008863
8864/* lookup the character, write the result into the writer.
8865 Return 1 if the result was written into the writer, return 0 if the mapping
8866 was undefined, raise an exception return -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008867static int
Victor Stinner1194ea02014-04-04 19:37:40 +02008868charmaptranslate_output(Py_UCS4 ch, PyObject *mapping,
8869 _PyUnicodeWriter *writer)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008870{
Victor Stinner1194ea02014-04-04 19:37:40 +02008871 PyObject *item;
8872
8873 if (charmaptranslate_lookup(ch, mapping, &item))
Benjamin Peterson29060642009-01-31 22:14:21 +00008874 return -1;
Victor Stinner1194ea02014-04-04 19:37:40 +02008875
8876 if (item == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008877 /* not found => default to 1:1 mapping */
Victor Stinner1194ea02014-04-04 19:37:40 +02008878 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008879 return -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00008880 }
Victor Stinner1194ea02014-04-04 19:37:40 +02008881 return 1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008882 }
Victor Stinner1194ea02014-04-04 19:37:40 +02008883
8884 if (item == Py_None) {
8885 Py_DECREF(item);
8886 return 0;
8887 }
8888
8889 if (PyLong_Check(item)) {
Victor Stinner4ff33af2014-04-05 11:56:37 +02008890 long ch = (Py_UCS4)PyLong_AS_LONG(item);
8891 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
8892 used it */
Victor Stinner1194ea02014-04-04 19:37:40 +02008893 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
8894 Py_DECREF(item);
8895 return -1;
8896 }
8897 Py_DECREF(item);
8898 return 1;
8899 }
8900
8901 if (!PyUnicode_Check(item)) {
8902 Py_DECREF(item);
Benjamin Peterson29060642009-01-31 22:14:21 +00008903 return -1;
Victor Stinner1194ea02014-04-04 19:37:40 +02008904 }
8905
8906 if (_PyUnicodeWriter_WriteStr(writer, item) < 0) {
8907 Py_DECREF(item);
8908 return -1;
8909 }
8910
8911 Py_DECREF(item);
8912 return 1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008913}
8914
Victor Stinner89a76ab2014-04-05 11:44:04 +02008915static int
8916unicode_fast_translate_lookup(PyObject *mapping, Py_UCS1 ch,
8917 Py_UCS1 *translate)
8918{
Benjamin Peterson1365de72014-04-07 20:15:41 -04008919 PyObject *item = NULL;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008920 int ret = 0;
8921
Victor Stinner89a76ab2014-04-05 11:44:04 +02008922 if (charmaptranslate_lookup(ch, mapping, &item)) {
8923 return -1;
8924 }
8925
8926 if (item == Py_None) {
Benjamin Peterson1365de72014-04-07 20:15:41 -04008927 /* deletion */
Victor Stinner872b2912014-04-05 14:27:07 +02008928 translate[ch] = 0xfe;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008929 }
Benjamin Peterson1365de72014-04-07 20:15:41 -04008930 else if (item == NULL) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02008931 /* not found => default to 1:1 mapping */
8932 translate[ch] = ch;
8933 return 1;
8934 }
Benjamin Peterson1365de72014-04-07 20:15:41 -04008935 else if (PyLong_Check(item)) {
Victor Stinner4dd25252014-04-08 09:14:21 +02008936 long replace = PyLong_AS_LONG(item);
Victor Stinner4ff33af2014-04-05 11:56:37 +02008937 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
8938 used it */
8939 if (127 < replace) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02008940 /* invalid character or character outside ASCII:
8941 skip the fast translate */
8942 goto exit;
8943 }
8944 translate[ch] = (Py_UCS1)replace;
8945 }
8946 else if (PyUnicode_Check(item)) {
8947 Py_UCS4 replace;
8948
8949 if (PyUnicode_READY(item) == -1) {
8950 Py_DECREF(item);
8951 return -1;
8952 }
8953 if (PyUnicode_GET_LENGTH(item) != 1)
8954 goto exit;
8955
8956 replace = PyUnicode_READ_CHAR(item, 0);
8957 if (replace > 127)
8958 goto exit;
8959 translate[ch] = (Py_UCS1)replace;
8960 }
8961 else {
Benjamin Peterson1365de72014-04-07 20:15:41 -04008962 /* not None, NULL, long or unicode */
Victor Stinner89a76ab2014-04-05 11:44:04 +02008963 goto exit;
8964 }
Victor Stinner89a76ab2014-04-05 11:44:04 +02008965 ret = 1;
8966
Benjamin Peterson1365de72014-04-07 20:15:41 -04008967 exit:
8968 Py_DECREF(item);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008969 return ret;
8970}
8971
8972/* Fast path for ascii => ascii translation. Return 1 if the whole string
8973 was translated into writer, return 0 if the input string was partially
8974 translated into writer, raise an exception and return -1 on error. */
8975static int
8976unicode_fast_translate(PyObject *input, PyObject *mapping,
Victor Stinner6c9aa8f2016-03-01 21:30:30 +01008977 _PyUnicodeWriter *writer, int ignore,
8978 Py_ssize_t *input_pos)
Victor Stinner89a76ab2014-04-05 11:44:04 +02008979{
Victor Stinner872b2912014-04-05 14:27:07 +02008980 Py_UCS1 ascii_table[128], ch, ch2;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008981 Py_ssize_t len;
8982 Py_UCS1 *in, *end, *out;
Victor Stinner872b2912014-04-05 14:27:07 +02008983 int res = 0;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008984
Victor Stinner89a76ab2014-04-05 11:44:04 +02008985 len = PyUnicode_GET_LENGTH(input);
8986
Victor Stinner872b2912014-04-05 14:27:07 +02008987 memset(ascii_table, 0xff, 128);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008988
8989 in = PyUnicode_1BYTE_DATA(input);
8990 end = in + len;
8991
8992 assert(PyUnicode_IS_ASCII(writer->buffer));
8993 assert(PyUnicode_GET_LENGTH(writer->buffer) == len);
8994 out = PyUnicode_1BYTE_DATA(writer->buffer);
8995
Victor Stinner872b2912014-04-05 14:27:07 +02008996 for (; in < end; in++) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02008997 ch = *in;
Victor Stinner872b2912014-04-05 14:27:07 +02008998 ch2 = ascii_table[ch];
Victor Stinner89a76ab2014-04-05 11:44:04 +02008999 if (ch2 == 0xff) {
Victor Stinner872b2912014-04-05 14:27:07 +02009000 int translate = unicode_fast_translate_lookup(mapping, ch,
9001 ascii_table);
9002 if (translate < 0)
Victor Stinner89a76ab2014-04-05 11:44:04 +02009003 return -1;
Victor Stinner872b2912014-04-05 14:27:07 +02009004 if (translate == 0)
9005 goto exit;
9006 ch2 = ascii_table[ch];
Victor Stinner89a76ab2014-04-05 11:44:04 +02009007 }
Victor Stinner872b2912014-04-05 14:27:07 +02009008 if (ch2 == 0xfe) {
9009 if (ignore)
9010 continue;
9011 goto exit;
9012 }
9013 assert(ch2 < 128);
Victor Stinner89a76ab2014-04-05 11:44:04 +02009014 *out = ch2;
Victor Stinner872b2912014-04-05 14:27:07 +02009015 out++;
Victor Stinner89a76ab2014-04-05 11:44:04 +02009016 }
Victor Stinner872b2912014-04-05 14:27:07 +02009017 res = 1;
9018
9019exit:
9020 writer->pos = out - PyUnicode_1BYTE_DATA(writer->buffer);
Victor Stinner6c9aa8f2016-03-01 21:30:30 +01009021 *input_pos = in - PyUnicode_1BYTE_DATA(input);
Victor Stinner872b2912014-04-05 14:27:07 +02009022 return res;
Victor Stinner89a76ab2014-04-05 11:44:04 +02009023}
9024
Victor Stinner3222da22015-10-01 22:07:32 +02009025static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009026_PyUnicode_TranslateCharmap(PyObject *input,
9027 PyObject *mapping,
9028 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009029{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009030 /* input object */
Victor Stinner1194ea02014-04-04 19:37:40 +02009031 char *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009032 Py_ssize_t size, i;
9033 int kind;
9034 /* output buffer */
Victor Stinner1194ea02014-04-04 19:37:40 +02009035 _PyUnicodeWriter writer;
9036 /* error handler */
Serhiy Storchakae2f92de2017-11-11 13:06:26 +02009037 const char *reason = "character maps to <undefined>";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009038 PyObject *errorHandler = NULL;
9039 PyObject *exc = NULL;
Victor Stinner1194ea02014-04-04 19:37:40 +02009040 int ignore;
Victor Stinner89a76ab2014-04-05 11:44:04 +02009041 int res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009042
Guido van Rossumd57fd912000-03-10 22:53:23 +00009043 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009044 PyErr_BadArgument();
9045 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009046 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009047
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009048 if (PyUnicode_READY(input) == -1)
9049 return NULL;
Victor Stinner1194ea02014-04-04 19:37:40 +02009050 data = (char*)PyUnicode_DATA(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009051 kind = PyUnicode_KIND(input);
9052 size = PyUnicode_GET_LENGTH(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009053
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009054 if (size == 0)
9055 return PyUnicode_FromObject(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009056
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009057 /* allocate enough for a simple 1:1 translation without
9058 replacements, if we need more, we'll resize */
Victor Stinner1194ea02014-04-04 19:37:40 +02009059 _PyUnicodeWriter_Init(&writer);
9060 if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00009061 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009062
Victor Stinner872b2912014-04-05 14:27:07 +02009063 ignore = (errors != NULL && strcmp(errors, "ignore") == 0);
9064
Victor Stinner33798672016-03-01 21:59:58 +01009065 if (PyUnicode_READY(input) == -1)
Victor Stinner89a76ab2014-04-05 11:44:04 +02009066 return NULL;
Victor Stinner33798672016-03-01 21:59:58 +01009067 if (PyUnicode_IS_ASCII(input)) {
9068 res = unicode_fast_translate(input, mapping, &writer, ignore, &i);
9069 if (res < 0) {
9070 _PyUnicodeWriter_Dealloc(&writer);
9071 return NULL;
9072 }
9073 if (res == 1)
9074 return _PyUnicodeWriter_Finish(&writer);
Victor Stinner89a76ab2014-04-05 11:44:04 +02009075 }
Victor Stinner33798672016-03-01 21:59:58 +01009076 else {
9077 i = 0;
9078 }
Victor Stinner89a76ab2014-04-05 11:44:04 +02009079
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009080 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009081 /* try to encode it */
Victor Stinner1194ea02014-04-04 19:37:40 +02009082 int translate;
9083 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
9084 Py_ssize_t newpos;
9085 /* startpos for collecting untranslatable chars */
9086 Py_ssize_t collstart;
9087 Py_ssize_t collend;
Victor Stinner1194ea02014-04-04 19:37:40 +02009088 Py_UCS4 ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009089
Victor Stinner1194ea02014-04-04 19:37:40 +02009090 ch = PyUnicode_READ(kind, data, i);
9091 translate = charmaptranslate_output(ch, mapping, &writer);
9092 if (translate < 0)
9093 goto onError;
9094
9095 if (translate != 0) {
9096 /* it worked => adjust input pointer */
9097 ++i;
9098 continue;
9099 }
9100
9101 /* untranslatable character */
9102 collstart = i;
9103 collend = i+1;
9104
9105 /* find all untranslatable characters */
9106 while (collend < size) {
9107 PyObject *x;
9108 ch = PyUnicode_READ(kind, data, collend);
9109 if (charmaptranslate_lookup(ch, mapping, &x))
Benjamin Peterson14339b62009-01-31 16:36:08 +00009110 goto onError;
Victor Stinner1194ea02014-04-04 19:37:40 +02009111 Py_XDECREF(x);
9112 if (x != Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00009113 break;
Victor Stinner1194ea02014-04-04 19:37:40 +02009114 ++collend;
9115 }
9116
9117 if (ignore) {
9118 i = collend;
9119 }
9120 else {
9121 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
9122 reason, input, &exc,
9123 collstart, collend, &newpos);
9124 if (repunicode == NULL)
9125 goto onError;
9126 if (_PyUnicodeWriter_WriteStr(&writer, repunicode) < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009127 Py_DECREF(repunicode);
Victor Stinner1194ea02014-04-04 19:37:40 +02009128 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009129 }
Victor Stinner1194ea02014-04-04 19:37:40 +02009130 Py_DECREF(repunicode);
9131 i = newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009132 }
9133 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009134 Py_XDECREF(exc);
9135 Py_XDECREF(errorHandler);
Victor Stinner1194ea02014-04-04 19:37:40 +02009136 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009137
Benjamin Peterson29060642009-01-31 22:14:21 +00009138 onError:
Victor Stinner1194ea02014-04-04 19:37:40 +02009139 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009140 Py_XDECREF(exc);
9141 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009142 return NULL;
9143}
9144
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009145/* Deprecated. Use PyUnicode_Translate instead. */
9146PyObject *
9147PyUnicode_TranslateCharmap(const Py_UNICODE *p,
9148 Py_ssize_t size,
9149 PyObject *mapping,
9150 const char *errors)
9151{
Christian Heimes5f520f42012-09-11 14:03:25 +02009152 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02009153 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009154 if (!unicode)
9155 return NULL;
Christian Heimes5f520f42012-09-11 14:03:25 +02009156 result = _PyUnicode_TranslateCharmap(unicode, mapping, errors);
9157 Py_DECREF(unicode);
9158 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009159}
9160
Alexander Belopolsky40018472011-02-26 01:02:56 +00009161PyObject *
9162PyUnicode_Translate(PyObject *str,
9163 PyObject *mapping,
9164 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009165{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009166 if (ensure_unicode(str) < 0)
Christian Heimes5f520f42012-09-11 14:03:25 +02009167 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009168 return _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009169}
Tim Petersced69f82003-09-16 20:30:58 +00009170
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009171PyObject *
9172_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
9173{
9174 if (!PyUnicode_Check(unicode)) {
9175 PyErr_BadInternalCall();
9176 return NULL;
9177 }
9178 if (PyUnicode_READY(unicode) == -1)
9179 return NULL;
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009180 if (PyUnicode_IS_ASCII(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009181 /* If the string is already ASCII, just return the same string */
9182 Py_INCREF(unicode);
9183 return unicode;
9184 }
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009185
9186 Py_ssize_t len = PyUnicode_GET_LENGTH(unicode);
9187 PyObject *result = PyUnicode_New(len, 127);
9188 if (result == NULL) {
9189 return NULL;
9190 }
9191
9192 Py_UCS1 *out = PyUnicode_1BYTE_DATA(result);
9193 int kind = PyUnicode_KIND(unicode);
9194 const void *data = PyUnicode_DATA(unicode);
9195 Py_ssize_t i;
9196 for (i = 0; i < len; ++i) {
9197 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9198 if (ch < 127) {
9199 out[i] = ch;
9200 }
9201 else if (Py_UNICODE_ISSPACE(ch)) {
9202 out[i] = ' ';
9203 }
9204 else {
9205 int decimal = Py_UNICODE_TODECIMAL(ch);
9206 if (decimal < 0) {
9207 out[i] = '?';
INADA Naoki16dfca42018-07-14 12:06:43 +09009208 out[i+1] = '\0';
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009209 _PyUnicode_LENGTH(result) = i + 1;
9210 break;
9211 }
9212 out[i] = '0' + decimal;
9213 }
9214 }
9215
INADA Naoki16dfca42018-07-14 12:06:43 +09009216 assert(_PyUnicode_CheckConsistency(result, 1));
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009217 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009218}
9219
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009220PyObject *
9221PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
9222 Py_ssize_t length)
9223{
Victor Stinnerf0124502011-11-21 23:12:56 +01009224 PyObject *decimal;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009225 Py_ssize_t i;
Victor Stinnerf0124502011-11-21 23:12:56 +01009226 Py_UCS4 maxchar;
9227 enum PyUnicode_Kind kind;
9228 void *data;
9229
Victor Stinner99d7ad02012-02-22 13:37:39 +01009230 maxchar = 127;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009231 for (i = 0; i < length; i++) {
Victor Stinner12174a52014-08-15 23:17:38 +02009232 Py_UCS4 ch = s[i];
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009233 if (ch > 127) {
9234 int decimal = Py_UNICODE_TODECIMAL(ch);
9235 if (decimal >= 0)
Victor Stinnerf0124502011-11-21 23:12:56 +01009236 ch = '0' + decimal;
Benjamin Peterson7e303732013-06-10 09:19:46 -07009237 maxchar = Py_MAX(maxchar, ch);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009238 }
9239 }
Victor Stinnerf0124502011-11-21 23:12:56 +01009240
9241 /* Copy to a new string */
9242 decimal = PyUnicode_New(length, maxchar);
9243 if (decimal == NULL)
9244 return decimal;
9245 kind = PyUnicode_KIND(decimal);
9246 data = PyUnicode_DATA(decimal);
9247 /* Iterate over code points */
9248 for (i = 0; i < length; i++) {
Victor Stinner12174a52014-08-15 23:17:38 +02009249 Py_UCS4 ch = s[i];
Victor Stinnerf0124502011-11-21 23:12:56 +01009250 if (ch > 127) {
9251 int decimal = Py_UNICODE_TODECIMAL(ch);
9252 if (decimal >= 0)
9253 ch = '0' + decimal;
9254 }
9255 PyUnicode_WRITE(kind, data, i, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009256 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01009257 return unicode_result(decimal);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009258}
Guido van Rossum9e896b32000-04-05 20:11:21 +00009259/* --- Decimal Encoder ---------------------------------------------------- */
9260
Alexander Belopolsky40018472011-02-26 01:02:56 +00009261int
9262PyUnicode_EncodeDecimal(Py_UNICODE *s,
9263 Py_ssize_t length,
9264 char *output,
9265 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00009266{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01009267 PyObject *unicode;
Victor Stinner6345be92011-11-25 20:09:01 +01009268 Py_ssize_t i;
Victor Stinner42bf7752011-11-21 22:52:58 +01009269 enum PyUnicode_Kind kind;
9270 void *data;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009271
9272 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009273 PyErr_BadArgument();
9274 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009275 }
9276
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02009277 unicode = PyUnicode_FromWideChar(s, length);
Victor Stinner42bf7752011-11-21 22:52:58 +01009278 if (unicode == NULL)
9279 return -1;
9280
Victor Stinner42bf7752011-11-21 22:52:58 +01009281 kind = PyUnicode_KIND(unicode);
9282 data = PyUnicode_DATA(unicode);
9283
Victor Stinnerb84d7232011-11-22 01:50:07 +01009284 for (i=0; i < length; ) {
Victor Stinner6345be92011-11-25 20:09:01 +01009285 PyObject *exc;
9286 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00009287 int decimal;
Victor Stinner6345be92011-11-25 20:09:01 +01009288 Py_ssize_t startpos;
9289
9290 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +00009291
Benjamin Peterson29060642009-01-31 22:14:21 +00009292 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009293 *output++ = ' ';
Victor Stinnerb84d7232011-11-22 01:50:07 +01009294 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009295 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009296 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009297 decimal = Py_UNICODE_TODECIMAL(ch);
9298 if (decimal >= 0) {
9299 *output++ = '0' + decimal;
Victor Stinnerb84d7232011-11-22 01:50:07 +01009300 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009301 continue;
9302 }
9303 if (0 < ch && ch < 256) {
9304 *output++ = (char)ch;
Victor Stinnerb84d7232011-11-22 01:50:07 +01009305 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009306 continue;
9307 }
Victor Stinner6345be92011-11-25 20:09:01 +01009308
Victor Stinner42bf7752011-11-21 22:52:58 +01009309 startpos = i;
Victor Stinner6345be92011-11-25 20:09:01 +01009310 exc = NULL;
9311 raise_encode_exception(&exc, "decimal", unicode,
9312 startpos, startpos+1,
9313 "invalid decimal Unicode string");
9314 Py_XDECREF(exc);
9315 Py_DECREF(unicode);
9316 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009317 }
9318 /* 0-terminate the output string */
9319 *output++ = '\0';
Victor Stinner42bf7752011-11-21 22:52:58 +01009320 Py_DECREF(unicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00009321 return 0;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009322}
9323
Guido van Rossumd57fd912000-03-10 22:53:23 +00009324/* --- Helpers ------------------------------------------------------------ */
9325
/* helper macro to fixup start/end slice values

   Clamps `end` to `len`, and resolves negative `start`/`end` relative to
   `len` (a value that is still negative after adding `len` becomes 0).
   `start` and `end` must be modifiable lvalues; `len` is evaluated more
   than once, so it must not have side effects.

   The body is wrapped in do { ... } while (0) and every argument is
   parenthesized so that the macro behaves as a single statement (safe
   under an unbraced if/else) and accepts arbitrary expressions for `len`.
   Invoke it followed by a semicolon. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len)) {                    \
            (end) = (len);                      \
        }                                       \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0) {                    \
                (end) = 0;                      \
            }                                   \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0) {                  \
                (start) = 0;                    \
            }                                   \
        }                                       \
    } while (0)
9340
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009341static Py_ssize_t
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009342any_find_slice(PyObject* s1, PyObject* s2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009343 Py_ssize_t start,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009344 Py_ssize_t end,
9345 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009346{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009347 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009348 void *buf1, *buf2;
9349 Py_ssize_t len1, len2, result;
9350
9351 kind1 = PyUnicode_KIND(s1);
9352 kind2 = PyUnicode_KIND(s2);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009353 if (kind1 < kind2)
9354 return -1;
9355
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009356 len1 = PyUnicode_GET_LENGTH(s1);
9357 len2 = PyUnicode_GET_LENGTH(s2);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009358 ADJUST_INDICES(start, end, len1);
9359 if (end - start < len2)
9360 return -1;
9361
9362 buf1 = PyUnicode_DATA(s1);
9363 buf2 = PyUnicode_DATA(s2);
9364 if (len2 == 1) {
9365 Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
9366 result = findchar((const char *)buf1 + kind1*start,
9367 kind1, end - start, ch, direction);
9368 if (result == -1)
9369 return -1;
9370 else
9371 return start + result;
9372 }
9373
9374 if (kind2 != kind1) {
9375 buf2 = _PyUnicode_AsKind(s2, kind1);
9376 if (!buf2)
9377 return -2;
9378 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009379
Victor Stinner794d5672011-10-10 03:21:36 +02009380 if (direction > 0) {
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009381 switch (kind1) {
Victor Stinner794d5672011-10-10 03:21:36 +02009382 case PyUnicode_1BYTE_KIND:
9383 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9384 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
9385 else
9386 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
9387 break;
9388 case PyUnicode_2BYTE_KIND:
9389 result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
9390 break;
9391 case PyUnicode_4BYTE_KIND:
9392 result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
9393 break;
9394 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07009395 Py_UNREACHABLE();
Victor Stinner794d5672011-10-10 03:21:36 +02009396 }
9397 }
9398 else {
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009399 switch (kind1) {
Victor Stinner794d5672011-10-10 03:21:36 +02009400 case PyUnicode_1BYTE_KIND:
9401 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9402 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
9403 else
9404 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9405 break;
9406 case PyUnicode_2BYTE_KIND:
9407 result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9408 break;
9409 case PyUnicode_4BYTE_KIND:
9410 result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9411 break;
9412 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07009413 Py_UNREACHABLE();
Victor Stinner794d5672011-10-10 03:21:36 +02009414 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009415 }
9416
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009417 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009418 PyMem_Free(buf2);
9419
9420 return result;
9421}
9422
Victor Stinner59423e32018-11-26 13:40:01 +01009423/* _PyUnicode_InsertThousandsGrouping() helper functions */
9424#include "stringlib/localeutil.h"
9425
9426/**
9427 * InsertThousandsGrouping:
9428 * @writer: Unicode writer.
9429 * @n_buffer: Number of characters in @buffer.
9430 * @digits: Digits we're reading from. If count is non-NULL, this is unused.
9431 * @d_pos: Start of digits string.
9432 * @n_digits: The number of digits in the string, in which we want
9433 * to put the grouping chars.
9434 * @min_width: The minimum width of the digits in the output string.
9435 * Output will be zero-padded on the left to fill.
9436 * @grouping: see definition in localeconv().
9437 * @thousands_sep: see definition in localeconv().
9438 *
9439 * There are 2 modes: counting and filling. If @writer is NULL,
9440 * we are in counting mode, else filling mode.
9441 * If counting, the required buffer size is returned.
9442 * If filling, we know the buffer will be large enough, so we don't
9443 * need to pass in the buffer size.
9444 * Inserts thousand grouping characters (as defined by grouping and
9445 * thousands_sep) into @writer.
9446 *
9447 * Return value: -1 on error, number of characters otherwise.
9448 **/
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009449Py_ssize_t
Victor Stinner41a863c2012-02-24 00:37:51 +01009450_PyUnicode_InsertThousandsGrouping(
Victor Stinner59423e32018-11-26 13:40:01 +01009451 _PyUnicodeWriter *writer,
Victor Stinner41a863c2012-02-24 00:37:51 +01009452 Py_ssize_t n_buffer,
Victor Stinner59423e32018-11-26 13:40:01 +01009453 PyObject *digits,
9454 Py_ssize_t d_pos,
9455 Py_ssize_t n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009456 Py_ssize_t min_width,
Victor Stinner59423e32018-11-26 13:40:01 +01009457 const char *grouping,
9458 PyObject *thousands_sep,
Victor Stinner41a863c2012-02-24 00:37:51 +01009459 Py_UCS4 *maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009460{
Xtreak3f7983a2019-01-07 20:39:14 +05309461 min_width = Py_MAX(0, min_width);
Victor Stinner59423e32018-11-26 13:40:01 +01009462 if (writer) {
9463 assert(digits != NULL);
9464 assert(maxchar == NULL);
Victor Stinner41a863c2012-02-24 00:37:51 +01009465 }
9466 else {
Victor Stinner59423e32018-11-26 13:40:01 +01009467 assert(digits == NULL);
9468 assert(maxchar != NULL);
Victor Stinner41a863c2012-02-24 00:37:51 +01009469 }
Victor Stinner59423e32018-11-26 13:40:01 +01009470 assert(0 <= d_pos);
9471 assert(0 <= n_digits);
Victor Stinner59423e32018-11-26 13:40:01 +01009472 assert(grouping != NULL);
9473
9474 if (digits != NULL) {
9475 if (PyUnicode_READY(digits) == -1) {
9476 return -1;
Victor Stinner90f50d42012-02-24 01:44:47 +01009477 }
Victor Stinner59423e32018-11-26 13:40:01 +01009478 }
9479 if (PyUnicode_READY(thousands_sep) == -1) {
9480 return -1;
Victor Stinner41a863c2012-02-24 00:37:51 +01009481 }
9482
Victor Stinner59423e32018-11-26 13:40:01 +01009483 Py_ssize_t count = 0;
9484 Py_ssize_t n_zeros;
9485 int loop_broken = 0;
9486 int use_separator = 0; /* First time through, don't append the
9487 separator. They only go between
9488 groups. */
9489 Py_ssize_t buffer_pos;
9490 Py_ssize_t digits_pos;
9491 Py_ssize_t len;
9492 Py_ssize_t n_chars;
9493 Py_ssize_t remaining = n_digits; /* Number of chars remaining to
9494 be looked at */
9495 /* A generator that returns all of the grouping widths, until it
9496 returns 0. */
9497 GroupGenerator groupgen;
9498 GroupGenerator_init(&groupgen, grouping);
9499 const Py_ssize_t thousands_sep_len = PyUnicode_GET_LENGTH(thousands_sep);
9500
9501 /* if digits are not grouped, thousands separator
9502 should be an empty string */
9503 assert(!(grouping[0] == CHAR_MAX && thousands_sep_len != 0));
9504
9505 digits_pos = d_pos + n_digits;
9506 if (writer) {
9507 buffer_pos = writer->pos + n_buffer;
9508 assert(buffer_pos <= PyUnicode_GET_LENGTH(writer->buffer));
9509 assert(digits_pos <= PyUnicode_GET_LENGTH(digits));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009510 }
Victor Stinner59423e32018-11-26 13:40:01 +01009511 else {
9512 buffer_pos = n_buffer;
Victor Stinner90f50d42012-02-24 01:44:47 +01009513 }
Victor Stinner59423e32018-11-26 13:40:01 +01009514
9515 if (!writer) {
Victor Stinner41a863c2012-02-24 00:37:51 +01009516 *maxchar = 127;
Victor Stinner41a863c2012-02-24 00:37:51 +01009517 }
Victor Stinner59423e32018-11-26 13:40:01 +01009518
9519 while ((len = GroupGenerator_next(&groupgen)) > 0) {
9520 len = Py_MIN(len, Py_MAX(Py_MAX(remaining, min_width), 1));
9521 n_zeros = Py_MAX(0, len - remaining);
9522 n_chars = Py_MAX(0, Py_MIN(remaining, len));
9523
9524 /* Use n_zero zero's and n_chars chars */
9525
9526 /* Count only, don't do anything. */
9527 count += (use_separator ? thousands_sep_len : 0) + n_zeros + n_chars;
9528
9529 /* Copy into the writer. */
9530 InsertThousandsGrouping_fill(writer, &buffer_pos,
9531 digits, &digits_pos,
9532 n_chars, n_zeros,
9533 use_separator ? thousands_sep : NULL,
9534 thousands_sep_len, maxchar);
9535
9536 /* Use a separator next time. */
9537 use_separator = 1;
9538
9539 remaining -= n_chars;
9540 min_width -= len;
9541
9542 if (remaining <= 0 && min_width <= 0) {
9543 loop_broken = 1;
9544 break;
9545 }
9546 min_width -= thousands_sep_len;
9547 }
9548 if (!loop_broken) {
9549 /* We left the loop without using a break statement. */
9550
9551 len = Py_MAX(Py_MAX(remaining, min_width), 1);
9552 n_zeros = Py_MAX(0, len - remaining);
9553 n_chars = Py_MAX(0, Py_MIN(remaining, len));
9554
9555 /* Use n_zero zero's and n_chars chars */
9556 count += (use_separator ? thousands_sep_len : 0) + n_zeros + n_chars;
9557
9558 /* Copy into the writer. */
9559 InsertThousandsGrouping_fill(writer, &buffer_pos,
9560 digits, &digits_pos,
9561 n_chars, n_zeros,
9562 use_separator ? thousands_sep : NULL,
9563 thousands_sep_len, maxchar);
9564 }
9565 return count;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009566}
9567
9568
Alexander Belopolsky40018472011-02-26 01:02:56 +00009569Py_ssize_t
9570PyUnicode_Count(PyObject *str,
9571 PyObject *substr,
9572 Py_ssize_t start,
9573 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009574{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009575 Py_ssize_t result;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009576 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009577 void *buf1 = NULL, *buf2 = NULL;
9578 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00009579
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009580 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009581 return -1;
Tim Petersced69f82003-09-16 20:30:58 +00009582
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009583 kind1 = PyUnicode_KIND(str);
9584 kind2 = PyUnicode_KIND(substr);
9585 if (kind1 < kind2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009586 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009587
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009588 len1 = PyUnicode_GET_LENGTH(str);
9589 len2 = PyUnicode_GET_LENGTH(substr);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009590 ADJUST_INDICES(start, end, len1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009591 if (end - start < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009592 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009593
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009594 buf1 = PyUnicode_DATA(str);
9595 buf2 = PyUnicode_DATA(substr);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009596 if (kind2 != kind1) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009597 buf2 = _PyUnicode_AsKind(substr, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009598 if (!buf2)
9599 goto onError;
9600 }
9601
9602 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009603 case PyUnicode_1BYTE_KIND:
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009604 if (PyUnicode_IS_ASCII(str) && PyUnicode_IS_ASCII(substr))
Victor Stinnerc3cec782011-10-05 21:24:08 +02009605 result = asciilib_count(
9606 ((Py_UCS1*)buf1) + start, end - start,
9607 buf2, len2, PY_SSIZE_T_MAX
9608 );
9609 else
9610 result = ucs1lib_count(
9611 ((Py_UCS1*)buf1) + start, end - start,
9612 buf2, len2, PY_SSIZE_T_MAX
9613 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009614 break;
9615 case PyUnicode_2BYTE_KIND:
9616 result = ucs2lib_count(
9617 ((Py_UCS2*)buf1) + start, end - start,
9618 buf2, len2, PY_SSIZE_T_MAX
9619 );
9620 break;
9621 case PyUnicode_4BYTE_KIND:
9622 result = ucs4lib_count(
9623 ((Py_UCS4*)buf1) + start, end - start,
9624 buf2, len2, PY_SSIZE_T_MAX
9625 );
9626 break;
9627 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07009628 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009629 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009630
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009631 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009632 PyMem_Free(buf2);
9633
Guido van Rossumd57fd912000-03-10 22:53:23 +00009634 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009635 onError:
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009636 if (kind2 != kind1 && buf2)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009637 PyMem_Free(buf2);
9638 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009639}
9640
Alexander Belopolsky40018472011-02-26 01:02:56 +00009641Py_ssize_t
9642PyUnicode_Find(PyObject *str,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009643 PyObject *substr,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009644 Py_ssize_t start,
9645 Py_ssize_t end,
9646 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009647{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009648 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009649 return -2;
Tim Petersced69f82003-09-16 20:30:58 +00009650
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009651 return any_find_slice(str, substr, start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009652}
9653
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009654Py_ssize_t
9655PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
9656 Py_ssize_t start, Py_ssize_t end,
9657 int direction)
9658{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009659 int kind;
Xiang Zhangb2110682016-12-20 22:52:33 +08009660 Py_ssize_t len, result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009661 if (PyUnicode_READY(str) == -1)
9662 return -2;
Xiang Zhangb2110682016-12-20 22:52:33 +08009663 len = PyUnicode_GET_LENGTH(str);
9664 ADJUST_INDICES(start, end, len);
9665 if (end - start < 1)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009666 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009667 kind = PyUnicode_KIND(str);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009668 result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
9669 kind, end-start, ch, direction);
9670 if (result == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009671 return -1;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009672 else
9673 return start + result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009674}
9675
Alexander Belopolsky40018472011-02-26 01:02:56 +00009676static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009677tailmatch(PyObject *self,
9678 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009679 Py_ssize_t start,
9680 Py_ssize_t end,
9681 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009682{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009683 int kind_self;
9684 int kind_sub;
9685 void *data_self;
9686 void *data_sub;
9687 Py_ssize_t offset;
9688 Py_ssize_t i;
9689 Py_ssize_t end_sub;
9690
9691 if (PyUnicode_READY(self) == -1 ||
9692 PyUnicode_READY(substring) == -1)
Victor Stinner18aa4472013-01-03 03:18:09 +01009693 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009694
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009695 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
9696 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009697 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00009698 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009699
Serhiy Storchakad4ea03c2015-05-31 09:15:51 +03009700 if (PyUnicode_GET_LENGTH(substring) == 0)
9701 return 1;
9702
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009703 kind_self = PyUnicode_KIND(self);
9704 data_self = PyUnicode_DATA(self);
9705 kind_sub = PyUnicode_KIND(substring);
9706 data_sub = PyUnicode_DATA(substring);
9707 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
9708
9709 if (direction > 0)
9710 offset = end;
9711 else
9712 offset = start;
9713
9714 if (PyUnicode_READ(kind_self, data_self, offset) ==
9715 PyUnicode_READ(kind_sub, data_sub, 0) &&
9716 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
9717 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
9718 /* If both are of the same kind, memcmp is sufficient */
9719 if (kind_self == kind_sub) {
9720 return ! memcmp((char *)data_self +
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009721 (offset * PyUnicode_KIND(substring)),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009722 data_sub,
9723 PyUnicode_GET_LENGTH(substring) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009724 PyUnicode_KIND(substring));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009725 }
Martin Pantere26da7c2016-06-02 10:07:09 +00009726 /* otherwise we have to compare each character by first accessing it */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009727 else {
9728 /* We do not need to compare 0 and len(substring)-1 because
9729 the if statement above ensured already that they are equal
9730 when we end up here. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009731 for (i = 1; i < end_sub; ++i) {
9732 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
9733 PyUnicode_READ(kind_sub, data_sub, i))
9734 return 0;
9735 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009736 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009737 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009738 }
9739
9740 return 0;
9741}
9742
Alexander Belopolsky40018472011-02-26 01:02:56 +00009743Py_ssize_t
9744PyUnicode_Tailmatch(PyObject *str,
9745 PyObject *substr,
9746 Py_ssize_t start,
9747 Py_ssize_t end,
9748 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009749{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009750 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009751 return -1;
Tim Petersced69f82003-09-16 20:30:58 +00009752
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009753 return tailmatch(str, substr, start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009754}
9755
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009756static PyObject *
9757ascii_upper_or_lower(PyObject *self, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009758{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009759 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9760 char *resdata, *data = PyUnicode_DATA(self);
9761 PyObject *res;
Tim Petersced69f82003-09-16 20:30:58 +00009762
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009763 res = PyUnicode_New(len, 127);
9764 if (res == NULL)
9765 return NULL;
9766 resdata = PyUnicode_DATA(res);
9767 if (lower)
9768 _Py_bytes_lower(resdata, data, len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009769 else
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009770 _Py_bytes_upper(resdata, data, len);
9771 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009772}
9773
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009774static Py_UCS4
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009775handle_capital_sigma(int kind, void *data, Py_ssize_t length, Py_ssize_t i)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009776{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009777 Py_ssize_t j;
9778 int final_sigma;
Victor Stinner0c39b1b2015-03-18 15:02:06 +01009779 Py_UCS4 c = 0; /* initialize to prevent gcc warning */
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009780 /* U+03A3 is in the Final_Sigma context when, it is found like this:
Tim Petersced69f82003-09-16 20:30:58 +00009781
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009782 \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased})
9783
9784 where ! is a negation and \p{xxx} is a character with property xxx.
9785 */
9786 for (j = i - 1; j >= 0; j--) {
9787 c = PyUnicode_READ(kind, data, j);
9788 if (!_PyUnicode_IsCaseIgnorable(c))
9789 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009790 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009791 final_sigma = j >= 0 && _PyUnicode_IsCased(c);
9792 if (final_sigma) {
9793 for (j = i + 1; j < length; j++) {
9794 c = PyUnicode_READ(kind, data, j);
9795 if (!_PyUnicode_IsCaseIgnorable(c))
9796 break;
9797 }
9798 final_sigma = j == length || !_PyUnicode_IsCased(c);
9799 }
9800 return (final_sigma) ? 0x3C2 : 0x3C3;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009801}
9802
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009803static int
9804lower_ucs4(int kind, void *data, Py_ssize_t length, Py_ssize_t i,
9805 Py_UCS4 c, Py_UCS4 *mapped)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009806{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009807 /* Obscure special case. */
9808 if (c == 0x3A3) {
9809 mapped[0] = handle_capital_sigma(kind, data, length, i);
9810 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009811 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009812 return _PyUnicode_ToLowerFull(c, mapped);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009813}
9814
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009815static Py_ssize_t
9816do_capitalize(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009817{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009818 Py_ssize_t i, k = 0;
9819 int n_res, j;
9820 Py_UCS4 c, mapped[3];
Tim Petersced69f82003-09-16 20:30:58 +00009821
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009822 c = PyUnicode_READ(kind, data, 0);
Kingsley Mb015fc82019-04-12 16:35:39 +01009823 n_res = _PyUnicode_ToTitleFull(c, mapped);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009824 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009825 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009826 res[k++] = mapped[j];
Guido van Rossumd57fd912000-03-10 22:53:23 +00009827 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009828 for (i = 1; i < length; i++) {
9829 c = PyUnicode_READ(kind, data, i);
9830 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9831 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009832 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009833 res[k++] = mapped[j];
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009834 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009835 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009836 return k;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009837}
9838
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009839static Py_ssize_t
9840do_swapcase(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) {
9841 Py_ssize_t i, k = 0;
9842
9843 for (i = 0; i < length; i++) {
9844 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9845 int n_res, j;
9846 if (Py_UNICODE_ISUPPER(c)) {
9847 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9848 }
9849 else if (Py_UNICODE_ISLOWER(c)) {
9850 n_res = _PyUnicode_ToUpperFull(c, mapped);
9851 }
9852 else {
9853 n_res = 1;
9854 mapped[0] = c;
9855 }
9856 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009857 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009858 res[k++] = mapped[j];
9859 }
9860 }
9861 return k;
9862}
9863
9864static Py_ssize_t
9865do_upper_or_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res,
9866 Py_UCS4 *maxchar, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009867{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009868 Py_ssize_t i, k = 0;
9869
9870 for (i = 0; i < length; i++) {
9871 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9872 int n_res, j;
9873 if (lower)
9874 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9875 else
9876 n_res = _PyUnicode_ToUpperFull(c, mapped);
9877 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009878 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009879 res[k++] = mapped[j];
9880 }
9881 }
9882 return k;
9883}
9884
9885static Py_ssize_t
9886do_upper(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9887{
9888 return do_upper_or_lower(kind, data, length, res, maxchar, 0);
9889}
9890
9891static Py_ssize_t
9892do_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9893{
9894 return do_upper_or_lower(kind, data, length, res, maxchar, 1);
9895}
9896
Benjamin Petersone51757f2012-01-12 21:10:29 -05009897static Py_ssize_t
Benjamin Petersond5890c82012-01-14 13:23:30 -05009898do_casefold(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9899{
9900 Py_ssize_t i, k = 0;
9901
9902 for (i = 0; i < length; i++) {
9903 Py_UCS4 c = PyUnicode_READ(kind, data, i);
9904 Py_UCS4 mapped[3];
9905 int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
9906 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009907 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersond5890c82012-01-14 13:23:30 -05009908 res[k++] = mapped[j];
9909 }
9910 }
9911 return k;
9912}
9913
9914static Py_ssize_t
Benjamin Petersone51757f2012-01-12 21:10:29 -05009915do_title(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9916{
9917 Py_ssize_t i, k = 0;
9918 int previous_is_cased;
9919
9920 previous_is_cased = 0;
9921 for (i = 0; i < length; i++) {
9922 const Py_UCS4 c = PyUnicode_READ(kind, data, i);
9923 Py_UCS4 mapped[3];
9924 int n_res, j;
9925
9926 if (previous_is_cased)
9927 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9928 else
9929 n_res = _PyUnicode_ToTitleFull(c, mapped);
9930
9931 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009932 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersone51757f2012-01-12 21:10:29 -05009933 res[k++] = mapped[j];
9934 }
9935
9936 previous_is_cased = _PyUnicode_IsCased(c);
9937 }
9938 return k;
9939}
9940
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009941static PyObject *
9942case_operation(PyObject *self,
9943 Py_ssize_t (*perform)(int, void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *))
9944{
9945 PyObject *res = NULL;
9946 Py_ssize_t length, newlength = 0;
9947 int kind, outkind;
9948 void *data, *outdata;
9949 Py_UCS4 maxchar = 0, *tmp, *tmpend;
9950
Benjamin Petersoneea48462012-01-16 14:28:50 -05009951 assert(PyUnicode_IS_READY(self));
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009952
9953 kind = PyUnicode_KIND(self);
9954 data = PyUnicode_DATA(self);
9955 length = PyUnicode_GET_LENGTH(self);
Antoine Pitrou4e334242014-10-15 23:14:53 +02009956 if ((size_t) length > PY_SSIZE_T_MAX / (3 * sizeof(Py_UCS4))) {
Benjamin Petersone1bd38c2014-10-15 11:47:36 -04009957 PyErr_SetString(PyExc_OverflowError, "string is too long");
9958 return NULL;
9959 }
Benjamin Peterson1e211ff2014-10-15 12:17:21 -04009960 tmp = PyMem_MALLOC(sizeof(Py_UCS4) * 3 * length);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009961 if (tmp == NULL)
9962 return PyErr_NoMemory();
9963 newlength = perform(kind, data, length, tmp, &maxchar);
9964 res = PyUnicode_New(newlength, maxchar);
9965 if (res == NULL)
9966 goto leave;
9967 tmpend = tmp + newlength;
9968 outdata = PyUnicode_DATA(res);
9969 outkind = PyUnicode_KIND(res);
9970 switch (outkind) {
9971 case PyUnicode_1BYTE_KIND:
9972 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata);
9973 break;
9974 case PyUnicode_2BYTE_KIND:
9975 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata);
9976 break;
9977 case PyUnicode_4BYTE_KIND:
9978 memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength);
9979 break;
9980 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07009981 Py_UNREACHABLE();
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009982 }
9983 leave:
9984 PyMem_FREE(tmp);
9985 return res;
9986}
9987
Tim Peters8ce9f162004-08-27 01:49:32 +00009988PyObject *
9989PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009990{
Serhiy Storchakaea525a22016-09-06 22:07:53 +03009991 PyObject *res;
9992 PyObject *fseq;
9993 Py_ssize_t seqlen;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009994 PyObject **items;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009995
Benjamin Peterson9743b2c2014-02-15 13:02:52 -05009996 fseq = PySequence_Fast(seq, "can only join an iterable");
Tim Peters05eba1f2004-08-27 21:32:02 +00009997 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009998 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00009999 }
10000
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010001 /* NOTE: the following code can't call back into Python code,
10002 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +000010003 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010004
Serhiy Storchakaea525a22016-09-06 22:07:53 +030010005 items = PySequence_Fast_ITEMS(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +000010006 seqlen = PySequence_Fast_GET_SIZE(fseq);
Serhiy Storchakaea525a22016-09-06 22:07:53 +030010007 res = _PyUnicode_JoinArray(separator, items, seqlen);
10008 Py_DECREF(fseq);
10009 return res;
10010}
10011
10012PyObject *
Serhiy Storchakaa5552f02017-12-15 13:11:11 +020010013_PyUnicode_JoinArray(PyObject *separator, PyObject *const *items, Py_ssize_t seqlen)
Serhiy Storchakaea525a22016-09-06 22:07:53 +030010014{
10015 PyObject *res = NULL; /* the result */
10016 PyObject *sep = NULL;
10017 Py_ssize_t seplen;
10018 PyObject *item;
10019 Py_ssize_t sz, i, res_offset;
10020 Py_UCS4 maxchar;
10021 Py_UCS4 item_maxchar;
10022 int use_memcpy;
10023 unsigned char *res_data = NULL, *sep_data = NULL;
10024 PyObject *last_obj;
10025 unsigned int kind = 0;
10026
Tim Peters05eba1f2004-08-27 21:32:02 +000010027 /* If empty sequence, return u"". */
10028 if (seqlen == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020010029 _Py_RETURN_UNICODE_EMPTY();
Tim Peters05eba1f2004-08-27 21:32:02 +000010030 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020010031
Tim Peters05eba1f2004-08-27 21:32:02 +000010032 /* If singleton sequence with an exact Unicode, return that. */
Victor Stinnerdd077322011-10-07 17:02:31 +020010033 last_obj = NULL;
Victor Stinneracf47b82011-10-06 12:32:37 +020010034 if (seqlen == 1) {
10035 if (PyUnicode_CheckExact(items[0])) {
10036 res = items[0];
10037 Py_INCREF(res);
Victor Stinneracf47b82011-10-06 12:32:37 +020010038 return res;
10039 }
Victor Stinnerdd077322011-10-07 17:02:31 +020010040 seplen = 0;
Victor Stinnerc6f0df72011-10-06 15:58:54 +020010041 maxchar = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +000010042 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010043 else {
Victor Stinneracf47b82011-10-06 12:32:37 +020010044 /* Set up sep and seplen */
10045 if (separator == NULL) {
10046 /* fall back to a blank space separator */
10047 sep = PyUnicode_FromOrdinal(' ');
10048 if (!sep)
10049 goto onError;
Victor Stinnerdd077322011-10-07 17:02:31 +020010050 seplen = 1;
Victor Stinneracf47b82011-10-06 12:32:37 +020010051 maxchar = 32;
Tim Peters05eba1f2004-08-27 21:32:02 +000010052 }
Victor Stinneracf47b82011-10-06 12:32:37 +020010053 else {
10054 if (!PyUnicode_Check(separator)) {
10055 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020010056 "separator: expected str instance,"
10057 " %.80s found",
10058 Py_TYPE(separator)->tp_name);
Victor Stinneracf47b82011-10-06 12:32:37 +020010059 goto onError;
10060 }
10061 if (PyUnicode_READY(separator))
10062 goto onError;
10063 sep = separator;
10064 seplen = PyUnicode_GET_LENGTH(separator);
10065 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
10066 /* inc refcount to keep this code path symmetric with the
10067 above case of a blank separator */
10068 Py_INCREF(sep);
10069 }
Victor Stinnerdd077322011-10-07 17:02:31 +020010070 last_obj = sep;
Tim Peters05eba1f2004-08-27 21:32:02 +000010071 }
10072
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010073 /* There are at least two things to join, or else we have a subclass
10074 * of str in the sequence.
10075 * Do a pre-pass to figure out the total amount of space we'll
10076 * need (sz), and see whether all argument are strings.
10077 */
10078 sz = 0;
Victor Stinnerdd077322011-10-07 17:02:31 +020010079#ifdef Py_DEBUG
10080 use_memcpy = 0;
10081#else
10082 use_memcpy = 1;
10083#endif
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010084 for (i = 0; i < seqlen; i++) {
Xiang Zhangb0541f42017-01-10 10:52:00 +080010085 size_t add_sz;
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010086 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +000010087 if (!PyUnicode_Check(item)) {
10088 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020010089 "sequence item %zd: expected str instance,"
10090 " %.80s found",
10091 i, Py_TYPE(item)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000010092 goto onError;
10093 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010094 if (PyUnicode_READY(item) == -1)
10095 goto onError;
Xiang Zhangb0541f42017-01-10 10:52:00 +080010096 add_sz = PyUnicode_GET_LENGTH(item);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010097 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010098 maxchar = Py_MAX(maxchar, item_maxchar);
Xiang Zhangb0541f42017-01-10 10:52:00 +080010099 if (i != 0) {
10100 add_sz += seplen;
10101 }
10102 if (add_sz > (size_t)(PY_SSIZE_T_MAX - sz)) {
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010103 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010104 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010105 goto onError;
10106 }
Xiang Zhangb0541f42017-01-10 10:52:00 +080010107 sz += add_sz;
Victor Stinnerdd077322011-10-07 17:02:31 +020010108 if (use_memcpy && last_obj != NULL) {
10109 if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
10110 use_memcpy = 0;
10111 }
10112 last_obj = item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010113 }
Tim Petersced69f82003-09-16 20:30:58 +000010114
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010115 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010116 if (res == NULL)
10117 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +000010118
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010119 /* Catenate everything. */
Victor Stinnerdd077322011-10-07 17:02:31 +020010120#ifdef Py_DEBUG
10121 use_memcpy = 0;
10122#else
10123 if (use_memcpy) {
10124 res_data = PyUnicode_1BYTE_DATA(res);
10125 kind = PyUnicode_KIND(res);
10126 if (seplen != 0)
10127 sep_data = PyUnicode_1BYTE_DATA(sep);
10128 }
10129#endif
Victor Stinner4560f9c2013-04-14 18:56:46 +020010130 if (use_memcpy) {
10131 for (i = 0; i < seqlen; ++i) {
10132 Py_ssize_t itemlen;
10133 item = items[i];
10134
10135 /* Copy item, and maybe the separator. */
10136 if (i && seplen != 0) {
Christian Heimesf051e432016-09-13 20:22:02 +020010137 memcpy(res_data,
Victor Stinnerdd077322011-10-07 17:02:31 +020010138 sep_data,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010139 kind * seplen);
10140 res_data += kind * seplen;
Victor Stinnerdd077322011-10-07 17:02:31 +020010141 }
Victor Stinner4560f9c2013-04-14 18:56:46 +020010142
10143 itemlen = PyUnicode_GET_LENGTH(item);
10144 if (itemlen != 0) {
Christian Heimesf051e432016-09-13 20:22:02 +020010145 memcpy(res_data,
Victor Stinnerdd077322011-10-07 17:02:31 +020010146 PyUnicode_DATA(item),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010147 kind * itemlen);
10148 res_data += kind * itemlen;
Victor Stinnerdd077322011-10-07 17:02:31 +020010149 }
Victor Stinner4560f9c2013-04-14 18:56:46 +020010150 }
10151 assert(res_data == PyUnicode_1BYTE_DATA(res)
10152 + kind * PyUnicode_GET_LENGTH(res));
10153 }
10154 else {
10155 for (i = 0, res_offset = 0; i < seqlen; ++i) {
10156 Py_ssize_t itemlen;
10157 item = items[i];
10158
10159 /* Copy item, and maybe the separator. */
10160 if (i && seplen != 0) {
10161 _PyUnicode_FastCopyCharacters(res, res_offset, sep, 0, seplen);
10162 res_offset += seplen;
10163 }
10164
10165 itemlen = PyUnicode_GET_LENGTH(item);
10166 if (itemlen != 0) {
Victor Stinnerd3f08822012-05-29 12:57:52 +020010167 _PyUnicode_FastCopyCharacters(res, res_offset, item, 0, itemlen);
Victor Stinnerdd077322011-10-07 17:02:31 +020010168 res_offset += itemlen;
10169 }
Victor Stinner9ce5a832011-10-03 23:36:02 +020010170 }
Victor Stinnerdd077322011-10-07 17:02:31 +020010171 assert(res_offset == PyUnicode_GET_LENGTH(res));
Victor Stinner4560f9c2013-04-14 18:56:46 +020010172 }
Tim Peters8ce9f162004-08-27 01:49:32 +000010173
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010174 Py_XDECREF(sep);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010175 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010176 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010177
Benjamin Peterson29060642009-01-31 22:14:21 +000010178 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010179 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +000010180 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010181 return NULL;
10182}
10183
Victor Stinnerd3f08822012-05-29 12:57:52 +020010184void
10185_PyUnicode_FastFill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10186 Py_UCS4 fill_char)
10187{
10188 const enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
Victor Stinner163403a2018-11-27 12:41:17 +010010189 void *data = PyUnicode_DATA(unicode);
Victor Stinnerd3f08822012-05-29 12:57:52 +020010190 assert(PyUnicode_IS_READY(unicode));
10191 assert(unicode_modifiable(unicode));
10192 assert(fill_char <= PyUnicode_MAX_CHAR_VALUE(unicode));
10193 assert(start >= 0);
10194 assert(start + length <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner59423e32018-11-26 13:40:01 +010010195 unicode_fill(kind, data, fill_char, start, length);
Victor Stinnerd3f08822012-05-29 12:57:52 +020010196}
10197
Victor Stinner3fe55312012-01-04 00:33:50 +010010198Py_ssize_t
10199PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10200 Py_UCS4 fill_char)
10201{
10202 Py_ssize_t maxlen;
Victor Stinner3fe55312012-01-04 00:33:50 +010010203
10204 if (!PyUnicode_Check(unicode)) {
10205 PyErr_BadInternalCall();
10206 return -1;
10207 }
10208 if (PyUnicode_READY(unicode) == -1)
10209 return -1;
10210 if (unicode_check_modifiable(unicode))
10211 return -1;
10212
Victor Stinnerd3f08822012-05-29 12:57:52 +020010213 if (start < 0) {
10214 PyErr_SetString(PyExc_IndexError, "string index out of range");
10215 return -1;
10216 }
Victor Stinner3fe55312012-01-04 00:33:50 +010010217 if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) {
10218 PyErr_SetString(PyExc_ValueError,
10219 "fill character is bigger than "
10220 "the string maximum character");
10221 return -1;
10222 }
10223
10224 maxlen = PyUnicode_GET_LENGTH(unicode) - start;
10225 length = Py_MIN(maxlen, length);
10226 if (length <= 0)
10227 return 0;
10228
Victor Stinnerd3f08822012-05-29 12:57:52 +020010229 _PyUnicode_FastFill(unicode, start, length, fill_char);
Victor Stinner3fe55312012-01-04 00:33:50 +010010230 return length;
10231}
10232
Victor Stinner9310abb2011-10-05 00:59:23 +020010233static PyObject *
10234pad(PyObject *self,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010235 Py_ssize_t left,
10236 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010237 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010238{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010239 PyObject *u;
10240 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010241 int kind;
10242 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010243
10244 if (left < 0)
10245 left = 0;
10246 if (right < 0)
10247 right = 0;
10248
Victor Stinnerc4b49542011-12-11 22:44:26 +010010249 if (left == 0 && right == 0)
10250 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010251
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010252 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
10253 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +000010254 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
10255 return NULL;
10256 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010257 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010258 maxchar = Py_MAX(maxchar, fill);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010259 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010260 if (!u)
10261 return NULL;
10262
10263 kind = PyUnicode_KIND(u);
10264 data = PyUnicode_DATA(u);
10265 if (left)
Victor Stinner59423e32018-11-26 13:40:01 +010010266 unicode_fill(kind, data, fill, 0, left);
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010267 if (right)
Victor Stinner59423e32018-11-26 13:40:01 +010010268 unicode_fill(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinnerd3f08822012-05-29 12:57:52 +020010269 _PyUnicode_FastCopyCharacters(u, left, self, 0, _PyUnicode_LENGTH(self));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010270 assert(_PyUnicode_CheckConsistency(u, 1));
10271 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010272}
10273
Alexander Belopolsky40018472011-02-26 01:02:56 +000010274PyObject *
10275PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010276{
Guido van Rossumd57fd912000-03-10 22:53:23 +000010277 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010278
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010279 if (ensure_unicode(string) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010280 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010281
Benjamin Petersonead6b532011-12-20 17:23:42 -060010282 switch (PyUnicode_KIND(string)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010283 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010284 if (PyUnicode_IS_ASCII(string))
10285 list = asciilib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010286 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010287 PyUnicode_GET_LENGTH(string), keepends);
10288 else
10289 list = ucs1lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010290 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010291 PyUnicode_GET_LENGTH(string), keepends);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010292 break;
10293 case PyUnicode_2BYTE_KIND:
10294 list = ucs2lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010295 string, PyUnicode_2BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010296 PyUnicode_GET_LENGTH(string), keepends);
10297 break;
10298 case PyUnicode_4BYTE_KIND:
10299 list = ucs4lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010300 string, PyUnicode_4BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010301 PyUnicode_GET_LENGTH(string), keepends);
10302 break;
10303 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010304 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010305 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010306 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010307}
10308
Alexander Belopolsky40018472011-02-26 01:02:56 +000010309static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010310split(PyObject *self,
10311 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010312 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010313{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010314 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010315 void *buf1, *buf2;
10316 Py_ssize_t len1, len2;
10317 PyObject* out;
10318
Guido van Rossumd57fd912000-03-10 22:53:23 +000010319 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010320 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010321
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010322 if (PyUnicode_READY(self) == -1)
10323 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010324
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010325 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010326 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010327 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010328 if (PyUnicode_IS_ASCII(self))
10329 return asciilib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010330 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010331 PyUnicode_GET_LENGTH(self), maxcount
10332 );
10333 else
10334 return ucs1lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010335 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010336 PyUnicode_GET_LENGTH(self), maxcount
10337 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010338 case PyUnicode_2BYTE_KIND:
10339 return ucs2lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010340 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010341 PyUnicode_GET_LENGTH(self), maxcount
10342 );
10343 case PyUnicode_4BYTE_KIND:
10344 return ucs4lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010345 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010346 PyUnicode_GET_LENGTH(self), maxcount
10347 );
10348 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010349 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010350 }
10351
10352 if (PyUnicode_READY(substring) == -1)
10353 return NULL;
10354
10355 kind1 = PyUnicode_KIND(self);
10356 kind2 = PyUnicode_KIND(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010357 len1 = PyUnicode_GET_LENGTH(self);
10358 len2 = PyUnicode_GET_LENGTH(substring);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010359 if (kind1 < kind2 || len1 < len2) {
10360 out = PyList_New(1);
10361 if (out == NULL)
10362 return NULL;
10363 Py_INCREF(self);
10364 PyList_SET_ITEM(out, 0, self);
10365 return out;
10366 }
10367 buf1 = PyUnicode_DATA(self);
10368 buf2 = PyUnicode_DATA(substring);
10369 if (kind2 != kind1) {
10370 buf2 = _PyUnicode_AsKind(substring, kind1);
10371 if (!buf2)
10372 return NULL;
10373 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010374
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010375 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010376 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010377 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10378 out = asciilib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010379 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010380 else
10381 out = ucs1lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010382 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010383 break;
10384 case PyUnicode_2BYTE_KIND:
10385 out = ucs2lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010386 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010387 break;
10388 case PyUnicode_4BYTE_KIND:
10389 out = ucs4lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010390 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010391 break;
10392 default:
10393 out = NULL;
10394 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010395 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010396 PyMem_Free(buf2);
10397 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010398}
10399
Alexander Belopolsky40018472011-02-26 01:02:56 +000010400static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010401rsplit(PyObject *self,
10402 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010403 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010404{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010405 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010406 void *buf1, *buf2;
10407 Py_ssize_t len1, len2;
10408 PyObject* out;
10409
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010410 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010411 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010412
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010413 if (PyUnicode_READY(self) == -1)
10414 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010415
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010416 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010417 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010418 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010419 if (PyUnicode_IS_ASCII(self))
10420 return asciilib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010421 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010422 PyUnicode_GET_LENGTH(self), maxcount
10423 );
10424 else
10425 return ucs1lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010426 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010427 PyUnicode_GET_LENGTH(self), maxcount
10428 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010429 case PyUnicode_2BYTE_KIND:
10430 return ucs2lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010431 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010432 PyUnicode_GET_LENGTH(self), maxcount
10433 );
10434 case PyUnicode_4BYTE_KIND:
10435 return ucs4lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010436 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010437 PyUnicode_GET_LENGTH(self), maxcount
10438 );
10439 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010440 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010441 }
10442
10443 if (PyUnicode_READY(substring) == -1)
10444 return NULL;
10445
10446 kind1 = PyUnicode_KIND(self);
10447 kind2 = PyUnicode_KIND(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010448 len1 = PyUnicode_GET_LENGTH(self);
10449 len2 = PyUnicode_GET_LENGTH(substring);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010450 if (kind1 < kind2 || len1 < len2) {
10451 out = PyList_New(1);
10452 if (out == NULL)
10453 return NULL;
10454 Py_INCREF(self);
10455 PyList_SET_ITEM(out, 0, self);
10456 return out;
10457 }
10458 buf1 = PyUnicode_DATA(self);
10459 buf2 = PyUnicode_DATA(substring);
10460 if (kind2 != kind1) {
10461 buf2 = _PyUnicode_AsKind(substring, kind1);
10462 if (!buf2)
10463 return NULL;
10464 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010465
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010466 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010467 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010468 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10469 out = asciilib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010470 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010471 else
10472 out = ucs1lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010473 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010474 break;
10475 case PyUnicode_2BYTE_KIND:
10476 out = ucs2lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010477 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010478 break;
10479 case PyUnicode_4BYTE_KIND:
10480 out = ucs4lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010481 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010482 break;
10483 default:
10484 out = NULL;
10485 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010486 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010487 PyMem_Free(buf2);
10488 return out;
10489}
10490
10491static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010492anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1,
10493 PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010494{
Benjamin Petersonead6b532011-12-20 17:23:42 -060010495 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010496 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010497 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
10498 return asciilib_find(buf1, len1, buf2, len2, offset);
10499 else
10500 return ucs1lib_find(buf1, len1, buf2, len2, offset);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010501 case PyUnicode_2BYTE_KIND:
10502 return ucs2lib_find(buf1, len1, buf2, len2, offset);
10503 case PyUnicode_4BYTE_KIND:
10504 return ucs4lib_find(buf1, len1, buf2, len2, offset);
10505 }
Barry Warsawb2e57942017-09-14 18:13:16 -070010506 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010507}
10508
10509static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010510anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen,
10511 PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010512{
Benjamin Petersonc0b95d12011-12-20 17:24:05 -060010513 switch (kind) {
10514 case PyUnicode_1BYTE_KIND:
10515 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
10516 return asciilib_count(sbuf, slen, buf1, len1, maxcount);
10517 else
10518 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
10519 case PyUnicode_2BYTE_KIND:
10520 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
10521 case PyUnicode_4BYTE_KIND:
10522 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
10523 }
Barry Warsawb2e57942017-09-14 18:13:16 -070010524 Py_UNREACHABLE();
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010525}
10526
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010527static void
10528replace_1char_inplace(PyObject *u, Py_ssize_t pos,
10529 Py_UCS4 u1, Py_UCS4 u2, Py_ssize_t maxcount)
10530{
10531 int kind = PyUnicode_KIND(u);
10532 void *data = PyUnicode_DATA(u);
10533 Py_ssize_t len = PyUnicode_GET_LENGTH(u);
10534 if (kind == PyUnicode_1BYTE_KIND) {
10535 ucs1lib_replace_1char_inplace((Py_UCS1 *)data + pos,
10536 (Py_UCS1 *)data + len,
10537 u1, u2, maxcount);
10538 }
10539 else if (kind == PyUnicode_2BYTE_KIND) {
10540 ucs2lib_replace_1char_inplace((Py_UCS2 *)data + pos,
10541 (Py_UCS2 *)data + len,
10542 u1, u2, maxcount);
10543 }
10544 else {
10545 assert(kind == PyUnicode_4BYTE_KIND);
10546 ucs4lib_replace_1char_inplace((Py_UCS4 *)data + pos,
10547 (Py_UCS4 *)data + len,
10548 u1, u2, maxcount);
10549 }
10550}
10551
Alexander Belopolsky40018472011-02-26 01:02:56 +000010552static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010553replace(PyObject *self, PyObject *str1,
10554 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010555{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010556 PyObject *u;
10557 char *sbuf = PyUnicode_DATA(self);
10558 char *buf1 = PyUnicode_DATA(str1);
10559 char *buf2 = PyUnicode_DATA(str2);
10560 int srelease = 0, release1 = 0, release2 = 0;
10561 int skind = PyUnicode_KIND(self);
10562 int kind1 = PyUnicode_KIND(str1);
10563 int kind2 = PyUnicode_KIND(str2);
10564 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
10565 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
10566 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010567 int mayshrink;
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010568 Py_UCS4 maxchar, maxchar_str1, maxchar_str2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010569
Serhiy Storchaka865c3b22019-10-30 12:03:53 +020010570 if (slen < len1)
10571 goto nothing;
10572
Guido van Rossumd57fd912000-03-10 22:53:23 +000010573 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010574 maxcount = PY_SSIZE_T_MAX;
Serhiy Storchaka865c3b22019-10-30 12:03:53 +020010575 else if (maxcount == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010576 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010577
Victor Stinner59de0ee2011-10-07 10:01:28 +020010578 if (str1 == str2)
10579 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010580
Victor Stinner49a0a212011-10-12 23:46:10 +020010581 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010582 maxchar_str1 = PyUnicode_MAX_CHAR_VALUE(str1);
10583 if (maxchar < maxchar_str1)
10584 /* substring too wide to be present */
10585 goto nothing;
Victor Stinner49a0a212011-10-12 23:46:10 +020010586 maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
10587 /* Replacing str1 with str2 may cause a maxchar reduction in the
10588 result string. */
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010589 mayshrink = (maxchar_str2 < maxchar_str1) && (maxchar == maxchar_str1);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010590 maxchar = Py_MAX(maxchar, maxchar_str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010591
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010592 if (len1 == len2) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010593 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010594 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010595 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010596 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010597 /* replace characters */
Victor Stinner49a0a212011-10-12 23:46:10 +020010598 Py_UCS4 u1, u2;
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010599 Py_ssize_t pos;
Victor Stinnerf6441102011-12-18 02:43:08 +010010600
Victor Stinner69ed0f42013-04-09 21:48:24 +020010601 u1 = PyUnicode_READ(kind1, buf1, 0);
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010602 pos = findchar(sbuf, skind, slen, u1, 1);
Victor Stinnerf6441102011-12-18 02:43:08 +010010603 if (pos < 0)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010604 goto nothing;
Victor Stinner69ed0f42013-04-09 21:48:24 +020010605 u2 = PyUnicode_READ(kind2, buf2, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010606 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010607 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010608 goto error;
Victor Stinnerf6441102011-12-18 02:43:08 +010010609
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010610 _PyUnicode_FastCopyCharacters(u, 0, self, 0, slen);
10611 replace_1char_inplace(u, pos, u1, u2, maxcount);
Victor Stinner49a0a212011-10-12 23:46:10 +020010612 }
10613 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010614 int rkind = skind;
10615 char *res;
Victor Stinnerf6441102011-12-18 02:43:08 +010010616 Py_ssize_t i;
Victor Stinner25a4b292011-10-06 12:31:55 +020010617
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010618 if (kind1 < rkind) {
10619 /* widen substring */
10620 buf1 = _PyUnicode_AsKind(str1, rkind);
10621 if (!buf1) goto error;
10622 release1 = 1;
10623 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010624 i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010625 if (i < 0)
10626 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010627 if (rkind > kind2) {
10628 /* widen replacement */
10629 buf2 = _PyUnicode_AsKind(str2, rkind);
10630 if (!buf2) goto error;
10631 release2 = 1;
10632 }
10633 else if (rkind < kind2) {
10634 /* widen self and buf1 */
10635 rkind = kind2;
10636 if (release1) PyMem_Free(buf1);
Antoine Pitrou6d5ad222012-11-17 23:28:17 +010010637 release1 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010638 sbuf = _PyUnicode_AsKind(self, rkind);
10639 if (!sbuf) goto error;
10640 srelease = 1;
10641 buf1 = _PyUnicode_AsKind(str1, rkind);
10642 if (!buf1) goto error;
10643 release1 = 1;
10644 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010645 u = PyUnicode_New(slen, maxchar);
10646 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010647 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010648 assert(PyUnicode_KIND(u) == rkind);
10649 res = PyUnicode_DATA(u);
Victor Stinner25a4b292011-10-06 12:31:55 +020010650
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010651 memcpy(res, sbuf, rkind * slen);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010652 /* change everything in-place, starting with this one */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010653 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010654 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010655 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010656 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010657
10658 while ( --maxcount > 0) {
Victor Stinnerc3cec782011-10-05 21:24:08 +020010659 i = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010660 sbuf+rkind*i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010661 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010662 if (i == -1)
10663 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010664 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010665 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010666 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010667 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010668 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010669 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010670 }
10671 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010672 Py_ssize_t n, i, j, ires;
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010673 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010674 int rkind = skind;
10675 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010676
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010677 if (kind1 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010678 /* widen substring */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010679 buf1 = _PyUnicode_AsKind(str1, rkind);
10680 if (!buf1) goto error;
10681 release1 = 1;
10682 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010683 n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010684 if (n == 0)
10685 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010686 if (kind2 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010687 /* widen replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010688 buf2 = _PyUnicode_AsKind(str2, rkind);
10689 if (!buf2) goto error;
10690 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010691 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010692 else if (kind2 > rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010693 /* widen self and buf1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010694 rkind = kind2;
10695 sbuf = _PyUnicode_AsKind(self, rkind);
10696 if (!sbuf) goto error;
10697 srelease = 1;
10698 if (release1) PyMem_Free(buf1);
Antoine Pitrou6d5ad222012-11-17 23:28:17 +010010699 release1 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010700 buf1 = _PyUnicode_AsKind(str1, rkind);
10701 if (!buf1) goto error;
10702 release1 = 1;
10703 }
10704 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10705 PyUnicode_GET_LENGTH(str1))); */
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010706 if (len1 < len2 && len2 - len1 > (PY_SSIZE_T_MAX - slen) / n) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010707 PyErr_SetString(PyExc_OverflowError,
10708 "replace string is too long");
10709 goto error;
10710 }
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010711 new_size = slen + n * (len2 - len1);
Victor Stinner49a0a212011-10-12 23:46:10 +020010712 if (new_size == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020010713 _Py_INCREF_UNICODE_EMPTY();
10714 if (!unicode_empty)
10715 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010716 u = unicode_empty;
10717 goto done;
10718 }
Xiang Zhangb0541f42017-01-10 10:52:00 +080010719 if (new_size > (PY_SSIZE_T_MAX / rkind)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010720 PyErr_SetString(PyExc_OverflowError,
10721 "replace string is too long");
10722 goto error;
10723 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010724 u = PyUnicode_New(new_size, maxchar);
10725 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010726 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010727 assert(PyUnicode_KIND(u) == rkind);
10728 res = PyUnicode_DATA(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010729 ires = i = 0;
10730 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010731 while (n-- > 0) {
10732 /* look for next match */
Victor Stinnerc3cec782011-10-05 21:24:08 +020010733 j = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010734 sbuf + rkind * i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010735 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010736 if (j == -1)
10737 break;
10738 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010739 /* copy unchanged part [i:j] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010740 memcpy(res + rkind * ires,
10741 sbuf + rkind * i,
10742 rkind * (j-i));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010743 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010744 }
10745 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010746 if (len2 > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010747 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010748 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010749 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010750 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010751 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010752 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010753 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010754 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010755 /* copy tail [i:] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010756 memcpy(res + rkind * ires,
10757 sbuf + rkind * i,
10758 rkind * (slen-i));
Victor Stinner49a0a212011-10-12 23:46:10 +020010759 }
10760 else {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010761 /* interleave */
10762 while (n > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010763 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010764 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010765 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010766 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010767 if (--n <= 0)
10768 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010769 memcpy(res + rkind * ires,
10770 sbuf + rkind * i,
10771 rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010772 ires++;
10773 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010774 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010775 memcpy(res + rkind * ires,
10776 sbuf + rkind * i,
10777 rkind * (slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +000010778 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010779 }
10780
10781 if (mayshrink) {
Victor Stinner25a4b292011-10-06 12:31:55 +020010782 unicode_adjust_maxchar(&u);
10783 if (u == NULL)
10784 goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010785 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010786
10787 done:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010788 if (srelease)
10789 PyMem_FREE(sbuf);
10790 if (release1)
10791 PyMem_FREE(buf1);
10792 if (release2)
10793 PyMem_FREE(buf2);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010794 assert(_PyUnicode_CheckConsistency(u, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010795 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010796
Benjamin Peterson29060642009-01-31 22:14:21 +000010797 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +000010798 /* nothing to replace; return original string (when possible) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010799 if (srelease)
10800 PyMem_FREE(sbuf);
10801 if (release1)
10802 PyMem_FREE(buf1);
10803 if (release2)
10804 PyMem_FREE(buf2);
Victor Stinnerc4b49542011-12-11 22:44:26 +010010805 return unicode_result_unchanged(self);
10806
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010807 error:
10808 if (srelease && sbuf)
10809 PyMem_FREE(sbuf);
10810 if (release1 && buf1)
10811 PyMem_FREE(buf1);
10812 if (release2 && buf2)
10813 PyMem_FREE(buf2);
10814 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010815}
10816
10817/* --- Unicode Object Methods --------------------------------------------- */
10818
INADA Naoki3ae20562017-01-16 20:41:20 +090010819/*[clinic input]
10820str.title as unicode_title
Guido van Rossumd57fd912000-03-10 22:53:23 +000010821
INADA Naoki3ae20562017-01-16 20:41:20 +090010822Return a version of the string where each word is titlecased.
10823
10824More specifically, words start with uppercased characters and all remaining
10825cased characters have lower case.
10826[clinic start generated code]*/
10827
10828static PyObject *
10829unicode_title_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090010830/*[clinic end generated code: output=c75ae03809574902 input=fa945d669b26e683]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000010831{
Benjamin Petersoneea48462012-01-16 14:28:50 -050010832 if (PyUnicode_READY(self) == -1)
10833 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010834 return case_operation(self, do_title);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010835}
10836
INADA Naoki3ae20562017-01-16 20:41:20 +090010837/*[clinic input]
10838str.capitalize as unicode_capitalize
Guido van Rossumd57fd912000-03-10 22:53:23 +000010839
INADA Naoki3ae20562017-01-16 20:41:20 +090010840Return a capitalized version of the string.
10841
10842More specifically, make the first character have upper case and the rest lower
10843case.
10844[clinic start generated code]*/
10845
10846static PyObject *
10847unicode_capitalize_impl(PyObject *self)
10848/*[clinic end generated code: output=e49a4c333cdb7667 input=f4cbf1016938da6d]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000010849{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010850 if (PyUnicode_READY(self) == -1)
10851 return NULL;
10852 if (PyUnicode_GET_LENGTH(self) == 0)
10853 return unicode_result_unchanged(self);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010854 return case_operation(self, do_capitalize);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010855}
10856
INADA Naoki3ae20562017-01-16 20:41:20 +090010857/*[clinic input]
10858str.casefold as unicode_casefold
10859
10860Return a version of the string suitable for caseless comparisons.
10861[clinic start generated code]*/
Benjamin Petersond5890c82012-01-14 13:23:30 -050010862
10863static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090010864unicode_casefold_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090010865/*[clinic end generated code: output=0120daf657ca40af input=384d66cc2ae30daf]*/
Benjamin Petersond5890c82012-01-14 13:23:30 -050010866{
10867 if (PyUnicode_READY(self) == -1)
10868 return NULL;
10869 if (PyUnicode_IS_ASCII(self))
10870 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010871 return case_operation(self, do_casefold);
Benjamin Petersond5890c82012-01-14 13:23:30 -050010872}
10873
10874
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010875/* Argument converter. Accepts a single Unicode character. */
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010876
10877static int
10878convert_uc(PyObject *obj, void *addr)
10879{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010880 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010881
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010882 if (!PyUnicode_Check(obj)) {
10883 PyErr_Format(PyExc_TypeError,
10884 "The fill character must be a unicode character, "
Victor Stinner998b8062018-09-12 00:23:25 +020010885 "not %.100s", Py_TYPE(obj)->tp_name);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010886 return 0;
10887 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010888 if (PyUnicode_READY(obj) < 0)
10889 return 0;
10890 if (PyUnicode_GET_LENGTH(obj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010891 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010892 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010893 return 0;
10894 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010895 *fillcharloc = PyUnicode_READ_CHAR(obj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010896 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010897}
10898
INADA Naoki3ae20562017-01-16 20:41:20 +090010899/*[clinic input]
10900str.center as unicode_center
10901
10902 width: Py_ssize_t
10903 fillchar: Py_UCS4 = ' '
10904 /
10905
10906Return a centered string of length width.
10907
10908Padding is done using the specified fill character (default is a space).
10909[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000010910
10911static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090010912unicode_center_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
10913/*[clinic end generated code: output=420c8859effc7c0c input=b42b247eb26e6519]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000010914{
Martin v. Löwis18e16552006-02-15 17:27:45 +000010915 Py_ssize_t marg, left;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010916
Benjamin Petersonbac79492012-01-14 13:34:47 -050010917 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010918 return NULL;
10919
Victor Stinnerc4b49542011-12-11 22:44:26 +010010920 if (PyUnicode_GET_LENGTH(self) >= width)
10921 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010922
Victor Stinnerc4b49542011-12-11 22:44:26 +010010923 marg = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010924 left = marg / 2 + (marg & width & 1);
10925
Victor Stinner9310abb2011-10-05 00:59:23 +020010926 return pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010927}
10928
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010929/* This function assumes that str1 and str2 are readied by the caller. */
10930
Marc-André Lemburge5034372000-08-08 08:04:29 +000010931static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010932unicode_compare(PyObject *str1, PyObject *str2)
Marc-André Lemburge5034372000-08-08 08:04:29 +000010933{
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010934#define COMPARE(TYPE1, TYPE2) \
10935 do { \
10936 TYPE1* p1 = (TYPE1 *)data1; \
10937 TYPE2* p2 = (TYPE2 *)data2; \
10938 TYPE1* end = p1 + len; \
10939 Py_UCS4 c1, c2; \
10940 for (; p1 != end; p1++, p2++) { \
10941 c1 = *p1; \
10942 c2 = *p2; \
10943 if (c1 != c2) \
10944 return (c1 < c2) ? -1 : 1; \
10945 } \
10946 } \
10947 while (0)
10948
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010949 int kind1, kind2;
10950 void *data1, *data2;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010951 Py_ssize_t len1, len2, len;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010952
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010953 kind1 = PyUnicode_KIND(str1);
10954 kind2 = PyUnicode_KIND(str2);
10955 data1 = PyUnicode_DATA(str1);
10956 data2 = PyUnicode_DATA(str2);
10957 len1 = PyUnicode_GET_LENGTH(str1);
10958 len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner770e19e2012-10-04 22:59:45 +020010959 len = Py_MIN(len1, len2);
Marc-André Lemburge5034372000-08-08 08:04:29 +000010960
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010961 switch(kind1) {
10962 case PyUnicode_1BYTE_KIND:
10963 {
10964 switch(kind2) {
10965 case PyUnicode_1BYTE_KIND:
10966 {
10967 int cmp = memcmp(data1, data2, len);
10968 /* normalize result of memcmp() into the range [-1; 1] */
10969 if (cmp < 0)
10970 return -1;
10971 if (cmp > 0)
10972 return 1;
10973 break;
Victor Stinner770e19e2012-10-04 22:59:45 +020010974 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010975 case PyUnicode_2BYTE_KIND:
10976 COMPARE(Py_UCS1, Py_UCS2);
10977 break;
10978 case PyUnicode_4BYTE_KIND:
10979 COMPARE(Py_UCS1, Py_UCS4);
10980 break;
10981 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010982 Py_UNREACHABLE();
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010983 }
10984 break;
10985 }
10986 case PyUnicode_2BYTE_KIND:
10987 {
10988 switch(kind2) {
10989 case PyUnicode_1BYTE_KIND:
10990 COMPARE(Py_UCS2, Py_UCS1);
10991 break;
10992 case PyUnicode_2BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020010993 {
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010994 COMPARE(Py_UCS2, Py_UCS2);
10995 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020010996 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010997 case PyUnicode_4BYTE_KIND:
10998 COMPARE(Py_UCS2, Py_UCS4);
10999 break;
11000 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011001 Py_UNREACHABLE();
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011002 }
11003 break;
11004 }
11005 case PyUnicode_4BYTE_KIND:
11006 {
11007 switch(kind2) {
11008 case PyUnicode_1BYTE_KIND:
11009 COMPARE(Py_UCS4, Py_UCS1);
11010 break;
11011 case PyUnicode_2BYTE_KIND:
11012 COMPARE(Py_UCS4, Py_UCS2);
11013 break;
11014 case PyUnicode_4BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020011015 {
11016#if defined(HAVE_WMEMCMP) && SIZEOF_WCHAR_T == 4
11017 int cmp = wmemcmp((wchar_t *)data1, (wchar_t *)data2, len);
11018 /* normalize result of wmemcmp() into the range [-1; 1] */
11019 if (cmp < 0)
11020 return -1;
11021 if (cmp > 0)
11022 return 1;
11023#else
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011024 COMPARE(Py_UCS4, Py_UCS4);
Victor Stinnercd777ea2013-04-08 22:43:44 +020011025#endif
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011026 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020011027 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011028 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011029 Py_UNREACHABLE();
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011030 }
11031 break;
11032 }
11033 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011034 Py_UNREACHABLE();
Marc-André Lemburge5034372000-08-08 08:04:29 +000011035 }
11036
Victor Stinner770e19e2012-10-04 22:59:45 +020011037 if (len1 == len2)
11038 return 0;
11039 if (len1 < len2)
11040 return -1;
11041 else
11042 return 1;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011043
11044#undef COMPARE
Marc-André Lemburge5034372000-08-08 08:04:29 +000011045}
11046
Benjamin Peterson621b4302016-09-09 13:54:34 -070011047static int
Victor Stinnere5567ad2012-10-23 02:48:49 +020011048unicode_compare_eq(PyObject *str1, PyObject *str2)
11049{
11050 int kind;
11051 void *data1, *data2;
11052 Py_ssize_t len;
11053 int cmp;
11054
Victor Stinnere5567ad2012-10-23 02:48:49 +020011055 len = PyUnicode_GET_LENGTH(str1);
11056 if (PyUnicode_GET_LENGTH(str2) != len)
11057 return 0;
11058 kind = PyUnicode_KIND(str1);
11059 if (PyUnicode_KIND(str2) != kind)
11060 return 0;
11061 data1 = PyUnicode_DATA(str1);
11062 data2 = PyUnicode_DATA(str2);
11063
11064 cmp = memcmp(data1, data2, len * kind);
11065 return (cmp == 0);
11066}
11067
11068
Alexander Belopolsky40018472011-02-26 01:02:56 +000011069int
11070PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011071{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011072 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
11073 if (PyUnicode_READY(left) == -1 ||
11074 PyUnicode_READY(right) == -1)
11075 return -1;
Victor Stinnerf0c7b2a2013-11-04 11:27:14 +010011076
11077 /* a string is equal to itself */
11078 if (left == right)
11079 return 0;
11080
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011081 return unicode_compare(left, right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011082 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +000011083 PyErr_Format(PyExc_TypeError,
11084 "Can't compare %.100s and %.100s",
11085 left->ob_type->tp_name,
11086 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011087 return -1;
11088}
11089
Martin v. Löwis5b222132007-06-10 09:51:05 +000011090int
11091PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
11092{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011093 Py_ssize_t i;
11094 int kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011095 Py_UCS4 chr;
Serhiy Storchaka419967b2016-12-06 00:13:34 +020011096 const unsigned char *ustr = (const unsigned char *)str;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011097
Victor Stinner910337b2011-10-03 03:20:16 +020011098 assert(_PyUnicode_CHECK(uni));
Serhiy Storchaka419967b2016-12-06 00:13:34 +020011099 if (!PyUnicode_IS_READY(uni)) {
11100 const wchar_t *ws = _PyUnicode_WSTR(uni);
11101 /* Compare Unicode string and source character set string */
11102 for (i = 0; (chr = ws[i]) && ustr[i]; i++) {
11103 if (chr != ustr[i])
11104 return (chr < ustr[i]) ? -1 : 1;
11105 }
11106 /* This check keeps Python strings that end in '\0' from comparing equal
11107 to C strings identical up to that point. */
11108 if (_PyUnicode_WSTR_LENGTH(uni) != i || chr)
11109 return 1; /* uni is longer */
11110 if (ustr[i])
11111 return -1; /* str is longer */
11112 return 0;
11113 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011114 kind = PyUnicode_KIND(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010011115 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnera6b9b072013-10-30 18:27:13 +010011116 const void *data = PyUnicode_1BYTE_DATA(uni);
Victor Stinnere1b15922013-11-03 13:53:12 +010011117 size_t len1 = (size_t)PyUnicode_GET_LENGTH(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010011118 size_t len, len2 = strlen(str);
11119 int cmp;
11120
11121 len = Py_MIN(len1, len2);
11122 cmp = memcmp(data, str, len);
Victor Stinner21ea21e2013-11-04 11:28:26 +010011123 if (cmp != 0) {
11124 if (cmp < 0)
11125 return -1;
11126 else
11127 return 1;
11128 }
Victor Stinner602f7cf2013-10-29 23:31:50 +010011129 if (len1 > len2)
11130 return 1; /* uni is longer */
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011131 if (len1 < len2)
Victor Stinner602f7cf2013-10-29 23:31:50 +010011132 return -1; /* str is longer */
11133 return 0;
11134 }
11135 else {
11136 void *data = PyUnicode_DATA(uni);
11137 /* Compare Unicode string and source character set string */
11138 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
Victor Stinner12174a52014-08-15 23:17:38 +020011139 if (chr != (unsigned char)str[i])
Victor Stinner602f7cf2013-10-29 23:31:50 +010011140 return (chr < (unsigned char)(str[i])) ? -1 : 1;
11141 /* This check keeps Python strings that end in '\0' from comparing equal
11142 to C strings identical up to that point. */
11143 if (PyUnicode_GET_LENGTH(uni) != i || chr)
11144 return 1; /* uni is longer */
11145 if (str[i])
11146 return -1; /* str is longer */
11147 return 0;
11148 }
Martin v. Löwis5b222132007-06-10 09:51:05 +000011149}
11150
Serhiy Storchakaf4934ea2016-11-16 10:17:58 +020011151static int
11152non_ready_unicode_equal_to_ascii_string(PyObject *unicode, const char *str)
11153{
11154 size_t i, len;
11155 const wchar_t *p;
11156 len = (size_t)_PyUnicode_WSTR_LENGTH(unicode);
11157 if (strlen(str) != len)
11158 return 0;
11159 p = _PyUnicode_WSTR(unicode);
11160 assert(p);
11161 for (i = 0; i < len; i++) {
11162 unsigned char c = (unsigned char)str[i];
Serhiy Storchaka292dd1b2016-11-16 16:12:34 +020011163 if (c >= 128 || p[i] != (wchar_t)c)
Serhiy Storchakaf4934ea2016-11-16 10:17:58 +020011164 return 0;
11165 }
11166 return 1;
11167}
11168
11169int
11170_PyUnicode_EqualToASCIIString(PyObject *unicode, const char *str)
11171{
11172 size_t len;
11173 assert(_PyUnicode_CHECK(unicode));
Serhiy Storchakaa83a6a32016-11-16 20:02:44 +020011174 assert(str);
11175#ifndef NDEBUG
11176 for (const char *p = str; *p; p++) {
11177 assert((unsigned char)*p < 128);
11178 }
11179#endif
Serhiy Storchakaf4934ea2016-11-16 10:17:58 +020011180 if (PyUnicode_READY(unicode) == -1) {
11181 /* Memory error or bad data */
11182 PyErr_Clear();
11183 return non_ready_unicode_equal_to_ascii_string(unicode, str);
11184 }
11185 if (!PyUnicode_IS_ASCII(unicode))
11186 return 0;
11187 len = (size_t)PyUnicode_GET_LENGTH(unicode);
11188 return strlen(str) == len &&
11189 memcmp(PyUnicode_1BYTE_DATA(unicode), str, len) == 0;
11190}
11191
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011192int
11193_PyUnicode_EqualToASCIIId(PyObject *left, _Py_Identifier *right)
11194{
11195 PyObject *right_uni;
11196 Py_hash_t hash;
11197
11198 assert(_PyUnicode_CHECK(left));
11199 assert(right->string);
Serhiy Storchakaa83a6a32016-11-16 20:02:44 +020011200#ifndef NDEBUG
11201 for (const char *p = right->string; *p; p++) {
11202 assert((unsigned char)*p < 128);
11203 }
11204#endif
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011205
11206 if (PyUnicode_READY(left) == -1) {
11207 /* memory error or bad data */
11208 PyErr_Clear();
11209 return non_ready_unicode_equal_to_ascii_string(left, right->string);
11210 }
11211
11212 if (!PyUnicode_IS_ASCII(left))
11213 return 0;
11214
11215 right_uni = _PyUnicode_FromId(right); /* borrowed */
11216 if (right_uni == NULL) {
11217 /* memory error or bad data */
11218 PyErr_Clear();
11219 return _PyUnicode_EqualToASCIIString(left, right->string);
11220 }
11221
11222 if (left == right_uni)
11223 return 1;
11224
11225 if (PyUnicode_CHECK_INTERNED(left))
11226 return 0;
11227
INADA Naoki7cc95f52018-01-28 02:07:09 +090011228 assert(_PyUnicode_HASH(right_uni) != -1);
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011229 hash = _PyUnicode_HASH(left);
11230 if (hash != -1 && hash != _PyUnicode_HASH(right_uni))
11231 return 0;
11232
11233 return unicode_compare_eq(left, right_uni);
11234}
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000011235
Alexander Belopolsky40018472011-02-26 01:02:56 +000011236PyObject *
11237PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011238{
11239 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011240
Victor Stinnere5567ad2012-10-23 02:48:49 +020011241 if (!PyUnicode_Check(left) || !PyUnicode_Check(right))
11242 Py_RETURN_NOTIMPLEMENTED;
11243
11244 if (PyUnicode_READY(left) == -1 ||
11245 PyUnicode_READY(right) == -1)
11246 return NULL;
11247
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010011248 if (left == right) {
11249 switch (op) {
11250 case Py_EQ:
11251 case Py_LE:
11252 case Py_GE:
11253 /* a string is equal to itself */
stratakise8b19652017-11-02 11:32:54 +010011254 Py_RETURN_TRUE;
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010011255 case Py_NE:
11256 case Py_LT:
11257 case Py_GT:
stratakise8b19652017-11-02 11:32:54 +010011258 Py_RETURN_FALSE;
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010011259 default:
11260 PyErr_BadArgument();
11261 return NULL;
11262 }
11263 }
11264 else if (op == Py_EQ || op == Py_NE) {
Victor Stinnere5567ad2012-10-23 02:48:49 +020011265 result = unicode_compare_eq(left, right);
Victor Stinnerc8bc5372013-11-04 11:08:10 +010011266 result ^= (op == Py_NE);
stratakise8b19652017-11-02 11:32:54 +010011267 return PyBool_FromLong(result);
Victor Stinnere5567ad2012-10-23 02:48:49 +020011268 }
11269 else {
Victor Stinner90db9c42012-10-04 21:53:50 +020011270 result = unicode_compare(left, right);
stratakise8b19652017-11-02 11:32:54 +010011271 Py_RETURN_RICHCOMPARE(result, 0, op);
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011272 }
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011273}
11274
Alexander Belopolsky40018472011-02-26 01:02:56 +000011275int
Raymond Hettingerac2ef652015-07-04 16:04:44 -070011276_PyUnicode_EQ(PyObject *aa, PyObject *bb)
11277{
11278 return unicode_eq(aa, bb);
11279}
11280
11281int
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011282PyUnicode_Contains(PyObject *str, PyObject *substr)
Guido van Rossum403d68b2000-03-13 15:55:09 +000011283{
Victor Stinner77282cb2013-04-14 19:22:47 +020011284 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011285 void *buf1, *buf2;
11286 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011287 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011288
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011289 if (!PyUnicode_Check(substr)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011290 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020011291 "'in <string>' requires string as left operand, not %.100s",
11292 Py_TYPE(substr)->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011293 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011294 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011295 if (PyUnicode_READY(substr) == -1)
Thomas Wouters477c8d52006-05-27 19:21:47 +000011296 return -1;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011297 if (ensure_unicode(str) < 0)
11298 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011299
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011300 kind1 = PyUnicode_KIND(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011301 kind2 = PyUnicode_KIND(substr);
11302 if (kind1 < kind2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011303 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011304 len1 = PyUnicode_GET_LENGTH(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011305 len2 = PyUnicode_GET_LENGTH(substr);
11306 if (len1 < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011307 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011308 buf1 = PyUnicode_DATA(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011309 buf2 = PyUnicode_DATA(substr);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011310 if (len2 == 1) {
11311 Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
11312 result = findchar((const char *)buf1, kind1, len1, ch, 1) != -1;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011313 return result;
11314 }
11315 if (kind2 != kind1) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011316 buf2 = _PyUnicode_AsKind(substr, kind1);
11317 if (!buf2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011318 return -1;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011319 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011320
Victor Stinner77282cb2013-04-14 19:22:47 +020011321 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011322 case PyUnicode_1BYTE_KIND:
11323 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
11324 break;
11325 case PyUnicode_2BYTE_KIND:
11326 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
11327 break;
11328 case PyUnicode_4BYTE_KIND:
11329 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
11330 break;
11331 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011332 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011333 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011334
Victor Stinner77282cb2013-04-14 19:22:47 +020011335 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011336 PyMem_Free(buf2);
11337
Guido van Rossum403d68b2000-03-13 15:55:09 +000011338 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011339}
11340
Guido van Rossumd57fd912000-03-10 22:53:23 +000011341/* Concat to string or Unicode object giving a new Unicode object. */
11342
Alexander Belopolsky40018472011-02-26 01:02:56 +000011343PyObject *
11344PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011345{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011346 PyObject *result;
Victor Stinner127226b2011-10-13 01:12:34 +020011347 Py_UCS4 maxchar, maxchar2;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011348 Py_ssize_t left_len, right_len, new_len;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011349
Serhiy Storchaka004e03f2017-03-19 19:38:42 +020011350 if (ensure_unicode(left) < 0)
11351 return NULL;
11352
11353 if (!PyUnicode_Check(right)) {
11354 PyErr_Format(PyExc_TypeError,
11355 "can only concatenate str (not \"%.200s\") to str",
11356 right->ob_type->tp_name);
11357 return NULL;
11358 }
11359 if (PyUnicode_READY(right) < 0)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011360 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011361
11362 /* Shortcuts */
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011363 if (left == unicode_empty)
11364 return PyUnicode_FromObject(right);
11365 if (right == unicode_empty)
11366 return PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011367
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011368 left_len = PyUnicode_GET_LENGTH(left);
11369 right_len = PyUnicode_GET_LENGTH(right);
11370 if (left_len > PY_SSIZE_T_MAX - right_len) {
Victor Stinner488fa492011-12-12 00:01:39 +010011371 PyErr_SetString(PyExc_OverflowError,
11372 "strings are too large to concat");
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011373 return NULL;
Victor Stinner488fa492011-12-12 00:01:39 +010011374 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011375 new_len = left_len + right_len;
Victor Stinner488fa492011-12-12 00:01:39 +010011376
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011377 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11378 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Benjamin Peterson7e303732013-06-10 09:19:46 -070011379 maxchar = Py_MAX(maxchar, maxchar2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011380
Guido van Rossumd57fd912000-03-10 22:53:23 +000011381 /* Concat the two Unicode strings */
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011382 result = PyUnicode_New(new_len, maxchar);
11383 if (result == NULL)
11384 return NULL;
11385 _PyUnicode_FastCopyCharacters(result, 0, left, 0, left_len);
11386 _PyUnicode_FastCopyCharacters(result, left_len, right, 0, right_len);
11387 assert(_PyUnicode_CheckConsistency(result, 1));
11388 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011389}
11390
Walter Dörwald1ab83302007-05-18 17:15:44 +000011391void
Victor Stinner23e56682011-10-03 03:54:37 +020011392PyUnicode_Append(PyObject **p_left, PyObject *right)
Walter Dörwald1ab83302007-05-18 17:15:44 +000011393{
Victor Stinner23e56682011-10-03 03:54:37 +020011394 PyObject *left, *res;
Victor Stinner488fa492011-12-12 00:01:39 +010011395 Py_UCS4 maxchar, maxchar2;
11396 Py_ssize_t left_len, right_len, new_len;
Victor Stinner23e56682011-10-03 03:54:37 +020011397
11398 if (p_left == NULL) {
11399 if (!PyErr_Occurred())
11400 PyErr_BadInternalCall();
Benjamin Peterson14339b62009-01-31 16:36:08 +000011401 return;
11402 }
Victor Stinner23e56682011-10-03 03:54:37 +020011403 left = *p_left;
Victor Stinnerf0335102013-04-14 19:13:03 +020011404 if (right == NULL || left == NULL
11405 || !PyUnicode_Check(left) || !PyUnicode_Check(right)) {
Victor Stinner23e56682011-10-03 03:54:37 +020011406 if (!PyErr_Occurred())
11407 PyErr_BadInternalCall();
11408 goto error;
11409 }
11410
Benjamin Petersonbac79492012-01-14 13:34:47 -050011411 if (PyUnicode_READY(left) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011412 goto error;
Benjamin Petersonbac79492012-01-14 13:34:47 -050011413 if (PyUnicode_READY(right) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011414 goto error;
11415
Victor Stinner488fa492011-12-12 00:01:39 +010011416 /* Shortcuts */
11417 if (left == unicode_empty) {
11418 Py_DECREF(left);
11419 Py_INCREF(right);
11420 *p_left = right;
11421 return;
11422 }
11423 if (right == unicode_empty)
11424 return;
11425
11426 left_len = PyUnicode_GET_LENGTH(left);
11427 right_len = PyUnicode_GET_LENGTH(right);
11428 if (left_len > PY_SSIZE_T_MAX - right_len) {
11429 PyErr_SetString(PyExc_OverflowError,
11430 "strings are too large to concat");
11431 goto error;
11432 }
11433 new_len = left_len + right_len;
11434
11435 if (unicode_modifiable(left)
11436 && PyUnicode_CheckExact(right)
11437 && PyUnicode_KIND(right) <= PyUnicode_KIND(left)
Victor Stinnerb0923652011-10-04 01:17:31 +020011438 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
11439 to change the structure size, but characters are stored just after
Georg Brandl7597add2011-10-05 16:36:47 +020011440 the structure, and so it requires to move all characters which is
Victor Stinnerb0923652011-10-04 01:17:31 +020011441 not so different than duplicating the string. */
Victor Stinner488fa492011-12-12 00:01:39 +010011442 && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
11443 {
11444 /* append inplace */
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011445 if (unicode_resize(p_left, new_len) != 0)
Victor Stinner488fa492011-12-12 00:01:39 +010011446 goto error;
Victor Stinnerf0335102013-04-14 19:13:03 +020011447
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011448 /* copy 'right' into the newly allocated area of 'left' */
11449 _PyUnicode_FastCopyCharacters(*p_left, left_len, right, 0, right_len);
Victor Stinner23e56682011-10-03 03:54:37 +020011450 }
Victor Stinner488fa492011-12-12 00:01:39 +010011451 else {
11452 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11453 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Benjamin Peterson7e303732013-06-10 09:19:46 -070011454 maxchar = Py_MAX(maxchar, maxchar2);
Victor Stinner23e56682011-10-03 03:54:37 +020011455
Victor Stinner488fa492011-12-12 00:01:39 +010011456 /* Concat the two Unicode strings */
11457 res = PyUnicode_New(new_len, maxchar);
11458 if (res == NULL)
11459 goto error;
Victor Stinnerd3f08822012-05-29 12:57:52 +020011460 _PyUnicode_FastCopyCharacters(res, 0, left, 0, left_len);
11461 _PyUnicode_FastCopyCharacters(res, left_len, right, 0, right_len);
Victor Stinner488fa492011-12-12 00:01:39 +010011462 Py_DECREF(left);
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011463 *p_left = res;
Victor Stinner488fa492011-12-12 00:01:39 +010011464 }
11465 assert(_PyUnicode_CheckConsistency(*p_left, 1));
Victor Stinner23e56682011-10-03 03:54:37 +020011466 return;
11467
11468error:
Victor Stinner488fa492011-12-12 00:01:39 +010011469 Py_CLEAR(*p_left);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011470}
11471
11472void
11473PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
11474{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011475 PyUnicode_Append(pleft, right);
11476 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011477}
11478
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011479/*
11480Wraps stringlib_parse_args_finds() and additionally ensures that the
11481first argument is a unicode object.
11482*/
11483
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070011484static inline int
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011485parse_args_finds_unicode(const char * function_name, PyObject *args,
11486 PyObject **substring,
11487 Py_ssize_t *start, Py_ssize_t *end)
11488{
11489 if(stringlib_parse_args_finds(function_name, args, substring,
11490 start, end)) {
11491 if (ensure_unicode(*substring) < 0)
11492 return 0;
11493 return 1;
11494 }
11495 return 0;
11496}
11497
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011498PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011499 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011500\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011501Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011502string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011503interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011504
11505static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011506unicode_count(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011507{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011508 PyObject *substring = NULL; /* initialize to fix a compiler warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +000011509 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011510 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011511 PyObject *result;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011512 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011513 void *buf1, *buf2;
11514 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011515
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011516 if (!parse_args_finds_unicode("count", args, &substring, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000011517 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000011518
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011519 kind1 = PyUnicode_KIND(self);
11520 kind2 = PyUnicode_KIND(substring);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011521 if (kind1 < kind2)
Benjamin Petersonb63f49f2012-05-03 18:31:07 -040011522 return PyLong_FromLong(0);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011523
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011524 len1 = PyUnicode_GET_LENGTH(self);
11525 len2 = PyUnicode_GET_LENGTH(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011526 ADJUST_INDICES(start, end, len1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011527 if (end - start < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011528 return PyLong_FromLong(0);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011529
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011530 buf1 = PyUnicode_DATA(self);
11531 buf2 = PyUnicode_DATA(substring);
11532 if (kind2 != kind1) {
11533 buf2 = _PyUnicode_AsKind(substring, kind1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011534 if (!buf2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011535 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011536 }
11537 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011538 case PyUnicode_1BYTE_KIND:
11539 iresult = ucs1lib_count(
11540 ((Py_UCS1*)buf1) + start, end - start,
11541 buf2, len2, PY_SSIZE_T_MAX
11542 );
11543 break;
11544 case PyUnicode_2BYTE_KIND:
11545 iresult = ucs2lib_count(
11546 ((Py_UCS2*)buf1) + start, end - start,
11547 buf2, len2, PY_SSIZE_T_MAX
11548 );
11549 break;
11550 case PyUnicode_4BYTE_KIND:
11551 iresult = ucs4lib_count(
11552 ((Py_UCS4*)buf1) + start, end - start,
11553 buf2, len2, PY_SSIZE_T_MAX
11554 );
11555 break;
11556 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011557 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011558 }
11559
11560 result = PyLong_FromSsize_t(iresult);
11561
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011562 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011563 PyMem_Free(buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011564
Guido van Rossumd57fd912000-03-10 22:53:23 +000011565 return result;
11566}
11567
INADA Naoki3ae20562017-01-16 20:41:20 +090011568/*[clinic input]
11569str.encode as unicode_encode
11570
11571 encoding: str(c_default="NULL") = 'utf-8'
11572 The encoding in which to encode the string.
11573 errors: str(c_default="NULL") = 'strict'
11574 The error handling scheme to use for encoding errors.
11575 The default is 'strict' meaning that encoding errors raise a
11576 UnicodeEncodeError. Other possible values are 'ignore', 'replace' and
11577 'xmlcharrefreplace' as well as any other name registered with
11578 codecs.register_error that can handle UnicodeEncodeErrors.
11579
11580Encode the string using the codec registered for encoding.
11581[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011582
11583static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090011584unicode_encode_impl(PyObject *self, const char *encoding, const char *errors)
INADA Naoki15f94592017-01-16 21:49:13 +090011585/*[clinic end generated code: output=bf78b6e2a9470e3c input=f0a9eb293d08fe02]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011586{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011587 return PyUnicode_AsEncodedString(self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +000011588}
11589
INADA Naoki3ae20562017-01-16 20:41:20 +090011590/*[clinic input]
11591str.expandtabs as unicode_expandtabs
Guido van Rossumd57fd912000-03-10 22:53:23 +000011592
INADA Naoki3ae20562017-01-16 20:41:20 +090011593 tabsize: int = 8
11594
11595Return a copy where all tab characters are expanded using spaces.
11596
11597If tabsize is not given, a tab size of 8 characters is assumed.
11598[clinic start generated code]*/
11599
11600static PyObject *
11601unicode_expandtabs_impl(PyObject *self, int tabsize)
11602/*[clinic end generated code: output=3457c5dcee26928f input=8a01914034af4c85]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011603{
Antoine Pitroue71d5742011-10-04 15:55:09 +020011604 Py_ssize_t i, j, line_pos, src_len, incr;
11605 Py_UCS4 ch;
11606 PyObject *u;
11607 void *src_data, *dest_data;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011608 int kind;
Antoine Pitroue19aa382011-10-04 16:04:01 +020011609 int found;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011610
Antoine Pitrou22425222011-10-04 19:10:51 +020011611 if (PyUnicode_READY(self) == -1)
11612 return NULL;
11613
Thomas Wouters7e474022000-07-16 12:04:32 +000011614 /* First pass: determine size of output string */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011615 src_len = PyUnicode_GET_LENGTH(self);
11616 i = j = line_pos = 0;
11617 kind = PyUnicode_KIND(self);
11618 src_data = PyUnicode_DATA(self);
Antoine Pitroue19aa382011-10-04 16:04:01 +020011619 found = 0;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011620 for (; i < src_len; i++) {
11621 ch = PyUnicode_READ(kind, src_data, i);
11622 if (ch == '\t') {
Antoine Pitroue19aa382011-10-04 16:04:01 +020011623 found = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000011624 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011625 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
Benjamin Peterson29060642009-01-31 22:14:21 +000011626 if (j > PY_SSIZE_T_MAX - incr)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011627 goto overflow;
11628 line_pos += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011629 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011630 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011631 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011632 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000011633 if (j > PY_SSIZE_T_MAX - 1)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011634 goto overflow;
11635 line_pos++;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011636 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011637 if (ch == '\n' || ch == '\r')
11638 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011639 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011640 }
Victor Stinnerc4b49542011-12-11 22:44:26 +010011641 if (!found)
11642 return unicode_result_unchanged(self);
Guido van Rossumcd16bf62007-06-13 18:07:49 +000011643
Guido van Rossumd57fd912000-03-10 22:53:23 +000011644 /* Second pass: create output string and fill it */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011645 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011646 if (!u)
11647 return NULL;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011648 dest_data = PyUnicode_DATA(u);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011649
Antoine Pitroue71d5742011-10-04 15:55:09 +020011650 i = j = line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011651
Antoine Pitroue71d5742011-10-04 15:55:09 +020011652 for (; i < src_len; i++) {
11653 ch = PyUnicode_READ(kind, src_data, i);
11654 if (ch == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000011655 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011656 incr = tabsize - (line_pos % tabsize);
11657 line_pos += incr;
Victor Stinner59423e32018-11-26 13:40:01 +010011658 unicode_fill(kind, dest_data, ' ', j, incr);
Victor Stinnerda79e632012-02-22 13:37:04 +010011659 j += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011660 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011661 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011662 else {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011663 line_pos++;
11664 PyUnicode_WRITE(kind, dest_data, j, ch);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011665 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011666 if (ch == '\n' || ch == '\r')
11667 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011668 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011669 }
11670 assert (j == PyUnicode_GET_LENGTH(u));
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010011671 return unicode_result(u);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011672
Antoine Pitroue71d5742011-10-04 15:55:09 +020011673 overflow:
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011674 PyErr_SetString(PyExc_OverflowError, "new string is too long");
11675 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011676}
11677
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011678PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011679 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011680\n\
11681Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080011682such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011683arguments start and end are interpreted as in slice notation.\n\
11684\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011685Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011686
11687static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011688unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011689{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011690 /* initialize variables to prevent gcc warning */
11691 PyObject *substring = NULL;
11692 Py_ssize_t start = 0;
11693 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011694 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011695
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011696 if (!parse_args_finds_unicode("find", args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011697 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011698
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011699 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011700 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011701
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011702 result = any_find_slice(self, substring, start, end, 1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011703
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011704 if (result == -2)
11705 return NULL;
11706
Christian Heimes217cfd12007-12-02 14:31:20 +000011707 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011708}
11709
11710static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011711unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011712{
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011713 void *data;
11714 enum PyUnicode_Kind kind;
11715 Py_UCS4 ch;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011716
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +030011717 if (!PyUnicode_Check(self)) {
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011718 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011719 return NULL;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011720 }
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +030011721 if (PyUnicode_READY(self) == -1) {
11722 return NULL;
11723 }
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011724 if (index < 0 || index >= PyUnicode_GET_LENGTH(self)) {
11725 PyErr_SetString(PyExc_IndexError, "string index out of range");
11726 return NULL;
11727 }
11728 kind = PyUnicode_KIND(self);
11729 data = PyUnicode_DATA(self);
11730 ch = PyUnicode_READ(kind, data, index);
Victor Stinner985a82a2014-01-03 12:53:47 +010011731 return unicode_char(ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011732}
11733
Guido van Rossumc2504932007-09-18 19:42:40 +000011734/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +010011735 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +000011736static Py_hash_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011737unicode_hash(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011738{
Gregory P. Smith27cbcd62012-12-10 18:15:46 -080011739 Py_uhash_t x; /* Unsigned for defined overflow behavior. */
Guido van Rossumc2504932007-09-18 19:42:40 +000011740
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011741#ifdef Py_DEBUG
Benjamin Peterson69e97272012-02-21 11:08:50 -050011742 assert(_Py_HashSecret_Initialized);
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011743#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011744 if (_PyUnicode_HASH(self) != -1)
11745 return _PyUnicode_HASH(self);
11746 if (PyUnicode_READY(self) == -1)
11747 return -1;
animalizea1d14252019-01-02 20:16:06 +080011748
Christian Heimes985ecdc2013-11-20 11:46:18 +010011749 x = _Py_HashBytes(PyUnicode_DATA(self),
11750 PyUnicode_GET_LENGTH(self) * PyUnicode_KIND(self));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011751 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +000011752 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011753}
11754
/* Help text held in index__doc__: signature and description of S.index(),
   including the ValueError raised when the substring is not found. */
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011755PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011756 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011757\n\
oldkaa0735f2018-02-02 16:52:55 +080011758Return the lowest index in S where substring sub is found,\n\
Lisa Roach43ba8862017-04-04 22:36:22 -070011759such that sub is contained within S[start:end]. Optional\n\
11760arguments start and end are interpreted as in slice notation.\n\
11761\n\
11762Raises ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011763
11764static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011765unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011766{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011767 /* initialize variables to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +000011768 Py_ssize_t result;
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011769 PyObject *substring = NULL;
11770 Py_ssize_t start = 0;
11771 Py_ssize_t end = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011772
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011773 if (!parse_args_finds_unicode("index", args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011774 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011775
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011776 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011777 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011778
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011779 result = any_find_slice(self, substring, start, end, 1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011780
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011781 if (result == -2)
11782 return NULL;
11783
Guido van Rossumd57fd912000-03-10 22:53:23 +000011784 if (result < 0) {
11785 PyErr_SetString(PyExc_ValueError, "substring not found");
11786 return NULL;
11787 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011788
Christian Heimes217cfd12007-12-02 14:31:20 +000011789 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011790}
11791
INADA Naoki3ae20562017-01-16 20:41:20 +090011792/*[clinic input]
INADA Naokia49ac992018-01-27 14:06:21 +090011793str.isascii as unicode_isascii
11794
11795Return True if all characters in the string are ASCII, False otherwise.
11796
11797ASCII characters have code points in the range U+0000-U+007F.
11798Empty string is ASCII too.
11799[clinic start generated code]*/
11800
11801static PyObject *
11802unicode_isascii_impl(PyObject *self)
11803/*[clinic end generated code: output=c5910d64b5a8003f input=5a43cbc6399621d5]*/
11804{
11805 if (PyUnicode_READY(self) == -1) {
11806 return NULL;
11807 }
11808 return PyBool_FromLong(PyUnicode_IS_ASCII(self));
11809}
11810
11811/*[clinic input]
INADA Naoki3ae20562017-01-16 20:41:20 +090011812str.islower as unicode_islower
Guido van Rossumd57fd912000-03-10 22:53:23 +000011813
INADA Naoki3ae20562017-01-16 20:41:20 +090011814Return True if the string is a lowercase string, False otherwise.
11815
11816A string is lowercase if all cased characters in the string are lowercase and
11817there is at least one cased character in the string.
11818[clinic start generated code]*/
11819
11820static PyObject *
11821unicode_islower_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011822/*[clinic end generated code: output=dbd41995bd005b81 input=acec65ac6821ae47]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011823{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011824 Py_ssize_t i, length;
11825 int kind;
11826 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011827 int cased;
11828
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011829 if (PyUnicode_READY(self) == -1)
11830 return NULL;
11831 length = PyUnicode_GET_LENGTH(self);
11832 kind = PyUnicode_KIND(self);
11833 data = PyUnicode_DATA(self);
11834
Guido van Rossumd57fd912000-03-10 22:53:23 +000011835 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011836 if (length == 1)
11837 return PyBool_FromLong(
11838 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011839
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011840 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011841 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011842 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011843
Guido van Rossumd57fd912000-03-10 22:53:23 +000011844 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011845 for (i = 0; i < length; i++) {
11846 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011847
Benjamin Peterson29060642009-01-31 22:14:21 +000011848 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011849 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000011850 else if (!cased && Py_UNICODE_ISLOWER(ch))
11851 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011852 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011853 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011854}
11855
INADA Naoki3ae20562017-01-16 20:41:20 +090011856/*[clinic input]
11857str.isupper as unicode_isupper
Guido van Rossumd57fd912000-03-10 22:53:23 +000011858
INADA Naoki3ae20562017-01-16 20:41:20 +090011859Return True if the string is an uppercase string, False otherwise.
11860
11861A string is uppercase if all cased characters in the string are uppercase and
11862there is at least one cased character in the string.
11863[clinic start generated code]*/
11864
11865static PyObject *
11866unicode_isupper_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011867/*[clinic end generated code: output=049209c8e7f15f59 input=e9b1feda5d17f2d3]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011868{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011869 Py_ssize_t i, length;
11870 int kind;
11871 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011872 int cased;
11873
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011874 if (PyUnicode_READY(self) == -1)
11875 return NULL;
11876 length = PyUnicode_GET_LENGTH(self);
11877 kind = PyUnicode_KIND(self);
11878 data = PyUnicode_DATA(self);
11879
Guido van Rossumd57fd912000-03-10 22:53:23 +000011880 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011881 if (length == 1)
11882 return PyBool_FromLong(
11883 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011884
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011885 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011886 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011887 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011888
Guido van Rossumd57fd912000-03-10 22:53:23 +000011889 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011890 for (i = 0; i < length; i++) {
11891 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011892
Benjamin Peterson29060642009-01-31 22:14:21 +000011893 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011894 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000011895 else if (!cased && Py_UNICODE_ISUPPER(ch))
11896 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011897 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011898 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011899}
11900
INADA Naoki3ae20562017-01-16 20:41:20 +090011901/*[clinic input]
11902str.istitle as unicode_istitle
Guido van Rossumd57fd912000-03-10 22:53:23 +000011903
INADA Naoki3ae20562017-01-16 20:41:20 +090011904Return True if the string is a title-cased string, False otherwise.
11905
11906In a title-cased string, upper- and title-case characters may only
11907follow uncased characters and lowercase characters only cased ones.
11908[clinic start generated code]*/
11909
11910static PyObject *
11911unicode_istitle_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011912/*[clinic end generated code: output=e9bf6eb91f5d3f0e input=98d32bd2e1f06f8c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011913{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011914 Py_ssize_t i, length;
11915 int kind;
11916 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011917 int cased, previous_is_cased;
11918
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011919 if (PyUnicode_READY(self) == -1)
11920 return NULL;
11921 length = PyUnicode_GET_LENGTH(self);
11922 kind = PyUnicode_KIND(self);
11923 data = PyUnicode_DATA(self);
11924
Guido van Rossumd57fd912000-03-10 22:53:23 +000011925 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011926 if (length == 1) {
11927 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11928 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
11929 (Py_UNICODE_ISUPPER(ch) != 0));
11930 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011931
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011932 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011933 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011934 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011935
Guido van Rossumd57fd912000-03-10 22:53:23 +000011936 cased = 0;
11937 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011938 for (i = 0; i < length; i++) {
11939 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011940
Benjamin Peterson29060642009-01-31 22:14:21 +000011941 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
11942 if (previous_is_cased)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011943 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000011944 previous_is_cased = 1;
11945 cased = 1;
11946 }
11947 else if (Py_UNICODE_ISLOWER(ch)) {
11948 if (!previous_is_cased)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011949 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000011950 previous_is_cased = 1;
11951 cased = 1;
11952 }
11953 else
11954 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011955 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011956 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011957}
11958
INADA Naoki3ae20562017-01-16 20:41:20 +090011959/*[clinic input]
11960str.isspace as unicode_isspace
Guido van Rossumd57fd912000-03-10 22:53:23 +000011961
INADA Naoki3ae20562017-01-16 20:41:20 +090011962Return True if the string is a whitespace string, False otherwise.
11963
11964A string is whitespace if all characters in the string are whitespace and there
11965is at least one character in the string.
11966[clinic start generated code]*/
11967
11968static PyObject *
11969unicode_isspace_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011970/*[clinic end generated code: output=163a63bfa08ac2b9 input=fe462cb74f8437d8]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011971{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011972 Py_ssize_t i, length;
11973 int kind;
11974 void *data;
11975
11976 if (PyUnicode_READY(self) == -1)
11977 return NULL;
11978 length = PyUnicode_GET_LENGTH(self);
11979 kind = PyUnicode_KIND(self);
11980 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011981
Guido van Rossumd57fd912000-03-10 22:53:23 +000011982 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011983 if (length == 1)
11984 return PyBool_FromLong(
11985 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011986
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011987 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011988 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011989 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011990
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011991 for (i = 0; i < length; i++) {
11992 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011993 if (!Py_UNICODE_ISSPACE(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011994 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011995 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011996 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011997}
11998
INADA Naoki3ae20562017-01-16 20:41:20 +090011999/*[clinic input]
12000str.isalpha as unicode_isalpha
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012001
INADA Naoki3ae20562017-01-16 20:41:20 +090012002Return True if the string is an alphabetic string, False otherwise.
12003
12004A string is alphabetic if all characters in the string are alphabetic and there
12005is at least one character in the string.
12006[clinic start generated code]*/
12007
12008static PyObject *
12009unicode_isalpha_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012010/*[clinic end generated code: output=cc81b9ac3883ec4f input=d0fd18a96cbca5eb]*/
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012011{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012012 Py_ssize_t i, length;
12013 int kind;
12014 void *data;
12015
12016 if (PyUnicode_READY(self) == -1)
12017 return NULL;
12018 length = PyUnicode_GET_LENGTH(self);
12019 kind = PyUnicode_KIND(self);
12020 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012021
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012022 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012023 if (length == 1)
12024 return PyBool_FromLong(
12025 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012026
12027 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012028 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012029 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012030
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012031 for (i = 0; i < length; i++) {
12032 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012033 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012034 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012035 Py_RETURN_TRUE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012036}
12037
INADA Naoki3ae20562017-01-16 20:41:20 +090012038/*[clinic input]
12039str.isalnum as unicode_isalnum
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012040
INADA Naoki3ae20562017-01-16 20:41:20 +090012041Return True if the string is an alpha-numeric string, False otherwise.
12042
12043A string is alpha-numeric if all characters in the string are alpha-numeric and
12044there is at least one character in the string.
12045[clinic start generated code]*/
12046
12047static PyObject *
12048unicode_isalnum_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012049/*[clinic end generated code: output=a5a23490ffc3660c input=5c6579bf2e04758c]*/
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012050{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012051 int kind;
12052 void *data;
12053 Py_ssize_t len, i;
12054
12055 if (PyUnicode_READY(self) == -1)
12056 return NULL;
12057
12058 kind = PyUnicode_KIND(self);
12059 data = PyUnicode_DATA(self);
12060 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012061
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012062 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012063 if (len == 1) {
12064 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
12065 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
12066 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012067
12068 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012069 if (len == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012070 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012071
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012072 for (i = 0; i < len; i++) {
12073 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030012074 if (!Py_UNICODE_ISALNUM(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012075 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012076 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012077 Py_RETURN_TRUE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012078}
12079
INADA Naoki3ae20562017-01-16 20:41:20 +090012080/*[clinic input]
12081str.isdecimal as unicode_isdecimal
Guido van Rossumd57fd912000-03-10 22:53:23 +000012082
INADA Naoki3ae20562017-01-16 20:41:20 +090012083Return True if the string is a decimal string, False otherwise.
12084
12085A string is a decimal string if all characters in the string are decimal and
12086there is at least one character in the string.
12087[clinic start generated code]*/
12088
12089static PyObject *
12090unicode_isdecimal_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012091/*[clinic end generated code: output=fb2dcdb62d3fc548 input=336bc97ab4c8268f]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012092{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012093 Py_ssize_t i, length;
12094 int kind;
12095 void *data;
12096
12097 if (PyUnicode_READY(self) == -1)
12098 return NULL;
12099 length = PyUnicode_GET_LENGTH(self);
12100 kind = PyUnicode_KIND(self);
12101 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012102
Guido van Rossumd57fd912000-03-10 22:53:23 +000012103 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012104 if (length == 1)
12105 return PyBool_FromLong(
12106 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012107
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012108 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012109 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012110 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012111
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012112 for (i = 0; i < length; i++) {
12113 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012114 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012115 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012116 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012117}
12118
INADA Naoki3ae20562017-01-16 20:41:20 +090012119/*[clinic input]
12120str.isdigit as unicode_isdigit
Guido van Rossumd57fd912000-03-10 22:53:23 +000012121
INADA Naoki3ae20562017-01-16 20:41:20 +090012122Return True if the string is a digit string, False otherwise.
12123
12124A string is a digit string if all characters in the string are digits and there
12125is at least one character in the string.
12126[clinic start generated code]*/
12127
12128static PyObject *
12129unicode_isdigit_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012130/*[clinic end generated code: output=10a6985311da6858 input=901116c31deeea4c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012131{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012132 Py_ssize_t i, length;
12133 int kind;
12134 void *data;
12135
12136 if (PyUnicode_READY(self) == -1)
12137 return NULL;
12138 length = PyUnicode_GET_LENGTH(self);
12139 kind = PyUnicode_KIND(self);
12140 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012141
Guido van Rossumd57fd912000-03-10 22:53:23 +000012142 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012143 if (length == 1) {
12144 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
12145 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
12146 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012147
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012148 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012149 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012150 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012151
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012152 for (i = 0; i < length; i++) {
12153 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012154 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012155 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012156 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012157}
12158
INADA Naoki3ae20562017-01-16 20:41:20 +090012159/*[clinic input]
12160str.isnumeric as unicode_isnumeric
Guido van Rossumd57fd912000-03-10 22:53:23 +000012161
INADA Naoki3ae20562017-01-16 20:41:20 +090012162Return True if the string is a numeric string, False otherwise.
12163
12164A string is numeric if all characters in the string are numeric and there is at
12165least one character in the string.
12166[clinic start generated code]*/
12167
12168static PyObject *
12169unicode_isnumeric_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012170/*[clinic end generated code: output=9172a32d9013051a input=722507db976f826c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012171{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012172 Py_ssize_t i, length;
12173 int kind;
12174 void *data;
12175
12176 if (PyUnicode_READY(self) == -1)
12177 return NULL;
12178 length = PyUnicode_GET_LENGTH(self);
12179 kind = PyUnicode_KIND(self);
12180 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012181
Guido van Rossumd57fd912000-03-10 22:53:23 +000012182 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012183 if (length == 1)
12184 return PyBool_FromLong(
12185 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012186
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012187 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012188 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012189 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012190
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012191 for (i = 0; i < length; i++) {
12192 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012193 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012194 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012195 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012196}
12197
Martin v. Löwis47383402007-08-15 07:32:56 +000012198int
12199PyUnicode_IsIdentifier(PyObject *self)
12200{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012201 int kind;
12202 void *data;
12203 Py_ssize_t i;
Ezio Melotti93e7afc2011-08-22 14:08:38 +030012204 Py_UCS4 first;
Martin v. Löwis47383402007-08-15 07:32:56 +000012205
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012206 if (PyUnicode_READY(self) == -1) {
12207 Py_FatalError("identifier not ready");
Benjamin Peterson29060642009-01-31 22:14:21 +000012208 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012209 }
12210
12211 /* Special case for empty strings */
12212 if (PyUnicode_GET_LENGTH(self) == 0)
12213 return 0;
12214 kind = PyUnicode_KIND(self);
12215 data = PyUnicode_DATA(self);
Martin v. Löwis47383402007-08-15 07:32:56 +000012216
12217 /* PEP 3131 says that the first character must be in
12218 XID_Start and subsequent characters in XID_Continue,
12219 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000012220 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000012221 letters, digits, underscore). However, given the current
12222 definition of XID_Start and XID_Continue, it is sufficient
12223 to check just for these, except that _ must be allowed
12224 as starting an identifier. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012225 first = PyUnicode_READ(kind, data, 0);
Benjamin Petersonf413b802011-08-12 22:17:18 -050012226 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
Martin v. Löwis47383402007-08-15 07:32:56 +000012227 return 0;
12228
Benjamin Peterson9c6e6a02011-09-28 08:09:05 -040012229 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012230 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000012231 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +000012232 return 1;
12233}
12234
INADA Naoki3ae20562017-01-16 20:41:20 +090012235/*[clinic input]
12236str.isidentifier as unicode_isidentifier
Martin v. Löwis47383402007-08-15 07:32:56 +000012237
INADA Naoki3ae20562017-01-16 20:41:20 +090012238Return True if the string is a valid Python identifier, False otherwise.
12239
Sanyam Khuranaffc5a142018-10-08 12:23:32 +053012240Call keyword.iskeyword(s) to test whether string s is a reserved identifier,
Emanuele Gaifasfc8205c2018-10-08 12:44:47 +020012241such as "def" or "class".
INADA Naoki3ae20562017-01-16 20:41:20 +090012242[clinic start generated code]*/
12243
12244static PyObject *
12245unicode_isidentifier_impl(PyObject *self)
Emanuele Gaifasfc8205c2018-10-08 12:44:47 +020012246/*[clinic end generated code: output=fe585a9666572905 input=2d807a104f21c0c5]*/
Martin v. Löwis47383402007-08-15 07:32:56 +000012247{
12248 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
12249}
12250
INADA Naoki3ae20562017-01-16 20:41:20 +090012251/*[clinic input]
12252str.isprintable as unicode_isprintable
Georg Brandl559e5d72008-06-11 18:37:52 +000012253
INADA Naoki3ae20562017-01-16 20:41:20 +090012254Return True if the string is printable, False otherwise.
12255
12256A string is printable if all of its characters are considered printable in
12257repr() or if it is empty.
12258[clinic start generated code]*/
12259
12260static PyObject *
12261unicode_isprintable_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012262/*[clinic end generated code: output=3ab9626cd32dd1a0 input=98a0e1c2c1813209]*/
Georg Brandl559e5d72008-06-11 18:37:52 +000012263{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012264 Py_ssize_t i, length;
12265 int kind;
12266 void *data;
12267
12268 if (PyUnicode_READY(self) == -1)
12269 return NULL;
12270 length = PyUnicode_GET_LENGTH(self);
12271 kind = PyUnicode_KIND(self);
12272 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000012273
12274 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012275 if (length == 1)
12276 return PyBool_FromLong(
12277 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000012278
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012279 for (i = 0; i < length; i++) {
12280 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000012281 Py_RETURN_FALSE;
12282 }
12283 }
12284 Py_RETURN_TRUE;
12285}
12286
INADA Naoki3ae20562017-01-16 20:41:20 +090012287/*[clinic input]
12288str.join as unicode_join
Guido van Rossumd57fd912000-03-10 22:53:23 +000012289
INADA Naoki3ae20562017-01-16 20:41:20 +090012290 iterable: object
12291 /
12292
12293Concatenate any number of strings.
12294
Martin Panter91a88662017-01-24 00:30:06 +000012295The string whose method is called is inserted in between each given string.
INADA Naoki3ae20562017-01-16 20:41:20 +090012296The result is returned as a new string.
12297
12298Example: '.'.join(['ab', 'pq', 'rs']) -> 'ab.pq.rs'
12299[clinic start generated code]*/
12300
12301static PyObject *
12302unicode_join(PyObject *self, PyObject *iterable)
Martin Panter91a88662017-01-24 00:30:06 +000012303/*[clinic end generated code: output=6857e7cecfe7bf98 input=2f70422bfb8fa189]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012304{
INADA Naoki3ae20562017-01-16 20:41:20 +090012305 return PyUnicode_Join(self, iterable);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012306}
12307
Martin v. Löwis18e16552006-02-15 17:27:45 +000012308static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012309unicode_length(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012310{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012311 if (PyUnicode_READY(self) == -1)
12312 return -1;
12313 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012314}
12315
INADA Naoki3ae20562017-01-16 20:41:20 +090012316/*[clinic input]
12317str.ljust as unicode_ljust
12318
12319 width: Py_ssize_t
12320 fillchar: Py_UCS4 = ' '
12321 /
12322
12323Return a left-justified string of length width.
12324
12325Padding is done using the specified fill character (default is a space).
12326[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012327
12328static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012329unicode_ljust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
12330/*[clinic end generated code: output=1cce0e0e0a0b84b3 input=3ab599e335e60a32]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012331{
Benjamin Petersonbac79492012-01-14 13:34:47 -050012332 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012333 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012334
Victor Stinnerc4b49542011-12-11 22:44:26 +010012335 if (PyUnicode_GET_LENGTH(self) >= width)
12336 return unicode_result_unchanged(self);
12337
12338 return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012339}
12340
INADA Naoki3ae20562017-01-16 20:41:20 +090012341/*[clinic input]
12342str.lower as unicode_lower
Guido van Rossumd57fd912000-03-10 22:53:23 +000012343
INADA Naoki3ae20562017-01-16 20:41:20 +090012344Return a copy of the string converted to lowercase.
12345[clinic start generated code]*/
12346
12347static PyObject *
12348unicode_lower_impl(PyObject *self)
12349/*[clinic end generated code: output=84ef9ed42efad663 input=60a2984b8beff23a]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012350{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050012351 if (PyUnicode_READY(self) == -1)
12352 return NULL;
12353 if (PyUnicode_IS_ASCII(self))
12354 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010012355 return case_operation(self, do_lower);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012356}
12357
/* Strip-mode selectors.  Their values double as indexes into
   stripfuncnames below. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Method names, indexed by the selectors above. */
static const char *stripfuncnames[] = {
    "lstrip",   /* LEFTSTRIP */
    "rstrip",   /* RIGHTSTRIP */
    "strip"     /* BOTHSTRIP */
};

/* Method name for strip-mode selector i. */
#define STRIPNAME(i) (stripfuncnames[i])
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012366
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012367/* externally visible for str.strip(unicode) */
12368PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012369_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012370{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012371 void *data;
12372 int kind;
12373 Py_ssize_t i, j, len;
12374 BLOOM_MASK sepmask;
Victor Stinnerb3a60142013-04-09 22:19:21 +020012375 Py_ssize_t seplen;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012376
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012377 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
12378 return NULL;
12379
12380 kind = PyUnicode_KIND(self);
12381 data = PyUnicode_DATA(self);
12382 len = PyUnicode_GET_LENGTH(self);
Victor Stinnerb3a60142013-04-09 22:19:21 +020012383 seplen = PyUnicode_GET_LENGTH(sepobj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012384 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
12385 PyUnicode_DATA(sepobj),
Victor Stinnerb3a60142013-04-09 22:19:21 +020012386 seplen);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012387
Benjamin Peterson14339b62009-01-31 16:36:08 +000012388 i = 0;
12389 if (striptype != RIGHTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020012390 while (i < len) {
12391 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12392 if (!BLOOM(sepmask, ch))
12393 break;
12394 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12395 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000012396 i++;
12397 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012398 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012399
Benjamin Peterson14339b62009-01-31 16:36:08 +000012400 j = len;
12401 if (striptype != LEFTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020012402 j--;
12403 while (j >= i) {
12404 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12405 if (!BLOOM(sepmask, ch))
12406 break;
12407 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12408 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000012409 j--;
Victor Stinnerb3a60142013-04-09 22:19:21 +020012410 }
12411
Benjamin Peterson29060642009-01-31 22:14:21 +000012412 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012413 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012414
Victor Stinner7931d9a2011-11-04 00:22:48 +010012415 return PyUnicode_Substring(self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012416}
12417
12418PyObject*
12419PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
12420{
12421 unsigned char *data;
12422 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020012423 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012424
Victor Stinnerde636f32011-10-01 03:55:54 +020012425 if (PyUnicode_READY(self) == -1)
12426 return NULL;
12427
Victor Stinner684d5fd2012-05-03 02:32:34 +020012428 length = PyUnicode_GET_LENGTH(self);
12429 end = Py_MIN(end, length);
Victor Stinnerde636f32011-10-01 03:55:54 +020012430
Victor Stinner684d5fd2012-05-03 02:32:34 +020012431 if (start == 0 && end == length)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012432 return unicode_result_unchanged(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012433
Victor Stinnerde636f32011-10-01 03:55:54 +020012434 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020012435 PyErr_SetString(PyExc_IndexError, "string index out of range");
12436 return NULL;
12437 }
Serhiy Storchaka678db842013-01-26 12:16:36 +020012438 if (start >= length || end < start)
12439 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner12bab6d2011-10-01 01:53:49 +020012440
Victor Stinner684d5fd2012-05-03 02:32:34 +020012441 length = end - start;
Victor Stinnerb9275c12011-10-05 14:01:42 +020012442 if (PyUnicode_IS_ASCII(self)) {
Victor Stinnerb9275c12011-10-05 14:01:42 +020012443 data = PyUnicode_1BYTE_DATA(self);
Victor Stinnerd3f08822012-05-29 12:57:52 +020012444 return _PyUnicode_FromASCII((char*)(data + start), length);
Victor Stinnerb9275c12011-10-05 14:01:42 +020012445 }
12446 else {
12447 kind = PyUnicode_KIND(self);
12448 data = PyUnicode_1BYTE_DATA(self);
12449 return PyUnicode_FromKindAndData(kind,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012450 data + kind * start,
Victor Stinnerb9275c12011-10-05 14:01:42 +020012451 length);
12452 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012453}
Guido van Rossumd57fd912000-03-10 22:53:23 +000012454
12455static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012456do_strip(PyObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012457{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012458 Py_ssize_t len, i, j;
12459
12460 if (PyUnicode_READY(self) == -1)
12461 return NULL;
12462
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012463 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012464
Victor Stinnercc7af722013-04-09 22:39:24 +020012465 if (PyUnicode_IS_ASCII(self)) {
12466 Py_UCS1 *data = PyUnicode_1BYTE_DATA(self);
12467
12468 i = 0;
12469 if (striptype != RIGHTSTRIP) {
12470 while (i < len) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020012471 Py_UCS1 ch = data[i];
Victor Stinnercc7af722013-04-09 22:39:24 +020012472 if (!_Py_ascii_whitespace[ch])
12473 break;
12474 i++;
12475 }
12476 }
12477
12478 j = len;
12479 if (striptype != LEFTSTRIP) {
12480 j--;
12481 while (j >= i) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020012482 Py_UCS1 ch = data[j];
Victor Stinnercc7af722013-04-09 22:39:24 +020012483 if (!_Py_ascii_whitespace[ch])
12484 break;
12485 j--;
12486 }
12487 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012488 }
12489 }
Victor Stinnercc7af722013-04-09 22:39:24 +020012490 else {
12491 int kind = PyUnicode_KIND(self);
12492 void *data = PyUnicode_DATA(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012493
Victor Stinnercc7af722013-04-09 22:39:24 +020012494 i = 0;
12495 if (striptype != RIGHTSTRIP) {
12496 while (i < len) {
12497 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12498 if (!Py_UNICODE_ISSPACE(ch))
12499 break;
12500 i++;
12501 }
Victor Stinner9c79e412013-04-09 22:21:08 +020012502 }
Victor Stinnercc7af722013-04-09 22:39:24 +020012503
12504 j = len;
12505 if (striptype != LEFTSTRIP) {
12506 j--;
12507 while (j >= i) {
12508 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12509 if (!Py_UNICODE_ISSPACE(ch))
12510 break;
12511 j--;
12512 }
12513 j++;
12514 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012515 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012516
Victor Stinner7931d9a2011-11-04 00:22:48 +010012517 return PyUnicode_Substring(self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012518}
12519
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012520
12521static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012522do_argstrip(PyObject *self, int striptype, PyObject *sep)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012523{
Serhiy Storchaka279f4462019-09-14 12:24:05 +030012524 if (sep != Py_None) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012525 if (PyUnicode_Check(sep))
12526 return _PyUnicode_XStrip(self, striptype, sep);
12527 else {
12528 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000012529 "%s arg must be None or str",
12530 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000012531 return NULL;
12532 }
12533 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012534
Benjamin Peterson14339b62009-01-31 16:36:08 +000012535 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012536}
12537
12538
INADA Naoki3ae20562017-01-16 20:41:20 +090012539/*[clinic input]
12540str.strip as unicode_strip
12541
12542 chars: object = None
12543 /
12544
Zachary Ware09895c22019-10-09 16:09:00 -050012545Return a copy of the string with leading and trailing whitespace removed.
INADA Naoki3ae20562017-01-16 20:41:20 +090012546
12547If chars is given and not None, remove characters in chars instead.
12548[clinic start generated code]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012549
12550static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012551unicode_strip_impl(PyObject *self, PyObject *chars)
Zachary Ware09895c22019-10-09 16:09:00 -050012552/*[clinic end generated code: output=ca19018454345d57 input=385289c6f423b954]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012553{
INADA Naoki3ae20562017-01-16 20:41:20 +090012554 return do_argstrip(self, BOTHSTRIP, chars);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012555}
12556
12557
INADA Naoki3ae20562017-01-16 20:41:20 +090012558/*[clinic input]
12559str.lstrip as unicode_lstrip
12560
Serhiy Storchaka279f4462019-09-14 12:24:05 +030012561 chars: object = None
INADA Naoki3ae20562017-01-16 20:41:20 +090012562 /
12563
12564Return a copy of the string with leading whitespace removed.
12565
12566If chars is given and not None, remove characters in chars instead.
12567[clinic start generated code]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012568
12569static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012570unicode_lstrip_impl(PyObject *self, PyObject *chars)
Serhiy Storchaka279f4462019-09-14 12:24:05 +030012571/*[clinic end generated code: output=3b43683251f79ca7 input=529f9f3834448671]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012572{
INADA Naoki3ae20562017-01-16 20:41:20 +090012573 return do_argstrip(self, LEFTSTRIP, chars);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012574}
12575
12576
INADA Naoki3ae20562017-01-16 20:41:20 +090012577/*[clinic input]
12578str.rstrip as unicode_rstrip
12579
Serhiy Storchaka279f4462019-09-14 12:24:05 +030012580 chars: object = None
INADA Naoki3ae20562017-01-16 20:41:20 +090012581 /
12582
12583Return a copy of the string with trailing whitespace removed.
12584
12585If chars is given and not None, remove characters in chars instead.
12586[clinic start generated code]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012587
12588static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012589unicode_rstrip_impl(PyObject *self, PyObject *chars)
Serhiy Storchaka279f4462019-09-14 12:24:05 +030012590/*[clinic end generated code: output=4a59230017cc3b7a input=62566c627916557f]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012591{
INADA Naoki3ae20562017-01-16 20:41:20 +090012592 return do_argstrip(self, RIGHTSTRIP, chars);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012593}
12594
12595
Guido van Rossumd57fd912000-03-10 22:53:23 +000012596static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012597unicode_repeat(PyObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012598{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012599 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012600 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012601
Serhiy Storchaka05997252013-01-26 12:14:02 +020012602 if (len < 1)
12603 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +000012604
Victor Stinnerc4b49542011-12-11 22:44:26 +010012605 /* no repeat, return original string */
12606 if (len == 1)
12607 return unicode_result_unchanged(str);
Tim Peters8f422462000-09-09 06:13:41 +000012608
Benjamin Petersonbac79492012-01-14 13:34:47 -050012609 if (PyUnicode_READY(str) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012610 return NULL;
12611
Victor Stinnerc759f3e2011-10-01 03:09:58 +020012612 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020012613 PyErr_SetString(PyExc_OverflowError,
12614 "repeated string is too long");
12615 return NULL;
12616 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012617 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012618
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012619 u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012620 if (!u)
12621 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012622 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012623
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012624 if (PyUnicode_GET_LENGTH(str) == 1) {
12625 const int kind = PyUnicode_KIND(str);
12626 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
Victor Stinner73f53b52011-12-18 03:26:31 +010012627 if (kind == PyUnicode_1BYTE_KIND) {
12628 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012629 memset(to, (unsigned char)fill_char, len);
Victor Stinner73f53b52011-12-18 03:26:31 +010012630 }
12631 else if (kind == PyUnicode_2BYTE_KIND) {
12632 Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012633 for (n = 0; n < len; ++n)
Victor Stinner73f53b52011-12-18 03:26:31 +010012634 ucs2[n] = fill_char;
12635 } else {
12636 Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u);
12637 assert(kind == PyUnicode_4BYTE_KIND);
12638 for (n = 0; n < len; ++n)
12639 ucs4[n] = fill_char;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012640 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012641 }
12642 else {
12643 /* number of characters copied this far */
12644 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012645 const Py_ssize_t char_size = PyUnicode_KIND(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012646 char *to = (char *) PyUnicode_DATA(u);
Christian Heimesf051e432016-09-13 20:22:02 +020012647 memcpy(to, PyUnicode_DATA(str),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012648 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000012649 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012650 n = (done <= nchars-done) ? done : nchars-done;
Christian Heimesf051e432016-09-13 20:22:02 +020012651 memcpy(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012652 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000012653 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012654 }
12655
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012656 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012657 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012658}
12659
Alexander Belopolsky40018472011-02-26 01:02:56 +000012660PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012661PyUnicode_Replace(PyObject *str,
12662 PyObject *substr,
12663 PyObject *replstr,
Alexander Belopolsky40018472011-02-26 01:02:56 +000012664 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012665{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012666 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0 ||
12667 ensure_unicode(replstr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012668 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012669 return replace(str, substr, replstr, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012670}
12671
INADA Naoki3ae20562017-01-16 20:41:20 +090012672/*[clinic input]
12673str.replace as unicode_replace
Guido van Rossumd57fd912000-03-10 22:53:23 +000012674
INADA Naoki3ae20562017-01-16 20:41:20 +090012675 old: unicode
12676 new: unicode
12677 count: Py_ssize_t = -1
12678 Maximum number of occurrences to replace.
12679 -1 (the default value) means replace all occurrences.
12680 /
12681
12682Return a copy with all occurrences of substring old replaced by new.
12683
12684If the optional argument count is given, only the first count occurrences are
12685replaced.
12686[clinic start generated code]*/
12687
12688static PyObject *
12689unicode_replace_impl(PyObject *self, PyObject *old, PyObject *new,
12690 Py_ssize_t count)
12691/*[clinic end generated code: output=b63f1a8b5eebf448 input=147d12206276ebeb]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012692{
Benjamin Peterson22a29702012-01-02 09:00:30 -060012693 if (PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000012694 return NULL;
INADA Naoki3ae20562017-01-16 20:41:20 +090012695 return replace(self, old, new, count);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012696}
12697
Alexander Belopolsky40018472011-02-26 01:02:56 +000012698static PyObject *
12699unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012700{
Walter Dörwald79e913e2007-05-12 11:08:06 +000012701 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012702 Py_ssize_t isize;
12703 Py_ssize_t osize, squote, dquote, i, o;
12704 Py_UCS4 max, quote;
Victor Stinner55c08782013-04-14 18:45:39 +020012705 int ikind, okind, unchanged;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012706 void *idata, *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000012707
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012708 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000012709 return NULL;
12710
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012711 isize = PyUnicode_GET_LENGTH(unicode);
12712 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012713
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012714 /* Compute length of output, quote characters, and
12715 maximum character */
Victor Stinner55c08782013-04-14 18:45:39 +020012716 osize = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012717 max = 127;
12718 squote = dquote = 0;
12719 ikind = PyUnicode_KIND(unicode);
12720 for (i = 0; i < isize; i++) {
12721 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Benjamin Peterson736b8012014-09-29 23:02:15 -040012722 Py_ssize_t incr = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012723 switch (ch) {
Benjamin Peterson736b8012014-09-29 23:02:15 -040012724 case '\'': squote++; break;
12725 case '"': dquote++; break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012726 case '\\': case '\t': case '\r': case '\n':
Benjamin Peterson736b8012014-09-29 23:02:15 -040012727 incr = 2;
12728 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012729 default:
12730 /* Fast-path ASCII */
12731 if (ch < ' ' || ch == 0x7f)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012732 incr = 4; /* \xHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012733 else if (ch < 0x7f)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012734 ;
12735 else if (Py_UNICODE_ISPRINTABLE(ch))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012736 max = ch > max ? ch : max;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012737 else if (ch < 0x100)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012738 incr = 4; /* \xHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012739 else if (ch < 0x10000)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012740 incr = 6; /* \uHHHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012741 else
Benjamin Peterson736b8012014-09-29 23:02:15 -040012742 incr = 10; /* \uHHHHHHHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012743 }
Benjamin Peterson736b8012014-09-29 23:02:15 -040012744 if (osize > PY_SSIZE_T_MAX - incr) {
12745 PyErr_SetString(PyExc_OverflowError,
12746 "string is too long to generate repr");
12747 return NULL;
12748 }
12749 osize += incr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012750 }
12751
12752 quote = '\'';
Victor Stinner55c08782013-04-14 18:45:39 +020012753 unchanged = (osize == isize);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012754 if (squote) {
Victor Stinner55c08782013-04-14 18:45:39 +020012755 unchanged = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012756 if (dquote)
12757 /* Both squote and dquote present. Use squote,
12758 and escape them */
12759 osize += squote;
12760 else
12761 quote = '"';
12762 }
Victor Stinner55c08782013-04-14 18:45:39 +020012763 osize += 2; /* quotes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012764
12765 repr = PyUnicode_New(osize, max);
12766 if (repr == NULL)
12767 return NULL;
12768 okind = PyUnicode_KIND(repr);
12769 odata = PyUnicode_DATA(repr);
12770
12771 PyUnicode_WRITE(okind, odata, 0, quote);
12772 PyUnicode_WRITE(okind, odata, osize-1, quote);
Victor Stinner55c08782013-04-14 18:45:39 +020012773 if (unchanged) {
12774 _PyUnicode_FastCopyCharacters(repr, 1,
12775 unicode, 0,
12776 isize);
12777 }
12778 else {
12779 for (i = 0, o = 1; i < isize; i++) {
12780 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012781
Victor Stinner55c08782013-04-14 18:45:39 +020012782 /* Escape quotes and backslashes */
12783 if ((ch == quote) || (ch == '\\')) {
Kristján Valur Jónsson55e5dc82012-06-06 21:58:08 +000012784 PyUnicode_WRITE(okind, odata, o++, '\\');
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012785 PyUnicode_WRITE(okind, odata, o++, ch);
Victor Stinner55c08782013-04-14 18:45:39 +020012786 continue;
12787 }
12788
12789 /* Map special whitespace to '\t', \n', '\r' */
12790 if (ch == '\t') {
12791 PyUnicode_WRITE(okind, odata, o++, '\\');
12792 PyUnicode_WRITE(okind, odata, o++, 't');
12793 }
12794 else if (ch == '\n') {
12795 PyUnicode_WRITE(okind, odata, o++, '\\');
12796 PyUnicode_WRITE(okind, odata, o++, 'n');
12797 }
12798 else if (ch == '\r') {
12799 PyUnicode_WRITE(okind, odata, o++, '\\');
12800 PyUnicode_WRITE(okind, odata, o++, 'r');
12801 }
12802
12803 /* Map non-printable US ASCII to '\xhh' */
12804 else if (ch < ' ' || ch == 0x7F) {
12805 PyUnicode_WRITE(okind, odata, o++, '\\');
12806 PyUnicode_WRITE(okind, odata, o++, 'x');
12807 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12808 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12809 }
12810
12811 /* Copy ASCII characters as-is */
12812 else if (ch < 0x7F) {
12813 PyUnicode_WRITE(okind, odata, o++, ch);
12814 }
12815
12816 /* Non-ASCII characters */
12817 else {
12818 /* Map Unicode whitespace and control characters
12819 (categories Z* and C* except ASCII space)
12820 */
12821 if (!Py_UNICODE_ISPRINTABLE(ch)) {
12822 PyUnicode_WRITE(okind, odata, o++, '\\');
12823 /* Map 8-bit characters to '\xhh' */
12824 if (ch <= 0xff) {
12825 PyUnicode_WRITE(okind, odata, o++, 'x');
12826 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12827 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12828 }
12829 /* Map 16-bit characters to '\uxxxx' */
12830 else if (ch <= 0xffff) {
12831 PyUnicode_WRITE(okind, odata, o++, 'u');
12832 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12833 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12834 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12835 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12836 }
12837 /* Map 21-bit characters to '\U00xxxxxx' */
12838 else {
12839 PyUnicode_WRITE(okind, odata, o++, 'U');
12840 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
12841 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
12842 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
12843 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
12844 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12845 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12846 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12847 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12848 }
12849 }
12850 /* Copy characters as-is */
12851 else {
12852 PyUnicode_WRITE(okind, odata, o++, ch);
12853 }
Georg Brandl559e5d72008-06-11 18:37:52 +000012854 }
12855 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000012856 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012857 /* Closing quote already added at the beginning */
Victor Stinner05d11892011-10-06 01:13:58 +020012858 assert(_PyUnicode_CheckConsistency(repr, 1));
Walter Dörwald79e913e2007-05-12 11:08:06 +000012859 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012860}
12861
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012862PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012863 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012864\n\
12865Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080012866such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012867arguments start and end are interpreted as in slice notation.\n\
12868\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012869Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012870
12871static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012872unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012873{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010012874 /* initialize variables to prevent gcc warning */
12875 PyObject *substring = NULL;
12876 Py_ssize_t start = 0;
12877 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012878 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012879
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030012880 if (!parse_args_finds_unicode("rfind", args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012881 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012882
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012883 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012884 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012885
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012886 result = any_find_slice(self, substring, start, end, -1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012887
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012888 if (result == -2)
12889 return NULL;
12890
Christian Heimes217cfd12007-12-02 14:31:20 +000012891 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012892}
12893
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012894PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012895 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012896\n\
Lisa Roach43ba8862017-04-04 22:36:22 -070012897Return the highest index in S where substring sub is found,\n\
12898such that sub is contained within S[start:end]. Optional\n\
12899arguments start and end are interpreted as in slice notation.\n\
12900\n\
12901Raises ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012902
12903static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012904unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012905{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010012906 /* initialize variables to prevent gcc warning */
12907 PyObject *substring = NULL;
12908 Py_ssize_t start = 0;
12909 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012910 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012911
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030012912 if (!parse_args_finds_unicode("rindex", args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012913 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012914
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012915 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012916 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012917
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012918 result = any_find_slice(self, substring, start, end, -1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012919
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012920 if (result == -2)
12921 return NULL;
12922
Guido van Rossumd57fd912000-03-10 22:53:23 +000012923 if (result < 0) {
12924 PyErr_SetString(PyExc_ValueError, "substring not found");
12925 return NULL;
12926 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012927
Christian Heimes217cfd12007-12-02 14:31:20 +000012928 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012929}
12930
INADA Naoki3ae20562017-01-16 20:41:20 +090012931/*[clinic input]
12932str.rjust as unicode_rjust
12933
12934 width: Py_ssize_t
12935 fillchar: Py_UCS4 = ' '
12936 /
12937
12938Return a right-justified string of length width.
12939
12940Padding is done using the specified fill character (default is a space).
12941[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012942
12943static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012944unicode_rjust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
12945/*[clinic end generated code: output=804a1a57fbe8d5cf input=d05f550b5beb1f72]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012946{
Benjamin Petersonbac79492012-01-14 13:34:47 -050012947 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012948 return NULL;
12949
Victor Stinnerc4b49542011-12-11 22:44:26 +010012950 if (PyUnicode_GET_LENGTH(self) >= width)
12951 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012952
Victor Stinnerc4b49542011-12-11 22:44:26 +010012953 return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012954}
12955
Alexander Belopolsky40018472011-02-26 01:02:56 +000012956PyObject *
12957PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012958{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012959 if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012960 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012961
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012962 return split(s, sep, maxsplit);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012963}
12964
INADA Naoki3ae20562017-01-16 20:41:20 +090012965/*[clinic input]
12966str.split as unicode_split
Guido van Rossumd57fd912000-03-10 22:53:23 +000012967
INADA Naoki3ae20562017-01-16 20:41:20 +090012968 sep: object = None
12969 The delimiter according which to split the string.
12970 None (the default value) means split according to any whitespace,
12971 and discard empty strings from the result.
12972 maxsplit: Py_ssize_t = -1
12973 Maximum number of splits to do.
12974 -1 (the default value) means no limit.
12975
12976Return a list of the words in the string, using sep as the delimiter string.
12977[clinic start generated code]*/
12978
12979static PyObject *
12980unicode_split_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
12981/*[clinic end generated code: output=3a65b1db356948dc input=606e750488a82359]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012982{
INADA Naoki3ae20562017-01-16 20:41:20 +090012983 if (sep == Py_None)
12984 return split(self, NULL, maxsplit);
12985 if (PyUnicode_Check(sep))
12986 return split(self, sep, maxsplit);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012987
Victor Stinner998b8062018-09-12 00:23:25 +020012988 PyErr_Format(PyExc_TypeError,
12989 "must be str or None, not %.100s",
12990 Py_TYPE(sep)->tp_name);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012991 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012992}
12993
Thomas Wouters477c8d52006-05-27 19:21:47 +000012994PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012995PyUnicode_Partition(PyObject *str_obj, PyObject *sep_obj)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012996{
Thomas Wouters477c8d52006-05-27 19:21:47 +000012997 PyObject* out;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012998 int kind1, kind2;
12999 void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013000 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000013001
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013002 if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013003 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000013004
Victor Stinner14f8f022011-10-05 20:58:25 +020013005 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013006 kind2 = PyUnicode_KIND(sep_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013007 len1 = PyUnicode_GET_LENGTH(str_obj);
13008 len2 = PyUnicode_GET_LENGTH(sep_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013009 if (kind1 < kind2 || len1 < len2) {
13010 _Py_INCREF_UNICODE_EMPTY();
13011 if (!unicode_empty)
13012 out = NULL;
13013 else {
13014 out = PyTuple_Pack(3, str_obj, unicode_empty, unicode_empty);
13015 Py_DECREF(unicode_empty);
13016 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013017 return out;
13018 }
13019 buf1 = PyUnicode_DATA(str_obj);
13020 buf2 = PyUnicode_DATA(sep_obj);
13021 if (kind2 != kind1) {
13022 buf2 = _PyUnicode_AsKind(sep_obj, kind1);
13023 if (!buf2)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013024 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013025 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013026
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013027 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013028 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020013029 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
13030 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
13031 else
13032 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013033 break;
13034 case PyUnicode_2BYTE_KIND:
13035 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
13036 break;
13037 case PyUnicode_4BYTE_KIND:
13038 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
13039 break;
13040 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070013041 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013042 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000013043
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013044 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013045 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013046
13047 return out;
13048}
13049
13050
13051PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013052PyUnicode_RPartition(PyObject *str_obj, PyObject *sep_obj)
Thomas Wouters477c8d52006-05-27 19:21:47 +000013053{
Thomas Wouters477c8d52006-05-27 19:21:47 +000013054 PyObject* out;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013055 int kind1, kind2;
13056 void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013057 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000013058
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013059 if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013060 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000013061
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013062 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013063 kind2 = PyUnicode_KIND(sep_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013064 len1 = PyUnicode_GET_LENGTH(str_obj);
13065 len2 = PyUnicode_GET_LENGTH(sep_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013066 if (kind1 < kind2 || len1 < len2) {
13067 _Py_INCREF_UNICODE_EMPTY();
13068 if (!unicode_empty)
13069 out = NULL;
13070 else {
13071 out = PyTuple_Pack(3, unicode_empty, unicode_empty, str_obj);
13072 Py_DECREF(unicode_empty);
13073 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013074 return out;
13075 }
13076 buf1 = PyUnicode_DATA(str_obj);
13077 buf2 = PyUnicode_DATA(sep_obj);
13078 if (kind2 != kind1) {
13079 buf2 = _PyUnicode_AsKind(sep_obj, kind1);
13080 if (!buf2)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013081 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013082 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013083
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013084 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013085 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020013086 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
13087 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13088 else
13089 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013090 break;
13091 case PyUnicode_2BYTE_KIND:
13092 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13093 break;
13094 case PyUnicode_4BYTE_KIND:
13095 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13096 break;
13097 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070013098 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013099 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000013100
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013101 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013102 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013103
13104 return out;
13105}
13106
INADA Naoki3ae20562017-01-16 20:41:20 +090013107/*[clinic input]
13108str.partition as unicode_partition
Thomas Wouters477c8d52006-05-27 19:21:47 +000013109
INADA Naoki3ae20562017-01-16 20:41:20 +090013110 sep: object
13111 /
13112
13113Partition the string into three parts using the given separator.
13114
13115This will search for the separator in the string. If the separator is found,
13116returns a 3-tuple containing the part before the separator, the separator
13117itself, and the part after it.
13118
13119If the separator is not found, returns a 3-tuple containing the original string
13120and two empty strings.
13121[clinic start generated code]*/
13122
13123static PyObject *
13124unicode_partition(PyObject *self, PyObject *sep)
13125/*[clinic end generated code: output=e4ced7bd253ca3c4 input=f29b8d06c63e50be]*/
Thomas Wouters477c8d52006-05-27 19:21:47 +000013126{
INADA Naoki3ae20562017-01-16 20:41:20 +090013127 return PyUnicode_Partition(self, sep);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013128}
13129
INADA Naoki3ae20562017-01-16 20:41:20 +090013130/*[clinic input]
13131str.rpartition as unicode_rpartition = str.partition
Thomas Wouters477c8d52006-05-27 19:21:47 +000013132
INADA Naoki3ae20562017-01-16 20:41:20 +090013133Partition the string into three parts using the given separator.
13134
Serhiy Storchakaa2314282017-10-29 02:11:54 +030013135This will search for the separator in the string, starting at the end. If
INADA Naoki3ae20562017-01-16 20:41:20 +090013136the separator is found, returns a 3-tuple containing the part before the
13137separator, the separator itself, and the part after it.
13138
13139If the separator is not found, returns a 3-tuple containing two empty strings
13140and the original string.
13141[clinic start generated code]*/
13142
13143static PyObject *
13144unicode_rpartition(PyObject *self, PyObject *sep)
Serhiy Storchakaa2314282017-10-29 02:11:54 +030013145/*[clinic end generated code: output=1aa13cf1156572aa input=c4b7db3ef5cf336a]*/
Thomas Wouters477c8d52006-05-27 19:21:47 +000013146{
INADA Naoki3ae20562017-01-16 20:41:20 +090013147 return PyUnicode_RPartition(self, sep);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013148}
13149
Alexander Belopolsky40018472011-02-26 01:02:56 +000013150PyObject *
13151PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013152{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013153 if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
Benjamin Peterson14339b62009-01-31 16:36:08 +000013154 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013155
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013156 return rsplit(s, sep, maxsplit);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013157}
13158
INADA Naoki3ae20562017-01-16 20:41:20 +090013159/*[clinic input]
13160str.rsplit as unicode_rsplit = str.split
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013161
INADA Naoki3ae20562017-01-16 20:41:20 +090013162Return a list of the words in the string, using sep as the delimiter string.
13163
13164Splits are done starting at the end of the string and working to the front.
13165[clinic start generated code]*/
13166
13167static PyObject *
13168unicode_rsplit_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
13169/*[clinic end generated code: output=c2b815c63bcabffc input=12ad4bf57dd35f15]*/
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013170{
INADA Naoki3ae20562017-01-16 20:41:20 +090013171 if (sep == Py_None)
13172 return rsplit(self, NULL, maxsplit);
13173 if (PyUnicode_Check(sep))
13174 return rsplit(self, sep, maxsplit);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013175
Victor Stinner998b8062018-09-12 00:23:25 +020013176 PyErr_Format(PyExc_TypeError,
13177 "must be str or None, not %.100s",
13178 Py_TYPE(sep)->tp_name);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013179 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013180}
13181
INADA Naoki3ae20562017-01-16 20:41:20 +090013182/*[clinic input]
13183str.splitlines as unicode_splitlines
Guido van Rossumd57fd912000-03-10 22:53:23 +000013184
Serhiy Storchaka202fda52017-03-12 10:10:47 +020013185 keepends: bool(accept={int}) = False
INADA Naoki3ae20562017-01-16 20:41:20 +090013186
13187Return a list of the lines in the string, breaking at line boundaries.
13188
13189Line breaks are not included in the resulting list unless keepends is given and
13190true.
13191[clinic start generated code]*/
13192
13193static PyObject *
13194unicode_splitlines_impl(PyObject *self, int keepends)
Serhiy Storchaka202fda52017-03-12 10:10:47 +020013195/*[clinic end generated code: output=f664dcdad153ec40 input=b508e180459bdd8b]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013196{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013197 return PyUnicode_Splitlines(self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013198}
13199
13200static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000013201PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013202{
Victor Stinnerc4b49542011-12-11 22:44:26 +010013203 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013204}
13205
INADA Naoki3ae20562017-01-16 20:41:20 +090013206/*[clinic input]
13207str.swapcase as unicode_swapcase
Guido van Rossumd57fd912000-03-10 22:53:23 +000013208
INADA Naoki3ae20562017-01-16 20:41:20 +090013209Convert uppercase characters to lowercase and lowercase characters to uppercase.
13210[clinic start generated code]*/
13211
13212static PyObject *
13213unicode_swapcase_impl(PyObject *self)
13214/*[clinic end generated code: output=5d28966bf6d7b2af input=3f3ef96d5798a7bb]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013215{
Benjamin Petersoneea48462012-01-16 14:28:50 -050013216 if (PyUnicode_READY(self) == -1)
13217 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010013218 return case_operation(self, do_swapcase);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013219}
13220
Larry Hastings61272b72014-01-07 12:41:53 -080013221/*[clinic input]
Georg Brandlceee0772007-11-27 23:48:05 +000013222
Larry Hastings31826802013-10-19 00:09:25 -070013223@staticmethod
13224str.maketrans as unicode_maketrans
13225
13226 x: object
13227
13228 y: unicode=NULL
13229
13230 z: unicode=NULL
13231
13232 /
13233
13234Return a translation table usable for str.translate().
13235
13236If there is only one argument, it must be a dictionary mapping Unicode
13237ordinals (integers) or characters to Unicode ordinals, strings or None.
13238Character keys will be then converted to ordinals.
13239If there are two arguments, they must be strings of equal length, and
13240in the resulting dictionary, each character in x will be mapped to the
13241character at the same position in y. If there is a third argument, it
13242must be a string, whose characters will be mapped to None in the result.
Larry Hastings61272b72014-01-07 12:41:53 -080013243[clinic start generated code]*/
Larry Hastings31826802013-10-19 00:09:25 -070013244
Larry Hastings31826802013-10-19 00:09:25 -070013245static PyObject *
Larry Hastings5c661892014-01-24 06:17:25 -080013246unicode_maketrans_impl(PyObject *x, PyObject *y, PyObject *z)
Serhiy Storchaka1009bf12015-04-03 23:53:51 +030013247/*[clinic end generated code: output=a925c89452bd5881 input=7bfbf529a293c6c5]*/
Larry Hastings31826802013-10-19 00:09:25 -070013248{
Georg Brandlceee0772007-11-27 23:48:05 +000013249 PyObject *new = NULL, *key, *value;
13250 Py_ssize_t i = 0;
13251 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013252
Georg Brandlceee0772007-11-27 23:48:05 +000013253 new = PyDict_New();
13254 if (!new)
13255 return NULL;
13256 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013257 int x_kind, y_kind, z_kind;
13258 void *x_data, *y_data, *z_data;
13259
Georg Brandlceee0772007-11-27 23:48:05 +000013260 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000013261 if (!PyUnicode_Check(x)) {
13262 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
13263 "be a string if there is a second argument");
13264 goto err;
13265 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013266 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013267 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
13268 "arguments must have equal length");
13269 goto err;
13270 }
13271 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013272 x_kind = PyUnicode_KIND(x);
13273 y_kind = PyUnicode_KIND(y);
13274 x_data = PyUnicode_DATA(x);
13275 y_data = PyUnicode_DATA(y);
13276 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
13277 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060013278 if (!key)
Georg Brandlceee0772007-11-27 23:48:05 +000013279 goto err;
Benjamin Peterson822c7902011-12-20 13:32:50 -060013280 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060013281 if (!value) {
13282 Py_DECREF(key);
13283 goto err;
13284 }
Georg Brandlceee0772007-11-27 23:48:05 +000013285 res = PyDict_SetItem(new, key, value);
13286 Py_DECREF(key);
13287 Py_DECREF(value);
13288 if (res < 0)
13289 goto err;
13290 }
13291 /* create entries for deleting chars in z */
13292 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013293 z_kind = PyUnicode_KIND(z);
13294 z_data = PyUnicode_DATA(z);
Victor Stinnerc4f281e2011-10-11 22:11:42 +020013295 for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013296 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000013297 if (!key)
13298 goto err;
13299 res = PyDict_SetItem(new, key, Py_None);
13300 Py_DECREF(key);
13301 if (res < 0)
13302 goto err;
13303 }
13304 }
13305 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013306 int kind;
13307 void *data;
13308
Georg Brandlceee0772007-11-27 23:48:05 +000013309 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000013310 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013311 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
13312 "to maketrans it must be a dict");
13313 goto err;
13314 }
13315 /* copy entries into the new dict, converting string keys to int keys */
13316 while (PyDict_Next(x, &i, &key, &value)) {
13317 if (PyUnicode_Check(key)) {
13318 /* convert string keys to integer keys */
13319 PyObject *newkey;
Victor Stinnerc4f281e2011-10-11 22:11:42 +020013320 if (PyUnicode_GET_LENGTH(key) != 1) {
Georg Brandlceee0772007-11-27 23:48:05 +000013321 PyErr_SetString(PyExc_ValueError, "string keys in translate "
13322 "table must be of length 1");
13323 goto err;
13324 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013325 kind = PyUnicode_KIND(key);
13326 data = PyUnicode_DATA(key);
13327 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000013328 if (!newkey)
13329 goto err;
13330 res = PyDict_SetItem(new, newkey, value);
13331 Py_DECREF(newkey);
13332 if (res < 0)
13333 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000013334 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013335 /* just keep integer keys */
13336 if (PyDict_SetItem(new, key, value) < 0)
13337 goto err;
13338 } else {
13339 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
13340 "be strings or integers");
13341 goto err;
13342 }
13343 }
13344 }
13345 return new;
13346 err:
13347 Py_DECREF(new);
13348 return NULL;
13349}
13350
INADA Naoki3ae20562017-01-16 20:41:20 +090013351/*[clinic input]
13352str.translate as unicode_translate
Guido van Rossumd57fd912000-03-10 22:53:23 +000013353
INADA Naoki3ae20562017-01-16 20:41:20 +090013354 table: object
13355 Translation table, which must be a mapping of Unicode ordinals to
13356 Unicode ordinals, strings, or None.
13357 /
13358
13359Replace each character in the string using the given translation table.
13360
13361The table must implement lookup/indexing via __getitem__, for instance a
13362dictionary or list. If this operation raises LookupError, the character is
13363left untouched. Characters mapped to None are deleted.
13364[clinic start generated code]*/
13365
13366static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013367unicode_translate(PyObject *self, PyObject *table)
INADA Naoki3ae20562017-01-16 20:41:20 +090013368/*[clinic end generated code: output=3cb448ff2fd96bf3 input=6d38343db63d8eb0]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013369{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013370 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013371}
13372
INADA Naoki3ae20562017-01-16 20:41:20 +090013373/*[clinic input]
13374str.upper as unicode_upper
Guido van Rossumd57fd912000-03-10 22:53:23 +000013375
INADA Naoki3ae20562017-01-16 20:41:20 +090013376Return a copy of the string converted to uppercase.
13377[clinic start generated code]*/
13378
13379static PyObject *
13380unicode_upper_impl(PyObject *self)
13381/*[clinic end generated code: output=1b7ddd16bbcdc092 input=db3d55682dfe2e6c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013382{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050013383 if (PyUnicode_READY(self) == -1)
13384 return NULL;
13385 if (PyUnicode_IS_ASCII(self))
13386 return ascii_upper_or_lower(self, 0);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010013387 return case_operation(self, do_upper);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013388}
13389
INADA Naoki3ae20562017-01-16 20:41:20 +090013390/*[clinic input]
13391str.zfill as unicode_zfill
13392
13393 width: Py_ssize_t
13394 /
13395
13396Pad a numeric string with zeros on the left, to fill a field of the given width.
13397
13398The string is never truncated.
13399[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013400
13401static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090013402unicode_zfill_impl(PyObject *self, Py_ssize_t width)
INADA Naoki15f94592017-01-16 21:49:13 +090013403/*[clinic end generated code: output=e13fb6bdf8e3b9df input=c6b2f772c6f27799]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013404{
Martin v. Löwis18e16552006-02-15 17:27:45 +000013405 Py_ssize_t fill;
Victor Stinner9310abb2011-10-05 00:59:23 +020013406 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013407 int kind;
13408 void *data;
13409 Py_UCS4 chr;
13410
Benjamin Petersonbac79492012-01-14 13:34:47 -050013411 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010013412 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013413
Victor Stinnerc4b49542011-12-11 22:44:26 +010013414 if (PyUnicode_GET_LENGTH(self) >= width)
13415 return unicode_result_unchanged(self);
13416
13417 fill = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013418
13419 u = pad(self, fill, 0, '0');
13420
Walter Dörwald068325e2002-04-15 13:36:47 +000013421 if (u == NULL)
13422 return NULL;
13423
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013424 kind = PyUnicode_KIND(u);
13425 data = PyUnicode_DATA(u);
13426 chr = PyUnicode_READ(kind, data, fill);
13427
13428 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000013429 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013430 PyUnicode_WRITE(kind, data, 0, chr);
13431 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000013432 }
13433
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013434 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +010013435 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013436}
Guido van Rossumd57fd912000-03-10 22:53:23 +000013437
#if 0
/* Disabled helper (compiled out by the surrounding #if 0): forwards to
   PyUnicode_TransformDecimalAndSpaceToASCII(). */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *converted = PyUnicode_TransformDecimalAndSpaceToASCII(self);

    return converted;
}
#endif
13445
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013446PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013447 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013448\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013449Return True if S starts with the specified prefix, False otherwise.\n\
13450With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013451With optional end, stop comparing S at that position.\n\
13452prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013453
13454static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013455unicode_startswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013456 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013457{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013458 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013459 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013460 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013461 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013462 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013463
Jesus Ceaac451502011-04-20 17:09:23 +020013464 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013465 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013466 if (PyTuple_Check(subobj)) {
13467 Py_ssize_t i;
13468 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013469 substring = PyTuple_GET_ITEM(subobj, i);
13470 if (!PyUnicode_Check(substring)) {
13471 PyErr_Format(PyExc_TypeError,
13472 "tuple for startswith must only contain str, "
Victor Stinner998b8062018-09-12 00:23:25 +020013473 "not %.100s",
13474 Py_TYPE(substring)->tp_name);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013475 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013476 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013477 result = tailmatch(self, substring, start, end, -1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013478 if (result == -1)
13479 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013480 if (result) {
13481 Py_RETURN_TRUE;
13482 }
13483 }
13484 /* nothing matched */
13485 Py_RETURN_FALSE;
13486 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013487 if (!PyUnicode_Check(subobj)) {
13488 PyErr_Format(PyExc_TypeError,
13489 "startswith first arg must be str or "
Victor Stinner998b8062018-09-12 00:23:25 +020013490 "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013491 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013492 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013493 result = tailmatch(self, subobj, start, end, -1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013494 if (result == -1)
13495 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013496 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013497}
13498
13499
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013500PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013501 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013502\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013503Return True if S ends with the specified suffix, False otherwise.\n\
13504With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013505With optional end, stop comparing S at that position.\n\
13506suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013507
13508static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013509unicode_endswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013510 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013511{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013512 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013513 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013514 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013515 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013516 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013517
Jesus Ceaac451502011-04-20 17:09:23 +020013518 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013519 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013520 if (PyTuple_Check(subobj)) {
13521 Py_ssize_t i;
13522 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013523 substring = PyTuple_GET_ITEM(subobj, i);
13524 if (!PyUnicode_Check(substring)) {
13525 PyErr_Format(PyExc_TypeError,
13526 "tuple for endswith must only contain str, "
Victor Stinner998b8062018-09-12 00:23:25 +020013527 "not %.100s",
13528 Py_TYPE(substring)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013529 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013530 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013531 result = tailmatch(self, substring, start, end, +1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013532 if (result == -1)
13533 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013534 if (result) {
13535 Py_RETURN_TRUE;
13536 }
13537 }
13538 Py_RETURN_FALSE;
13539 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013540 if (!PyUnicode_Check(subobj)) {
13541 PyErr_Format(PyExc_TypeError,
13542 "endswith first arg must be str or "
Victor Stinner998b8062018-09-12 00:23:25 +020013543 "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013544 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013545 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013546 result = tailmatch(self, subobj, start, end, +1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013547 if (result == -1)
13548 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013549 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013550}
13551
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070013552static inline void
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013553_PyUnicodeWriter_Update(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013554{
Victor Stinnereb36fda2015-10-03 01:55:51 +020013555 writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer);
13556 writer->data = PyUnicode_DATA(writer->buffer);
13557
13558 if (!writer->readonly) {
13559 writer->kind = PyUnicode_KIND(writer->buffer);
Victor Stinner8f674cc2013-04-17 23:02:17 +020013560 writer->size = PyUnicode_GET_LENGTH(writer->buffer);
Victor Stinnereb36fda2015-10-03 01:55:51 +020013561 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013562 else {
Victor Stinnereb36fda2015-10-03 01:55:51 +020013563 /* use a value smaller than PyUnicode_1BYTE_KIND() so
13564 _PyUnicodeWriter_PrepareKind() will copy the buffer. */
13565 writer->kind = PyUnicode_WCHAR_KIND;
13566 assert(writer->kind <= PyUnicode_1BYTE_KIND);
13567
Victor Stinner8f674cc2013-04-17 23:02:17 +020013568 /* Copy-on-write mode: set buffer size to 0 so
13569 * _PyUnicodeWriter_Prepare() will copy (and enlarge) the buffer on
13570 * next write. */
13571 writer->size = 0;
13572 }
Victor Stinner202fdca2012-05-07 12:47:02 +020013573}
13574
Victor Stinnerd3f08822012-05-29 12:57:52 +020013575void
Victor Stinner8f674cc2013-04-17 23:02:17 +020013576_PyUnicodeWriter_Init(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013577{
Victor Stinnerd3f08822012-05-29 12:57:52 +020013578 memset(writer, 0, sizeof(*writer));
Victor Stinnereb36fda2015-10-03 01:55:51 +020013579
13580 /* ASCII is the bare minimum */
Victor Stinner8f674cc2013-04-17 23:02:17 +020013581 writer->min_char = 127;
Victor Stinnereb36fda2015-10-03 01:55:51 +020013582
13583 /* use a value smaller than PyUnicode_1BYTE_KIND() so
13584 _PyUnicodeWriter_PrepareKind() will copy the buffer. */
13585 writer->kind = PyUnicode_WCHAR_KIND;
13586 assert(writer->kind <= PyUnicode_1BYTE_KIND);
Victor Stinner202fdca2012-05-07 12:47:02 +020013587}
13588
Inada Naoki770847a2019-06-24 12:30:24 +090013589// Initialize _PyUnicodeWriter with initial buffer
13590static inline void
13591_PyUnicodeWriter_InitWithBuffer(_PyUnicodeWriter *writer, PyObject *buffer)
13592{
13593 memset(writer, 0, sizeof(*writer));
13594 writer->buffer = buffer;
13595 _PyUnicodeWriter_Update(writer);
13596 writer->min_length = writer->size;
13597}
13598
Victor Stinnerd3f08822012-05-29 12:57:52 +020013599int
13600_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
13601 Py_ssize_t length, Py_UCS4 maxchar)
Victor Stinner202fdca2012-05-07 12:47:02 +020013602{
13603 Py_ssize_t newlen;
13604 PyObject *newbuffer;
13605
Victor Stinner2740e462016-09-06 16:58:36 -070013606 assert(maxchar <= MAX_UNICODE);
13607
Victor Stinnerca9381e2015-09-22 00:58:32 +020013608 /* ensure that the _PyUnicodeWriter_Prepare macro was used */
Victor Stinner61744742015-09-22 01:01:17 +020013609 assert((maxchar > writer->maxchar && length >= 0)
13610 || length > 0);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013611
Victor Stinner202fdca2012-05-07 12:47:02 +020013612 if (length > PY_SSIZE_T_MAX - writer->pos) {
13613 PyErr_NoMemory();
13614 return -1;
13615 }
13616 newlen = writer->pos + length;
13617
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070013618 maxchar = Py_MAX(maxchar, writer->min_char);
Victor Stinner8f674cc2013-04-17 23:02:17 +020013619
Victor Stinnerd3f08822012-05-29 12:57:52 +020013620 if (writer->buffer == NULL) {
Victor Stinner8f674cc2013-04-17 23:02:17 +020013621 assert(!writer->readonly);
Victor Stinner6989ba02013-11-18 21:08:39 +010013622 if (writer->overallocate
13623 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13624 /* overallocate to limit the number of realloc() */
13625 newlen += newlen / OVERALLOCATE_FACTOR;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013626 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013627 if (newlen < writer->min_length)
13628 newlen = writer->min_length;
13629
Victor Stinnerd3f08822012-05-29 12:57:52 +020013630 writer->buffer = PyUnicode_New(newlen, maxchar);
13631 if (writer->buffer == NULL)
13632 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013633 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013634 else if (newlen > writer->size) {
Victor Stinner6989ba02013-11-18 21:08:39 +010013635 if (writer->overallocate
13636 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13637 /* overallocate to limit the number of realloc() */
13638 newlen += newlen / OVERALLOCATE_FACTOR;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013639 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013640 if (newlen < writer->min_length)
13641 newlen = writer->min_length;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013642
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013643 if (maxchar > writer->maxchar || writer->readonly) {
Victor Stinner202fdca2012-05-07 12:47:02 +020013644 /* resize + widen */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +030013645 maxchar = Py_MAX(maxchar, writer->maxchar);
Victor Stinner202fdca2012-05-07 12:47:02 +020013646 newbuffer = PyUnicode_New(newlen, maxchar);
13647 if (newbuffer == NULL)
13648 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013649 _PyUnicode_FastCopyCharacters(newbuffer, 0,
13650 writer->buffer, 0, writer->pos);
Victor Stinner202fdca2012-05-07 12:47:02 +020013651 Py_DECREF(writer->buffer);
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013652 writer->readonly = 0;
Victor Stinner202fdca2012-05-07 12:47:02 +020013653 }
13654 else {
13655 newbuffer = resize_compact(writer->buffer, newlen);
13656 if (newbuffer == NULL)
13657 return -1;
13658 }
13659 writer->buffer = newbuffer;
Victor Stinner202fdca2012-05-07 12:47:02 +020013660 }
13661 else if (maxchar > writer->maxchar) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013662 assert(!writer->readonly);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013663 newbuffer = PyUnicode_New(writer->size, maxchar);
13664 if (newbuffer == NULL)
Victor Stinner202fdca2012-05-07 12:47:02 +020013665 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013666 _PyUnicode_FastCopyCharacters(newbuffer, 0,
13667 writer->buffer, 0, writer->pos);
Serhiy Storchaka57a01d32016-04-10 18:05:40 +030013668 Py_SETREF(writer->buffer, newbuffer);
Victor Stinner202fdca2012-05-07 12:47:02 +020013669 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013670 _PyUnicodeWriter_Update(writer);
Victor Stinner202fdca2012-05-07 12:47:02 +020013671 return 0;
Victor Stinner6989ba02013-11-18 21:08:39 +010013672
13673#undef OVERALLOCATE_FACTOR
Victor Stinner202fdca2012-05-07 12:47:02 +020013674}
13675
Victor Stinnerca9381e2015-09-22 00:58:32 +020013676int
13677_PyUnicodeWriter_PrepareKindInternal(_PyUnicodeWriter *writer,
13678 enum PyUnicode_Kind kind)
13679{
13680 Py_UCS4 maxchar;
13681
13682 /* ensure that the _PyUnicodeWriter_PrepareKind macro was used */
13683 assert(writer->kind < kind);
13684
13685 switch (kind)
13686 {
13687 case PyUnicode_1BYTE_KIND: maxchar = 0xff; break;
13688 case PyUnicode_2BYTE_KIND: maxchar = 0xffff; break;
13689 case PyUnicode_4BYTE_KIND: maxchar = 0x10ffff; break;
13690 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070013691 Py_UNREACHABLE();
Victor Stinnerca9381e2015-09-22 00:58:32 +020013692 }
13693
13694 return _PyUnicodeWriter_PrepareInternal(writer, 0, maxchar);
13695}
13696
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070013697static inline int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020013698_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch)
Victor Stinnera0dd0212013-04-11 22:09:04 +020013699{
Victor Stinner2740e462016-09-06 16:58:36 -070013700 assert(ch <= MAX_UNICODE);
Victor Stinnera0dd0212013-04-11 22:09:04 +020013701 if (_PyUnicodeWriter_Prepare(writer, 1, ch) < 0)
13702 return -1;
13703 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, ch);
13704 writer->pos++;
13705 return 0;
13706}
13707
13708int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020013709_PyUnicodeWriter_WriteChar(_PyUnicodeWriter *writer, Py_UCS4 ch)
13710{
13711 return _PyUnicodeWriter_WriteCharInline(writer, ch);
13712}
13713
13714int
Victor Stinnerd3f08822012-05-29 12:57:52 +020013715_PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str)
13716{
13717 Py_UCS4 maxchar;
13718 Py_ssize_t len;
13719
13720 if (PyUnicode_READY(str) == -1)
13721 return -1;
13722 len = PyUnicode_GET_LENGTH(str);
13723 if (len == 0)
13724 return 0;
13725 maxchar = PyUnicode_MAX_CHAR_VALUE(str);
13726 if (maxchar > writer->maxchar || len > writer->size - writer->pos) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013727 if (writer->buffer == NULL && !writer->overallocate) {
Victor Stinner1912b392015-03-26 09:37:23 +010013728 assert(_PyUnicode_CheckConsistency(str, 1));
Victor Stinner8f674cc2013-04-17 23:02:17 +020013729 writer->readonly = 1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013730 Py_INCREF(str);
13731 writer->buffer = str;
13732 _PyUnicodeWriter_Update(writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013733 writer->pos += len;
13734 return 0;
13735 }
13736 if (_PyUnicodeWriter_PrepareInternal(writer, len, maxchar) == -1)
13737 return -1;
13738 }
13739 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13740 str, 0, len);
13741 writer->pos += len;
13742 return 0;
13743}
13744
Victor Stinnere215d962012-10-06 23:03:36 +020013745int
Victor Stinnercfc4c132013-04-03 01:48:39 +020013746_PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter *writer, PyObject *str,
13747 Py_ssize_t start, Py_ssize_t end)
13748{
13749 Py_UCS4 maxchar;
13750 Py_ssize_t len;
13751
13752 if (PyUnicode_READY(str) == -1)
13753 return -1;
13754
13755 assert(0 <= start);
13756 assert(end <= PyUnicode_GET_LENGTH(str));
13757 assert(start <= end);
13758
13759 if (end == 0)
13760 return 0;
13761
13762 if (start == 0 && end == PyUnicode_GET_LENGTH(str))
13763 return _PyUnicodeWriter_WriteStr(writer, str);
13764
13765 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
13766 maxchar = _PyUnicode_FindMaxChar(str, start, end);
13767 else
13768 maxchar = writer->maxchar;
13769 len = end - start;
13770
13771 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) < 0)
13772 return -1;
13773
13774 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13775 str, start, len);
13776 writer->pos += len;
13777 return 0;
13778}
13779
13780int
Victor Stinner4a587072013-11-19 12:54:53 +010013781_PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter *writer,
13782 const char *ascii, Py_ssize_t len)
13783{
13784 if (len == -1)
13785 len = strlen(ascii);
13786
13787 assert(ucs1lib_find_max_char((Py_UCS1*)ascii, (Py_UCS1*)ascii + len) < 128);
13788
13789 if (writer->buffer == NULL && !writer->overallocate) {
13790 PyObject *str;
13791
13792 str = _PyUnicode_FromASCII(ascii, len);
13793 if (str == NULL)
13794 return -1;
13795
13796 writer->readonly = 1;
13797 writer->buffer = str;
13798 _PyUnicodeWriter_Update(writer);
13799 writer->pos += len;
13800 return 0;
13801 }
13802
13803 if (_PyUnicodeWriter_Prepare(writer, len, 127) == -1)
13804 return -1;
13805
13806 switch (writer->kind)
13807 {
13808 case PyUnicode_1BYTE_KIND:
13809 {
13810 const Py_UCS1 *str = (const Py_UCS1 *)ascii;
13811 Py_UCS1 *data = writer->data;
13812
Christian Heimesf051e432016-09-13 20:22:02 +020013813 memcpy(data + writer->pos, str, len);
Victor Stinner4a587072013-11-19 12:54:53 +010013814 break;
13815 }
13816 case PyUnicode_2BYTE_KIND:
13817 {
13818 _PyUnicode_CONVERT_BYTES(
13819 Py_UCS1, Py_UCS2,
13820 ascii, ascii + len,
13821 (Py_UCS2 *)writer->data + writer->pos);
13822 break;
13823 }
13824 case PyUnicode_4BYTE_KIND:
13825 {
13826 _PyUnicode_CONVERT_BYTES(
13827 Py_UCS1, Py_UCS4,
13828 ascii, ascii + len,
13829 (Py_UCS4 *)writer->data + writer->pos);
13830 break;
13831 }
13832 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070013833 Py_UNREACHABLE();
Victor Stinner4a587072013-11-19 12:54:53 +010013834 }
13835
13836 writer->pos += len;
13837 return 0;
13838}
13839
13840int
13841_PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter *writer,
13842 const char *str, Py_ssize_t len)
Victor Stinnere215d962012-10-06 23:03:36 +020013843{
13844 Py_UCS4 maxchar;
13845
13846 maxchar = ucs1lib_find_max_char((Py_UCS1*)str, (Py_UCS1*)str + len);
13847 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) == -1)
13848 return -1;
13849 unicode_write_cstr(writer->buffer, writer->pos, str, len);
13850 writer->pos += len;
13851 return 0;
13852}
13853
Victor Stinnerd3f08822012-05-29 12:57:52 +020013854PyObject *
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013855_PyUnicodeWriter_Finish(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013856{
Victor Stinner15a0bd32013-07-08 22:29:55 +020013857 PyObject *str;
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030013858
Victor Stinnerd3f08822012-05-29 12:57:52 +020013859 if (writer->pos == 0) {
Victor Stinner9e6b4d72013-07-09 00:37:24 +020013860 Py_CLEAR(writer->buffer);
Serhiy Storchaka678db842013-01-26 12:16:36 +020013861 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3f08822012-05-29 12:57:52 +020013862 }
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030013863
13864 str = writer->buffer;
13865 writer->buffer = NULL;
13866
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013867 if (writer->readonly) {
Victor Stinner9e6b4d72013-07-09 00:37:24 +020013868 assert(PyUnicode_GET_LENGTH(str) == writer->pos);
13869 return str;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013870 }
Victor Stinner6c2cdae2015-10-12 13:29:43 +020013871
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030013872 if (PyUnicode_GET_LENGTH(str) != writer->pos) {
13873 PyObject *str2;
13874 str2 = resize_compact(str, writer->pos);
13875 if (str2 == NULL) {
13876 Py_DECREF(str);
13877 return NULL;
Victor Stinner6c2cdae2015-10-12 13:29:43 +020013878 }
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030013879 str = str2;
Victor Stinner6c2cdae2015-10-12 13:29:43 +020013880 }
13881
Victor Stinner15a0bd32013-07-08 22:29:55 +020013882 assert(_PyUnicode_CheckConsistency(str, 1));
13883 return unicode_result_ready(str);
Victor Stinner202fdca2012-05-07 12:47:02 +020013884}
13885
Victor Stinnerd3f08822012-05-29 12:57:52 +020013886void
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013887_PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013888{
13889 Py_CLEAR(writer->buffer);
13890}
13891
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013892#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000013893
13894PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013895 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000013896\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013897Return a formatted version of S, using substitutions from args and kwargs.\n\
13898The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000013899
Eric Smith27bbca62010-11-04 17:06:58 +000013900PyDoc_STRVAR(format_map__doc__,
13901 "S.format_map(mapping) -> str\n\
13902\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013903Return a formatted version of S, using substitutions from mapping.\n\
13904The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000013905
INADA Naoki3ae20562017-01-16 20:41:20 +090013906/*[clinic input]
13907str.__format__ as unicode___format__
13908
13909 format_spec: unicode
13910 /
13911
13912Return a formatted version of the string as described by format_spec.
13913[clinic start generated code]*/
13914
Eric Smith4a7d76d2008-05-30 18:10:19 +000013915static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090013916unicode___format___impl(PyObject *self, PyObject *format_spec)
INADA Naoki15f94592017-01-16 21:49:13 +090013917/*[clinic end generated code: output=45fceaca6d2ba4c8 input=5e135645d167a214]*/
Eric Smith4a7d76d2008-05-30 18:10:19 +000013918{
Victor Stinnerd3f08822012-05-29 12:57:52 +020013919 _PyUnicodeWriter writer;
13920 int ret;
Eric Smith4a7d76d2008-05-30 18:10:19 +000013921
Victor Stinnerd3f08822012-05-29 12:57:52 +020013922 if (PyUnicode_READY(self) == -1)
13923 return NULL;
Victor Stinner8f674cc2013-04-17 23:02:17 +020013924 _PyUnicodeWriter_Init(&writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013925 ret = _PyUnicode_FormatAdvancedWriter(&writer,
13926 self, format_spec, 0,
13927 PyUnicode_GET_LENGTH(format_spec));
13928 if (ret == -1) {
13929 _PyUnicodeWriter_Dealloc(&writer);
13930 return NULL;
13931 }
13932 return _PyUnicodeWriter_Finish(&writer);
Eric Smith4a7d76d2008-05-30 18:10:19 +000013933}
13934
INADA Naoki3ae20562017-01-16 20:41:20 +090013935/*[clinic input]
13936str.__sizeof__ as unicode_sizeof
13937
13938Return the size of the string in memory, in bytes.
13939[clinic start generated code]*/
Eric Smith8c663262007-08-25 02:26:07 +000013940
13941static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090013942unicode_sizeof_impl(PyObject *self)
13943/*[clinic end generated code: output=6dbc2f5a408b6d4f input=6dd011c108e33fb0]*/
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013944{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013945 Py_ssize_t size;
13946
13947 /* If it's a compact object, account for base structure +
13948 character data. */
INADA Naoki3ae20562017-01-16 20:41:20 +090013949 if (PyUnicode_IS_COMPACT_ASCII(self))
13950 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(self) + 1;
13951 else if (PyUnicode_IS_COMPACT(self))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013952 size = sizeof(PyCompactUnicodeObject) +
INADA Naoki3ae20562017-01-16 20:41:20 +090013953 (PyUnicode_GET_LENGTH(self) + 1) * PyUnicode_KIND(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013954 else {
13955 /* If it is a two-block object, account for base object, and
13956 for character block if present. */
13957 size = sizeof(PyUnicodeObject);
INADA Naoki3ae20562017-01-16 20:41:20 +090013958 if (_PyUnicode_DATA_ANY(self))
13959 size += (PyUnicode_GET_LENGTH(self) + 1) *
13960 PyUnicode_KIND(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013961 }
13962 /* If the wstr pointer is present, account for it unless it is shared
Victor Stinnera3be6132011-10-03 02:16:37 +020013963 with the data pointer. Check if the data is not shared. */
INADA Naoki3ae20562017-01-16 20:41:20 +090013964 if (_PyUnicode_HAS_WSTR_MEMORY(self))
13965 size += (PyUnicode_WSTR_LENGTH(self) + 1) * sizeof(wchar_t);
13966 if (_PyUnicode_HAS_UTF8_MEMORY(self))
13967 size += PyUnicode_UTF8_LENGTH(self) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013968
13969 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013970}
13971
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013972static PyObject *
Siddhesh Poyarekar55edd0c2018-04-30 00:29:33 +053013973unicode_getnewargs(PyObject *v, PyObject *Py_UNUSED(ignored))
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013974{
Victor Stinnerbf6e5602011-12-12 01:53:47 +010013975 PyObject *copy = _PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013976 if (!copy)
13977 return NULL;
13978 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013979}
13980
Guido van Rossumd57fd912000-03-10 22:53:23 +000013981static PyMethodDef unicode_methods[] = {
INADA Naoki3ae20562017-01-16 20:41:20 +090013982 UNICODE_ENCODE_METHODDEF
13983 UNICODE_REPLACE_METHODDEF
13984 UNICODE_SPLIT_METHODDEF
13985 UNICODE_RSPLIT_METHODDEF
13986 UNICODE_JOIN_METHODDEF
13987 UNICODE_CAPITALIZE_METHODDEF
13988 UNICODE_CASEFOLD_METHODDEF
13989 UNICODE_TITLE_METHODDEF
13990 UNICODE_CENTER_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013991 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090013992 UNICODE_EXPANDTABS_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013993 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090013994 UNICODE_PARTITION_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013995 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090013996 UNICODE_LJUST_METHODDEF
13997 UNICODE_LOWER_METHODDEF
13998 UNICODE_LSTRIP_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013999 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
14000 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090014001 UNICODE_RJUST_METHODDEF
14002 UNICODE_RSTRIP_METHODDEF
14003 UNICODE_RPARTITION_METHODDEF
14004 UNICODE_SPLITLINES_METHODDEF
14005 UNICODE_STRIP_METHODDEF
14006 UNICODE_SWAPCASE_METHODDEF
14007 UNICODE_TRANSLATE_METHODDEF
14008 UNICODE_UPPER_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000014009 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
14010 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
INADA Naokia49ac992018-01-27 14:06:21 +090014011 UNICODE_ISASCII_METHODDEF
INADA Naoki3ae20562017-01-16 20:41:20 +090014012 UNICODE_ISLOWER_METHODDEF
14013 UNICODE_ISUPPER_METHODDEF
14014 UNICODE_ISTITLE_METHODDEF
14015 UNICODE_ISSPACE_METHODDEF
14016 UNICODE_ISDECIMAL_METHODDEF
14017 UNICODE_ISDIGIT_METHODDEF
14018 UNICODE_ISNUMERIC_METHODDEF
14019 UNICODE_ISALPHA_METHODDEF
14020 UNICODE_ISALNUM_METHODDEF
14021 UNICODE_ISIDENTIFIER_METHODDEF
14022 UNICODE_ISPRINTABLE_METHODDEF
14023 UNICODE_ZFILL_METHODDEF
Serhiy Storchaka62be7422018-11-27 13:27:31 +020014024 {"format", (PyCFunction)(void(*)(void)) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +000014025 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090014026 UNICODE___FORMAT___METHODDEF
Larry Hastings31826802013-10-19 00:09:25 -070014027 UNICODE_MAKETRANS_METHODDEF
INADA Naoki3ae20562017-01-16 20:41:20 +090014028 UNICODE_SIZEOF_METHODDEF
Walter Dörwald068325e2002-04-15 13:36:47 +000014029#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000014030 /* These methods are just used for debugging the implementation. */
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000014031 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000014032#endif
14033
Siddhesh Poyarekar55edd0c2018-04-30 00:29:33 +053014034 {"__getnewargs__", unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000014035 {NULL, NULL}
14036};
14037
Neil Schemenauerce30bc92002-11-18 16:10:18 +000014038static PyObject *
14039unicode_mod(PyObject *v, PyObject *w)
14040{
Brian Curtindfc80e32011-08-10 20:28:54 -050014041 if (!PyUnicode_Check(v))
14042 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000014043 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000014044}
14045
14046static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014047 0, /*nb_add*/
14048 0, /*nb_subtract*/
14049 0, /*nb_multiply*/
14050 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +000014051};
14052
Guido van Rossumd57fd912000-03-10 22:53:23 +000014053static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014054 (lenfunc) unicode_length, /* sq_length */
14055 PyUnicode_Concat, /* sq_concat */
14056 (ssizeargfunc) unicode_repeat, /* sq_repeat */
14057 (ssizeargfunc) unicode_getitem, /* sq_item */
14058 0, /* sq_slice */
14059 0, /* sq_ass_item */
14060 0, /* sq_ass_slice */
14061 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +000014062};
14063
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014064static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014065unicode_subscript(PyObject* self, PyObject* item)
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014066{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014067 if (PyUnicode_READY(self) == -1)
14068 return NULL;
14069
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000014070 if (PyIndex_Check(item)) {
14071 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014072 if (i == -1 && PyErr_Occurred())
14073 return NULL;
14074 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014075 i += PyUnicode_GET_LENGTH(self);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014076 return unicode_getitem(self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014077 } else if (PySlice_Check(item)) {
Zackery Spytz14514d92019-05-17 01:13:03 -060014078 Py_ssize_t start, stop, step, slicelength, i;
14079 size_t cur;
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014080 PyObject *result;
14081 void *src_data, *dest_data;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014082 int src_kind, dest_kind;
Victor Stinnerc80d6d22011-10-05 14:13:28 +020014083 Py_UCS4 ch, max_char, kind_limit;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014084
Serhiy Storchakab879fe82017-04-08 09:53:51 +030014085 if (PySlice_Unpack(item, &start, &stop, &step) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014086 return NULL;
14087 }
Serhiy Storchakab879fe82017-04-08 09:53:51 +030014088 slicelength = PySlice_AdjustIndices(PyUnicode_GET_LENGTH(self),
14089 &start, &stop, step);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014090
14091 if (slicelength <= 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020014092 _Py_RETURN_UNICODE_EMPTY();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014093 } else if (start == 0 && step == 1 &&
Victor Stinnerc4b49542011-12-11 22:44:26 +010014094 slicelength == PyUnicode_GET_LENGTH(self)) {
14095 return unicode_result_unchanged(self);
Thomas Woutersed03b412007-08-28 21:37:11 +000014096 } else if (step == 1) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014097 return PyUnicode_Substring(self,
Victor Stinner12bab6d2011-10-01 01:53:49 +020014098 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014099 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014100 /* General case */
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014101 src_kind = PyUnicode_KIND(self);
14102 src_data = PyUnicode_DATA(self);
Victor Stinner55c99112011-10-13 01:17:06 +020014103 if (!PyUnicode_IS_ASCII(self)) {
14104 kind_limit = kind_maxchar_limit(src_kind);
14105 max_char = 0;
14106 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
14107 ch = PyUnicode_READ(src_kind, src_data, cur);
14108 if (ch > max_char) {
14109 max_char = ch;
14110 if (max_char >= kind_limit)
14111 break;
14112 }
Victor Stinnerc80d6d22011-10-05 14:13:28 +020014113 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014114 }
Victor Stinner55c99112011-10-13 01:17:06 +020014115 else
14116 max_char = 127;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014117 result = PyUnicode_New(slicelength, max_char);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014118 if (result == NULL)
14119 return NULL;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014120 dest_kind = PyUnicode_KIND(result);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014121 dest_data = PyUnicode_DATA(result);
14122
14123 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014124 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
14125 PyUnicode_WRITE(dest_kind, dest_data, i, ch);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014126 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020014127 assert(_PyUnicode_CheckConsistency(result, 1));
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014128 return result;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014129 } else {
14130 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
14131 return NULL;
14132 }
14133}
14134
14135static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014136 (lenfunc)unicode_length, /* mp_length */
14137 (binaryfunc)unicode_subscript, /* mp_subscript */
14138 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014139};
14140
Guido van Rossumd57fd912000-03-10 22:53:23 +000014141
Guido van Rossumd57fd912000-03-10 22:53:23 +000014142/* Helpers for PyUnicode_Format() */
14143
Victor Stinnera47082312012-10-04 02:19:54 +020014144struct unicode_formatter_t {
14145 PyObject *args;
14146 int args_owned;
14147 Py_ssize_t arglen, argidx;
14148 PyObject *dict;
14149
14150 enum PyUnicode_Kind fmtkind;
14151 Py_ssize_t fmtcnt, fmtpos;
14152 void *fmtdata;
14153 PyObject *fmtstr;
14154
14155 _PyUnicodeWriter writer;
14156};
14157
14158struct unicode_format_arg_t {
14159 Py_UCS4 ch;
14160 int flags;
14161 Py_ssize_t width;
14162 int prec;
14163 int sign;
14164};
14165
Guido van Rossumd57fd912000-03-10 22:53:23 +000014166static PyObject *
Victor Stinnera47082312012-10-04 02:19:54 +020014167unicode_format_getnextarg(struct unicode_formatter_t *ctx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014168{
Victor Stinnera47082312012-10-04 02:19:54 +020014169 Py_ssize_t argidx = ctx->argidx;
14170
14171 if (argidx < ctx->arglen) {
14172 ctx->argidx++;
14173 if (ctx->arglen < 0)
14174 return ctx->args;
Benjamin Peterson29060642009-01-31 22:14:21 +000014175 else
Victor Stinnera47082312012-10-04 02:19:54 +020014176 return PyTuple_GetItem(ctx->args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014177 }
14178 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000014179 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000014180 return NULL;
14181}
14182
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014183/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000014184
Victor Stinnera47082312012-10-04 02:19:54 +020014185/* Format a float into the writer if the writer is not NULL, or into *p_output
14186 otherwise.
14187
14188 Return 0 on success, raise an exception and return -1 on error. */
Victor Stinnerd3f08822012-05-29 12:57:52 +020014189static int
Victor Stinnera47082312012-10-04 02:19:54 +020014190formatfloat(PyObject *v, struct unicode_format_arg_t *arg,
14191 PyObject **p_output,
14192 _PyUnicodeWriter *writer)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014193{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014194 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014195 double x;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014196 Py_ssize_t len;
Victor Stinnera47082312012-10-04 02:19:54 +020014197 int prec;
14198 int dtoa_flags;
Tim Petersced69f82003-09-16 20:30:58 +000014199
Guido van Rossumd57fd912000-03-10 22:53:23 +000014200 x = PyFloat_AsDouble(v);
14201 if (x == -1.0 && PyErr_Occurred())
Victor Stinnerd3f08822012-05-29 12:57:52 +020014202 return -1;
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014203
Victor Stinnera47082312012-10-04 02:19:54 +020014204 prec = arg->prec;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014205 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000014206 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000014207
Victor Stinnera47082312012-10-04 02:19:54 +020014208 if (arg->flags & F_ALT)
14209 dtoa_flags = Py_DTSF_ALT;
14210 else
14211 dtoa_flags = 0;
14212 p = PyOS_double_to_string(x, arg->ch, prec, dtoa_flags, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014213 if (p == NULL)
Victor Stinnerd3f08822012-05-29 12:57:52 +020014214 return -1;
14215 len = strlen(p);
14216 if (writer) {
Victor Stinner4a587072013-11-19 12:54:53 +010014217 if (_PyUnicodeWriter_WriteASCIIString(writer, p, len) < 0) {
Christian Heimesf4f99392012-09-10 11:48:41 +020014218 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020014219 return -1;
Christian Heimesf4f99392012-09-10 11:48:41 +020014220 }
Victor Stinnerd3f08822012-05-29 12:57:52 +020014221 }
14222 else
14223 *p_output = _PyUnicode_FromASCII(p, len);
Eric Smith0923d1d2009-04-16 20:16:10 +000014224 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020014225 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014226}
14227
Victor Stinnerd0880d52012-04-27 23:40:13 +020014228/* formatlong() emulates the format codes d, u, o, x and X, and
14229 * the F_ALT flag, for Python's long (unbounded) ints. It's not used for
14230 * Python's regular ints.
14231 * Return value: a new PyUnicodeObject*, or NULL if error.
14232 * The output string is of the form
14233 * "-"? ("0x" | "0X")? digit+
14234 * "0x"/"0X" are present only for x and X conversions, with F_ALT
14235 * set in flags. The case of hex digits will be correct,
14236 * There will be at least prec digits, zero-filled on the left if
14237 * necessary to get that many.
14238 * val object to be converted
14239 * flags bitmask of format flags; only F_ALT is looked at
14240 * prec minimum number of digits; 0-fill on left if needed
14241 * type a character in [duoxX]; u acts the same as d
14242 *
14243 * CAUTION: o, x and X conversions on regular ints can never
14244 * produce a '-' sign, but can for Python's unbounded ints.
14245 */
Ethan Furmanb95b5612015-01-23 20:05:18 -080014246PyObject *
14247_PyUnicode_FormatLong(PyObject *val, int alt, int prec, int type)
Tim Peters38fd5b62000-09-21 05:43:11 +000014248{
Victor Stinnerd0880d52012-04-27 23:40:13 +020014249 PyObject *result = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014250 char *buf;
Victor Stinnerd0880d52012-04-27 23:40:13 +020014251 Py_ssize_t i;
14252 int sign; /* 1 if '-', else 0 */
14253 int len; /* number of characters */
14254 Py_ssize_t llen;
14255 int numdigits; /* len == numnondigits + numdigits */
14256 int numnondigits = 0;
Tim Peters38fd5b62000-09-21 05:43:11 +000014257
Victor Stinnerd0880d52012-04-27 23:40:13 +020014258 /* Avoid exceeding SSIZE_T_MAX */
14259 if (prec > INT_MAX-3) {
14260 PyErr_SetString(PyExc_OverflowError,
14261 "precision too large");
Benjamin Peterson14339b62009-01-31 16:36:08 +000014262 return NULL;
Victor Stinnerd0880d52012-04-27 23:40:13 +020014263 }
14264
14265 assert(PyLong_Check(val));
14266
14267 switch (type) {
Victor Stinner621ef3d2012-10-02 00:33:47 +020014268 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070014269 Py_UNREACHABLE();
Victor Stinnerd0880d52012-04-27 23:40:13 +020014270 case 'd':
Victor Stinner621ef3d2012-10-02 00:33:47 +020014271 case 'i':
Victor Stinnerd0880d52012-04-27 23:40:13 +020014272 case 'u':
Ethan Furmanfb137212013-08-31 10:18:55 -070014273 /* int and int subclasses should print numerically when a numeric */
14274 /* format code is used (see issue18780) */
14275 result = PyNumber_ToBase(val, 10);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014276 break;
14277 case 'o':
14278 numnondigits = 2;
14279 result = PyNumber_ToBase(val, 8);
14280 break;
14281 case 'x':
14282 case 'X':
14283 numnondigits = 2;
14284 result = PyNumber_ToBase(val, 16);
14285 break;
Victor Stinnerd0880d52012-04-27 23:40:13 +020014286 }
14287 if (!result)
14288 return NULL;
14289
14290 assert(unicode_modifiable(result));
14291 assert(PyUnicode_IS_READY(result));
14292 assert(PyUnicode_IS_ASCII(result));
14293
14294 /* To modify the string in-place, there can only be one reference. */
14295 if (Py_REFCNT(result) != 1) {
Christian Heimesd47802e2013-06-29 21:33:36 +020014296 Py_DECREF(result);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014297 PyErr_BadInternalCall();
14298 return NULL;
14299 }
14300 buf = PyUnicode_DATA(result);
14301 llen = PyUnicode_GET_LENGTH(result);
14302 if (llen > INT_MAX) {
Christian Heimesd47802e2013-06-29 21:33:36 +020014303 Py_DECREF(result);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014304 PyErr_SetString(PyExc_ValueError,
Ethan Furmanb95b5612015-01-23 20:05:18 -080014305 "string too large in _PyUnicode_FormatLong");
Victor Stinnerd0880d52012-04-27 23:40:13 +020014306 return NULL;
14307 }
14308 len = (int)llen;
14309 sign = buf[0] == '-';
14310 numnondigits += sign;
14311 numdigits = len - numnondigits;
14312 assert(numdigits > 0);
14313
14314 /* Get rid of base marker unless F_ALT */
Ethan Furmanb95b5612015-01-23 20:05:18 -080014315 if (((alt) == 0 &&
Victor Stinnerd0880d52012-04-27 23:40:13 +020014316 (type == 'o' || type == 'x' || type == 'X'))) {
14317 assert(buf[sign] == '0');
14318 assert(buf[sign+1] == 'x' || buf[sign+1] == 'X' ||
14319 buf[sign+1] == 'o');
14320 numnondigits -= 2;
14321 buf += 2;
14322 len -= 2;
14323 if (sign)
14324 buf[0] = '-';
14325 assert(len == numnondigits + numdigits);
14326 assert(numdigits > 0);
14327 }
14328
14329 /* Fill with leading zeroes to meet minimum width. */
14330 if (prec > numdigits) {
14331 PyObject *r1 = PyBytes_FromStringAndSize(NULL,
14332 numnondigits + prec);
14333 char *b1;
14334 if (!r1) {
14335 Py_DECREF(result);
14336 return NULL;
14337 }
14338 b1 = PyBytes_AS_STRING(r1);
14339 for (i = 0; i < numnondigits; ++i)
14340 *b1++ = *buf++;
14341 for (i = 0; i < prec - numdigits; i++)
14342 *b1++ = '0';
14343 for (i = 0; i < numdigits; i++)
14344 *b1++ = *buf++;
14345 *b1 = '\0';
14346 Py_DECREF(result);
14347 result = r1;
14348 buf = PyBytes_AS_STRING(result);
14349 len = numnondigits + prec;
14350 }
14351
14352 /* Fix up case for hex conversions. */
14353 if (type == 'X') {
14354 /* Need to convert all lower case letters to upper case.
14355 and need to convert 0x to 0X (and -0x to -0X). */
14356 for (i = 0; i < len; i++)
14357 if (buf[i] >= 'a' && buf[i] <= 'x')
14358 buf[i] -= 'a'-'A';
14359 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014360 if (!PyUnicode_Check(result)
14361 || buf != PyUnicode_DATA(result)) {
Victor Stinnerd0880d52012-04-27 23:40:13 +020014362 PyObject *unicode;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014363 unicode = _PyUnicode_FromASCII(buf, len);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014364 Py_DECREF(result);
14365 result = unicode;
14366 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014367 else if (len != PyUnicode_GET_LENGTH(result)) {
14368 if (PyUnicode_Resize(&result, len) < 0)
14369 Py_CLEAR(result);
14370 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000014371 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000014372}
14373
Ethan Furmandf3ed242014-01-05 06:50:30 -080014374/* Format an integer or a float as an integer.
Victor Stinner621ef3d2012-10-02 00:33:47 +020014375 * Return 1 if the number has been formatted into the writer,
Victor Stinnera47082312012-10-04 02:19:54 +020014376 * 0 if the number has been formatted into *p_output
Victor Stinner621ef3d2012-10-02 00:33:47 +020014377 * -1 and raise an exception on error */
14378static int
Victor Stinnera47082312012-10-04 02:19:54 +020014379mainformatlong(PyObject *v,
14380 struct unicode_format_arg_t *arg,
14381 PyObject **p_output,
14382 _PyUnicodeWriter *writer)
Victor Stinner621ef3d2012-10-02 00:33:47 +020014383{
14384 PyObject *iobj, *res;
Victor Stinnera47082312012-10-04 02:19:54 +020014385 char type = (char)arg->ch;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014386
14387 if (!PyNumber_Check(v))
14388 goto wrongtype;
14389
Ethan Furman9ab74802014-03-21 06:38:46 -070014390 /* make sure number is a type of integer for o, x, and X */
Victor Stinner621ef3d2012-10-02 00:33:47 +020014391 if (!PyLong_Check(v)) {
Ethan Furmandf3ed242014-01-05 06:50:30 -080014392 if (type == 'o' || type == 'x' || type == 'X') {
14393 iobj = PyNumber_Index(v);
14394 if (iobj == NULL) {
Ethan Furman9ab74802014-03-21 06:38:46 -070014395 if (PyErr_ExceptionMatches(PyExc_TypeError))
14396 goto wrongtype;
Ethan Furman38d872e2014-03-19 08:38:52 -070014397 return -1;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014398 }
14399 }
14400 else {
14401 iobj = PyNumber_Long(v);
14402 if (iobj == NULL ) {
14403 if (PyErr_ExceptionMatches(PyExc_TypeError))
14404 goto wrongtype;
14405 return -1;
14406 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014407 }
14408 assert(PyLong_Check(iobj));
14409 }
14410 else {
14411 iobj = v;
14412 Py_INCREF(iobj);
14413 }
14414
14415 if (PyLong_CheckExact(v)
Victor Stinnera47082312012-10-04 02:19:54 +020014416 && arg->width == -1 && arg->prec == -1
14417 && !(arg->flags & (F_SIGN | F_BLANK))
14418 && type != 'X')
Victor Stinner621ef3d2012-10-02 00:33:47 +020014419 {
14420 /* Fast path */
Victor Stinnera47082312012-10-04 02:19:54 +020014421 int alternate = arg->flags & F_ALT;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014422 int base;
14423
Victor Stinnera47082312012-10-04 02:19:54 +020014424 switch(type)
Victor Stinner621ef3d2012-10-02 00:33:47 +020014425 {
14426 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070014427 Py_UNREACHABLE();
Victor Stinner621ef3d2012-10-02 00:33:47 +020014428 case 'd':
14429 case 'i':
14430 case 'u':
14431 base = 10;
14432 break;
14433 case 'o':
14434 base = 8;
14435 break;
14436 case 'x':
14437 case 'X':
14438 base = 16;
14439 break;
14440 }
14441
Victor Stinnerc89d28f2012-10-02 12:54:07 +020014442 if (_PyLong_FormatWriter(writer, v, base, alternate) == -1) {
14443 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014444 return -1;
Victor Stinnerc89d28f2012-10-02 12:54:07 +020014445 }
14446 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014447 return 1;
14448 }
14449
Ethan Furmanb95b5612015-01-23 20:05:18 -080014450 res = _PyUnicode_FormatLong(iobj, arg->flags & F_ALT, arg->prec, type);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014451 Py_DECREF(iobj);
14452 if (res == NULL)
14453 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014454 *p_output = res;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014455 return 0;
14456
14457wrongtype:
Ethan Furman9ab74802014-03-21 06:38:46 -070014458 switch(type)
14459 {
14460 case 'o':
14461 case 'x':
14462 case 'X':
14463 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020014464 "%%%c format: an integer is required, "
14465 "not %.200s",
14466 type, Py_TYPE(v)->tp_name);
Ethan Furman9ab74802014-03-21 06:38:46 -070014467 break;
14468 default:
14469 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020014470 "%%%c format: a number is required, "
14471 "not %.200s",
14472 type, Py_TYPE(v)->tp_name);
Ethan Furman9ab74802014-03-21 06:38:46 -070014473 break;
14474 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014475 return -1;
14476}
14477
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014478static Py_UCS4
14479formatchar(PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014480{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000014481 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000014482 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014483 if (PyUnicode_GET_LENGTH(v) == 1) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014484 return PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000014485 }
Benjamin Peterson29060642009-01-31 22:14:21 +000014486 goto onError;
14487 }
14488 else {
Ethan Furmandf3ed242014-01-05 06:50:30 -080014489 PyObject *iobj;
Benjamin Peterson29060642009-01-31 22:14:21 +000014490 long x;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014491 /* make sure number is a type of integer */
14492 if (!PyLong_Check(v)) {
14493 iobj = PyNumber_Index(v);
14494 if (iobj == NULL) {
Ethan Furman38d872e2014-03-19 08:38:52 -070014495 goto onError;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014496 }
Xiang Zhangea1cf872016-12-22 15:30:47 +080014497 x = PyLong_AsLong(iobj);
Ethan Furmandf3ed242014-01-05 06:50:30 -080014498 Py_DECREF(iobj);
14499 }
Xiang Zhangea1cf872016-12-22 15:30:47 +080014500 else {
14501 x = PyLong_AsLong(v);
14502 }
Benjamin Peterson29060642009-01-31 22:14:21 +000014503 if (x == -1 && PyErr_Occurred())
14504 goto onError;
14505
Victor Stinner8faf8212011-12-08 22:14:11 +010014506 if (x < 0 || x > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014507 PyErr_SetString(PyExc_OverflowError,
14508 "%c arg not in range(0x110000)");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014509 return (Py_UCS4) -1;
Benjamin Peterson29060642009-01-31 22:14:21 +000014510 }
14511
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014512 return (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014513 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000014514
Benjamin Peterson29060642009-01-31 22:14:21 +000014515 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000014516 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000014517 "%c requires int or char");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014518 return (Py_UCS4) -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014519}
14520
Victor Stinnera47082312012-10-04 02:19:54 +020014521/* Parse options of an argument: flags, width, precision.
14522 Handle also "%(name)" syntax.
14523
14524 Return 0 if the argument has been formatted into arg->str.
14525 Return 1 if the argument has been written into ctx->writer,
14526 Raise an exception and return -1 on error. */
14527static int
14528unicode_format_arg_parse(struct unicode_formatter_t *ctx,
14529 struct unicode_format_arg_t *arg)
14530{
14531#define FORMAT_READ(ctx) \
14532 PyUnicode_READ((ctx)->fmtkind, (ctx)->fmtdata, (ctx)->fmtpos)
14533
14534 PyObject *v;
14535
Victor Stinnera47082312012-10-04 02:19:54 +020014536 if (arg->ch == '(') {
14537 /* Get argument value from a dictionary. Example: "%(name)s". */
14538 Py_ssize_t keystart;
14539 Py_ssize_t keylen;
14540 PyObject *key;
14541 int pcount = 1;
14542
14543 if (ctx->dict == NULL) {
14544 PyErr_SetString(PyExc_TypeError,
14545 "format requires a mapping");
14546 return -1;
14547 }
14548 ++ctx->fmtpos;
14549 --ctx->fmtcnt;
14550 keystart = ctx->fmtpos;
14551 /* Skip over balanced parentheses */
14552 while (pcount > 0 && --ctx->fmtcnt >= 0) {
14553 arg->ch = FORMAT_READ(ctx);
14554 if (arg->ch == ')')
14555 --pcount;
14556 else if (arg->ch == '(')
14557 ++pcount;
14558 ctx->fmtpos++;
14559 }
14560 keylen = ctx->fmtpos - keystart - 1;
14561 if (ctx->fmtcnt < 0 || pcount > 0) {
14562 PyErr_SetString(PyExc_ValueError,
14563 "incomplete format key");
14564 return -1;
14565 }
14566 key = PyUnicode_Substring(ctx->fmtstr,
14567 keystart, keystart + keylen);
14568 if (key == NULL)
14569 return -1;
14570 if (ctx->args_owned) {
Victor Stinnera47082312012-10-04 02:19:54 +020014571 ctx->args_owned = 0;
Serhiy Storchaka191321d2015-12-27 15:41:34 +020014572 Py_DECREF(ctx->args);
Victor Stinnera47082312012-10-04 02:19:54 +020014573 }
14574 ctx->args = PyObject_GetItem(ctx->dict, key);
14575 Py_DECREF(key);
14576 if (ctx->args == NULL)
14577 return -1;
14578 ctx->args_owned = 1;
14579 ctx->arglen = -1;
14580 ctx->argidx = -2;
14581 }
14582
14583 /* Parse flags. Example: "%+i" => flags=F_SIGN. */
Victor Stinnera47082312012-10-04 02:19:54 +020014584 while (--ctx->fmtcnt >= 0) {
14585 arg->ch = FORMAT_READ(ctx);
14586 ctx->fmtpos++;
14587 switch (arg->ch) {
14588 case '-': arg->flags |= F_LJUST; continue;
14589 case '+': arg->flags |= F_SIGN; continue;
14590 case ' ': arg->flags |= F_BLANK; continue;
14591 case '#': arg->flags |= F_ALT; continue;
14592 case '0': arg->flags |= F_ZERO; continue;
14593 }
14594 break;
14595 }
14596
14597 /* Parse width. Example: "%10s" => width=10 */
Victor Stinnera47082312012-10-04 02:19:54 +020014598 if (arg->ch == '*') {
14599 v = unicode_format_getnextarg(ctx);
14600 if (v == NULL)
14601 return -1;
14602 if (!PyLong_Check(v)) {
14603 PyErr_SetString(PyExc_TypeError,
14604 "* wants int");
14605 return -1;
14606 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020014607 arg->width = PyLong_AsSsize_t(v);
Victor Stinnera47082312012-10-04 02:19:54 +020014608 if (arg->width == -1 && PyErr_Occurred())
14609 return -1;
14610 if (arg->width < 0) {
14611 arg->flags |= F_LJUST;
14612 arg->width = -arg->width;
14613 }
14614 if (--ctx->fmtcnt >= 0) {
14615 arg->ch = FORMAT_READ(ctx);
14616 ctx->fmtpos++;
14617 }
14618 }
14619 else if (arg->ch >= '0' && arg->ch <= '9') {
14620 arg->width = arg->ch - '0';
14621 while (--ctx->fmtcnt >= 0) {
14622 arg->ch = FORMAT_READ(ctx);
14623 ctx->fmtpos++;
14624 if (arg->ch < '0' || arg->ch > '9')
14625 break;
14626 /* Since arg->ch is unsigned, the RHS would end up as unsigned,
14627 mixing signed and unsigned comparison. Since arg->ch is between
14628 '0' and '9', casting to int is safe. */
14629 if (arg->width > (PY_SSIZE_T_MAX - ((int)arg->ch - '0')) / 10) {
14630 PyErr_SetString(PyExc_ValueError,
14631 "width too big");
14632 return -1;
14633 }
14634 arg->width = arg->width*10 + (arg->ch - '0');
14635 }
14636 }
14637
14638 /* Parse precision. Example: "%.3f" => prec=3 */
Victor Stinnera47082312012-10-04 02:19:54 +020014639 if (arg->ch == '.') {
14640 arg->prec = 0;
14641 if (--ctx->fmtcnt >= 0) {
14642 arg->ch = FORMAT_READ(ctx);
14643 ctx->fmtpos++;
14644 }
14645 if (arg->ch == '*') {
14646 v = unicode_format_getnextarg(ctx);
14647 if (v == NULL)
14648 return -1;
14649 if (!PyLong_Check(v)) {
14650 PyErr_SetString(PyExc_TypeError,
14651 "* wants int");
14652 return -1;
14653 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020014654 arg->prec = _PyLong_AsInt(v);
Victor Stinnera47082312012-10-04 02:19:54 +020014655 if (arg->prec == -1 && PyErr_Occurred())
14656 return -1;
14657 if (arg->prec < 0)
14658 arg->prec = 0;
14659 if (--ctx->fmtcnt >= 0) {
14660 arg->ch = FORMAT_READ(ctx);
14661 ctx->fmtpos++;
14662 }
14663 }
14664 else if (arg->ch >= '0' && arg->ch <= '9') {
14665 arg->prec = arg->ch - '0';
14666 while (--ctx->fmtcnt >= 0) {
14667 arg->ch = FORMAT_READ(ctx);
14668 ctx->fmtpos++;
14669 if (arg->ch < '0' || arg->ch > '9')
14670 break;
14671 if (arg->prec > (INT_MAX - ((int)arg->ch - '0')) / 10) {
14672 PyErr_SetString(PyExc_ValueError,
Victor Stinner3921e902012-10-06 23:05:00 +020014673 "precision too big");
Victor Stinnera47082312012-10-04 02:19:54 +020014674 return -1;
14675 }
14676 arg->prec = arg->prec*10 + (arg->ch - '0');
14677 }
14678 }
14679 }
14680
14681 /* Ignore "h", "l" and "L" format prefix (ex: "%hi" or "%ls") */
14682 if (ctx->fmtcnt >= 0) {
14683 if (arg->ch == 'h' || arg->ch == 'l' || arg->ch == 'L') {
14684 if (--ctx->fmtcnt >= 0) {
14685 arg->ch = FORMAT_READ(ctx);
14686 ctx->fmtpos++;
14687 }
14688 }
14689 }
14690 if (ctx->fmtcnt < 0) {
14691 PyErr_SetString(PyExc_ValueError,
14692 "incomplete format");
14693 return -1;
14694 }
14695 return 0;
14696
14697#undef FORMAT_READ
14698}
14699
14700/* Format one argument. Supported conversion specifiers:
14701
14702 - "s", "r", "a": any type
Ethan Furmandf3ed242014-01-05 06:50:30 -080014703 - "i", "d", "u": int or float
14704 - "o", "x", "X": int
Victor Stinnera47082312012-10-04 02:19:54 +020014705 - "e", "E", "f", "F", "g", "G": float
14706 - "c": int or str (1 character)
14707
Victor Stinner8dbd4212012-12-04 09:30:24 +010014708 When possible, the output is written directly into the Unicode writer
14709 (ctx->writer). A string is created when padding is required.
14710
Victor Stinnera47082312012-10-04 02:19:54 +020014711 Return 0 if the argument has been formatted into *p_str,
14712 1 if the argument has been written into ctx->writer,
Victor Stinner8dbd4212012-12-04 09:30:24 +010014713 -1 on error. */
Victor Stinnera47082312012-10-04 02:19:54 +020014714static int
14715unicode_format_arg_format(struct unicode_formatter_t *ctx,
14716 struct unicode_format_arg_t *arg,
14717 PyObject **p_str)
14718{
14719 PyObject *v;
14720 _PyUnicodeWriter *writer = &ctx->writer;
14721
14722 if (ctx->fmtcnt == 0)
14723 ctx->writer.overallocate = 0;
14724
Victor Stinnera47082312012-10-04 02:19:54 +020014725 v = unicode_format_getnextarg(ctx);
14726 if (v == NULL)
14727 return -1;
14728
Victor Stinnera47082312012-10-04 02:19:54 +020014729
14730 switch (arg->ch) {
Victor Stinnera47082312012-10-04 02:19:54 +020014731 case 's':
14732 case 'r':
14733 case 'a':
14734 if (PyLong_CheckExact(v) && arg->width == -1 && arg->prec == -1) {
14735 /* Fast path */
14736 if (_PyLong_FormatWriter(writer, v, 10, arg->flags & F_ALT) == -1)
14737 return -1;
14738 return 1;
14739 }
14740
14741 if (PyUnicode_CheckExact(v) && arg->ch == 's') {
14742 *p_str = v;
14743 Py_INCREF(*p_str);
14744 }
14745 else {
14746 if (arg->ch == 's')
14747 *p_str = PyObject_Str(v);
14748 else if (arg->ch == 'r')
14749 *p_str = PyObject_Repr(v);
14750 else
14751 *p_str = PyObject_ASCII(v);
14752 }
14753 break;
14754
14755 case 'i':
14756 case 'd':
14757 case 'u':
14758 case 'o':
14759 case 'x':
14760 case 'X':
14761 {
14762 int ret = mainformatlong(v, arg, p_str, writer);
14763 if (ret != 0)
14764 return ret;
14765 arg->sign = 1;
14766 break;
14767 }
14768
14769 case 'e':
14770 case 'E':
14771 case 'f':
14772 case 'F':
14773 case 'g':
14774 case 'G':
14775 if (arg->width == -1 && arg->prec == -1
14776 && !(arg->flags & (F_SIGN | F_BLANK)))
14777 {
14778 /* Fast path */
14779 if (formatfloat(v, arg, NULL, writer) == -1)
14780 return -1;
14781 return 1;
14782 }
14783
14784 arg->sign = 1;
14785 if (formatfloat(v, arg, p_str, NULL) == -1)
14786 return -1;
14787 break;
14788
14789 case 'c':
14790 {
14791 Py_UCS4 ch = formatchar(v);
14792 if (ch == (Py_UCS4) -1)
14793 return -1;
14794 if (arg->width == -1 && arg->prec == -1) {
14795 /* Fast path */
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020014796 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0)
Victor Stinnera47082312012-10-04 02:19:54 +020014797 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014798 return 1;
14799 }
14800 *p_str = PyUnicode_FromOrdinal(ch);
14801 break;
14802 }
14803
14804 default:
14805 PyErr_Format(PyExc_ValueError,
14806 "unsupported format character '%c' (0x%x) "
Victor Stinnera33bce02014-07-04 22:47:46 +020014807 "at index %zd",
Victor Stinnera47082312012-10-04 02:19:54 +020014808 (31<=arg->ch && arg->ch<=126) ? (char)arg->ch : '?',
14809 (int)arg->ch,
14810 ctx->fmtpos - 1);
14811 return -1;
14812 }
14813 if (*p_str == NULL)
14814 return -1;
14815 assert (PyUnicode_Check(*p_str));
14816 return 0;
14817}
14818
14819static int
14820unicode_format_arg_output(struct unicode_formatter_t *ctx,
14821 struct unicode_format_arg_t *arg,
14822 PyObject *str)
14823{
14824 Py_ssize_t len;
14825 enum PyUnicode_Kind kind;
14826 void *pbuf;
14827 Py_ssize_t pindex;
14828 Py_UCS4 signchar;
14829 Py_ssize_t buflen;
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014830 Py_UCS4 maxchar;
Victor Stinnera47082312012-10-04 02:19:54 +020014831 Py_ssize_t sublen;
14832 _PyUnicodeWriter *writer = &ctx->writer;
14833 Py_UCS4 fill;
14834
14835 fill = ' ';
14836 if (arg->sign && arg->flags & F_ZERO)
14837 fill = '0';
14838
14839 if (PyUnicode_READY(str) == -1)
14840 return -1;
14841
14842 len = PyUnicode_GET_LENGTH(str);
14843 if ((arg->width == -1 || arg->width <= len)
14844 && (arg->prec == -1 || arg->prec >= len)
14845 && !(arg->flags & (F_SIGN | F_BLANK)))
14846 {
14847 /* Fast path */
14848 if (_PyUnicodeWriter_WriteStr(writer, str) == -1)
14849 return -1;
14850 return 0;
14851 }
14852
14853 /* Truncate the string for "s", "r" and "a" formats
14854 if the precision is set */
14855 if (arg->ch == 's' || arg->ch == 'r' || arg->ch == 'a') {
14856 if (arg->prec >= 0 && len > arg->prec)
14857 len = arg->prec;
14858 }
14859
14860 /* Adjust sign and width */
14861 kind = PyUnicode_KIND(str);
14862 pbuf = PyUnicode_DATA(str);
14863 pindex = 0;
14864 signchar = '\0';
14865 if (arg->sign) {
14866 Py_UCS4 ch = PyUnicode_READ(kind, pbuf, pindex);
14867 if (ch == '-' || ch == '+') {
14868 signchar = ch;
14869 len--;
14870 pindex++;
14871 }
14872 else if (arg->flags & F_SIGN)
14873 signchar = '+';
14874 else if (arg->flags & F_BLANK)
14875 signchar = ' ';
14876 else
14877 arg->sign = 0;
14878 }
14879 if (arg->width < len)
14880 arg->width = len;
14881
14882 /* Prepare the writer */
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014883 maxchar = writer->maxchar;
Victor Stinnera47082312012-10-04 02:19:54 +020014884 if (!(arg->flags & F_LJUST)) {
14885 if (arg->sign) {
14886 if ((arg->width-1) > len)
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014887 maxchar = Py_MAX(maxchar, fill);
Victor Stinnera47082312012-10-04 02:19:54 +020014888 }
14889 else {
14890 if (arg->width > len)
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014891 maxchar = Py_MAX(maxchar, fill);
Victor Stinnera47082312012-10-04 02:19:54 +020014892 }
14893 }
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014894 if (PyUnicode_MAX_CHAR_VALUE(str) > maxchar) {
14895 Py_UCS4 strmaxchar = _PyUnicode_FindMaxChar(str, 0, pindex+len);
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014896 maxchar = Py_MAX(maxchar, strmaxchar);
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014897 }
14898
Victor Stinnera47082312012-10-04 02:19:54 +020014899 buflen = arg->width;
14900 if (arg->sign && len == arg->width)
14901 buflen++;
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014902 if (_PyUnicodeWriter_Prepare(writer, buflen, maxchar) == -1)
Victor Stinnera47082312012-10-04 02:19:54 +020014903 return -1;
14904
14905 /* Write the sign if needed */
14906 if (arg->sign) {
14907 if (fill != ' ') {
14908 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
14909 writer->pos += 1;
14910 }
14911 if (arg->width > len)
14912 arg->width--;
14913 }
14914
14915 /* Write the numeric prefix for "x", "X" and "o" formats
14916 if the alternate form is used.
14917 For example, write "0x" for the "%#x" format. */
14918 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
14919 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
14920 assert(PyUnicode_READ(kind, pbuf, pindex + 1) == arg->ch);
14921 if (fill != ' ') {
14922 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
14923 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
14924 writer->pos += 2;
14925 pindex += 2;
14926 }
14927 arg->width -= 2;
14928 if (arg->width < 0)
14929 arg->width = 0;
14930 len -= 2;
14931 }
14932
14933 /* Pad left with the fill character if needed */
14934 if (arg->width > len && !(arg->flags & F_LJUST)) {
14935 sublen = arg->width - len;
Victor Stinner59423e32018-11-26 13:40:01 +010014936 unicode_fill(writer->kind, writer->data, fill, writer->pos, sublen);
Victor Stinnera47082312012-10-04 02:19:54 +020014937 writer->pos += sublen;
14938 arg->width = len;
14939 }
14940
14941 /* If padding with spaces: write sign if needed and/or numeric prefix if
14942 the alternate form is used */
14943 if (fill == ' ') {
14944 if (arg->sign) {
14945 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
14946 writer->pos += 1;
14947 }
14948 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
14949 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
14950 assert(PyUnicode_READ(kind, pbuf, pindex+1) == arg->ch);
14951 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
14952 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
14953 writer->pos += 2;
14954 pindex += 2;
14955 }
14956 }
14957
14958 /* Write characters */
14959 if (len) {
14960 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
14961 str, pindex, len);
14962 writer->pos += len;
14963 }
14964
14965 /* Pad right with the fill character if needed */
14966 if (arg->width > len) {
14967 sublen = arg->width - len;
Victor Stinner59423e32018-11-26 13:40:01 +010014968 unicode_fill(writer->kind, writer->data, ' ', writer->pos, sublen);
Victor Stinnera47082312012-10-04 02:19:54 +020014969 writer->pos += sublen;
14970 }
14971 return 0;
14972}
14973
14974/* Helper of PyUnicode_Format(): format one arg.
14975 Return 0 on success, raise an exception and return -1 on error. */
14976static int
14977unicode_format_arg(struct unicode_formatter_t *ctx)
14978{
14979 struct unicode_format_arg_t arg;
14980 PyObject *str;
14981 int ret;
14982
Victor Stinner8dbd4212012-12-04 09:30:24 +010014983 arg.ch = PyUnicode_READ(ctx->fmtkind, ctx->fmtdata, ctx->fmtpos);
Serhiy Storchaka9f8ad3f2017-03-08 05:51:19 +020014984 if (arg.ch == '%') {
14985 ctx->fmtpos++;
14986 ctx->fmtcnt--;
14987 if (_PyUnicodeWriter_WriteCharInline(&ctx->writer, '%') < 0)
14988 return -1;
14989 return 0;
14990 }
Victor Stinner8dbd4212012-12-04 09:30:24 +010014991 arg.flags = 0;
14992 arg.width = -1;
14993 arg.prec = -1;
14994 arg.sign = 0;
14995 str = NULL;
14996
Victor Stinnera47082312012-10-04 02:19:54 +020014997 ret = unicode_format_arg_parse(ctx, &arg);
14998 if (ret == -1)
14999 return -1;
15000
15001 ret = unicode_format_arg_format(ctx, &arg, &str);
15002 if (ret == -1)
15003 return -1;
15004
15005 if (ret != 1) {
15006 ret = unicode_format_arg_output(ctx, &arg, str);
15007 Py_DECREF(str);
15008 if (ret == -1)
15009 return -1;
15010 }
15011
Serhiy Storchaka9f8ad3f2017-03-08 05:51:19 +020015012 if (ctx->dict && (ctx->argidx < ctx->arglen)) {
Victor Stinnera47082312012-10-04 02:19:54 +020015013 PyErr_SetString(PyExc_TypeError,
15014 "not all arguments converted during string formatting");
15015 return -1;
15016 }
15017 return 0;
15018}
15019
Alexander Belopolsky40018472011-02-26 01:02:56 +000015020PyObject *
15021PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000015022{
Victor Stinnera47082312012-10-04 02:19:54 +020015023 struct unicode_formatter_t ctx;
Tim Petersced69f82003-09-16 20:30:58 +000015024
Guido van Rossumd57fd912000-03-10 22:53:23 +000015025 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000015026 PyErr_BadInternalCall();
15027 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015028 }
Victor Stinnera47082312012-10-04 02:19:54 +020015029
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030015030 if (ensure_unicode(format) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000015031 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030015032
15033 ctx.fmtstr = format;
Victor Stinnera47082312012-10-04 02:19:54 +020015034 ctx.fmtdata = PyUnicode_DATA(ctx.fmtstr);
15035 ctx.fmtkind = PyUnicode_KIND(ctx.fmtstr);
15036 ctx.fmtcnt = PyUnicode_GET_LENGTH(ctx.fmtstr);
15037 ctx.fmtpos = 0;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020015038
Victor Stinner8f674cc2013-04-17 23:02:17 +020015039 _PyUnicodeWriter_Init(&ctx.writer);
15040 ctx.writer.min_length = ctx.fmtcnt + 100;
15041 ctx.writer.overallocate = 1;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020015042
Guido van Rossumd57fd912000-03-10 22:53:23 +000015043 if (PyTuple_Check(args)) {
Victor Stinnera47082312012-10-04 02:19:54 +020015044 ctx.arglen = PyTuple_Size(args);
15045 ctx.argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015046 }
15047 else {
Victor Stinnera47082312012-10-04 02:19:54 +020015048 ctx.arglen = -1;
15049 ctx.argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015050 }
Victor Stinnera47082312012-10-04 02:19:54 +020015051 ctx.args_owned = 0;
Benjamin Peterson28a6cfa2012-08-28 17:55:35 -040015052 if (PyMapping_Check(args) && !PyTuple_Check(args) && !PyUnicode_Check(args))
Victor Stinnera47082312012-10-04 02:19:54 +020015053 ctx.dict = args;
15054 else
15055 ctx.dict = NULL;
15056 ctx.args = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015057
Victor Stinnera47082312012-10-04 02:19:54 +020015058 while (--ctx.fmtcnt >= 0) {
15059 if (PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
Victor Stinnercfc4c132013-04-03 01:48:39 +020015060 Py_ssize_t nonfmtpos;
Victor Stinnera47082312012-10-04 02:19:54 +020015061
15062 nonfmtpos = ctx.fmtpos++;
15063 while (ctx.fmtcnt >= 0 &&
15064 PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
15065 ctx.fmtpos++;
15066 ctx.fmtcnt--;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015067 }
Victor Stinnera47082312012-10-04 02:19:54 +020015068 if (ctx.fmtcnt < 0) {
15069 ctx.fmtpos--;
15070 ctx.writer.overallocate = 0;
Victor Stinnera0494432012-10-03 23:03:46 +020015071 }
Victor Stinneree4544c2012-05-09 22:24:08 +020015072
Victor Stinnercfc4c132013-04-03 01:48:39 +020015073 if (_PyUnicodeWriter_WriteSubstring(&ctx.writer, ctx.fmtstr,
15074 nonfmtpos, ctx.fmtpos) < 0)
15075 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015076 }
15077 else {
Victor Stinnera47082312012-10-04 02:19:54 +020015078 ctx.fmtpos++;
15079 if (unicode_format_arg(&ctx) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000015080 goto onError;
Victor Stinnera47082312012-10-04 02:19:54 +020015081 }
15082 }
Victor Stinneraff3cc62012-04-30 05:19:21 +020015083
Victor Stinnera47082312012-10-04 02:19:54 +020015084 if (ctx.argidx < ctx.arglen && !ctx.dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000015085 PyErr_SetString(PyExc_TypeError,
15086 "not all arguments converted during string formatting");
15087 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015088 }
15089
Victor Stinnera47082312012-10-04 02:19:54 +020015090 if (ctx.args_owned) {
15091 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000015092 }
Victor Stinnera47082312012-10-04 02:19:54 +020015093 return _PyUnicodeWriter_Finish(&ctx.writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +000015094
Benjamin Peterson29060642009-01-31 22:14:21 +000015095 onError:
Victor Stinnera47082312012-10-04 02:19:54 +020015096 _PyUnicodeWriter_Dealloc(&ctx.writer);
15097 if (ctx.args_owned) {
15098 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000015099 }
15100 return NULL;
15101}
15102
Jeremy Hylton938ace62002-07-17 16:30:39 +000015103static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000015104unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
15105
Tim Peters6d6c1a32001-08-02 04:15:00 +000015106static PyObject *
15107unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
15108{
Benjamin Peterson29060642009-01-31 22:14:21 +000015109 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015110 static char *kwlist[] = {"object", "encoding", "errors", 0};
15111 char *encoding = NULL;
15112 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000015113
Benjamin Peterson14339b62009-01-31 16:36:08 +000015114 if (type != &PyUnicode_Type)
15115 return unicode_subtype_new(type, args, kwds);
15116 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000015117 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000015118 return NULL;
15119 if (x == NULL)
Serhiy Storchaka678db842013-01-26 12:16:36 +020015120 _Py_RETURN_UNICODE_EMPTY();
Benjamin Peterson14339b62009-01-31 16:36:08 +000015121 if (encoding == NULL && errors == NULL)
15122 return PyObject_Str(x);
15123 else
Benjamin Peterson29060642009-01-31 22:14:21 +000015124 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000015125}
15126
Guido van Rossume023fe02001-08-30 03:12:59 +000015127static PyObject *
15128unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
15129{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015130 PyObject *unicode, *self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015131 Py_ssize_t length, char_size;
15132 int share_wstr, share_utf8;
15133 unsigned int kind;
15134 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000015135
Benjamin Peterson14339b62009-01-31 16:36:08 +000015136 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015137
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015138 unicode = unicode_new(&PyUnicode_Type, args, kwds);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015139 if (unicode == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000015140 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020015141 assert(_PyUnicode_CHECK(unicode));
Benjamin Petersonbac79492012-01-14 13:34:47 -050015142 if (PyUnicode_READY(unicode) == -1) {
Benjamin Peterson22a29702012-01-02 09:00:30 -060015143 Py_DECREF(unicode);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015144 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060015145 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015146
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015147 self = type->tp_alloc(type, 0);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015148 if (self == NULL) {
15149 Py_DECREF(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015150 return NULL;
15151 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015152 kind = PyUnicode_KIND(unicode);
15153 length = PyUnicode_GET_LENGTH(unicode);
15154
15155 _PyUnicode_LENGTH(self) = length;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020015156#ifdef Py_DEBUG
15157 _PyUnicode_HASH(self) = -1;
15158#else
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015159 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020015160#endif
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015161 _PyUnicode_STATE(self).interned = 0;
15162 _PyUnicode_STATE(self).kind = kind;
15163 _PyUnicode_STATE(self).compact = 0;
Victor Stinner3cf46372011-10-03 14:42:15 +020015164 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015165 _PyUnicode_STATE(self).ready = 1;
15166 _PyUnicode_WSTR(self) = NULL;
15167 _PyUnicode_UTF8_LENGTH(self) = 0;
15168 _PyUnicode_UTF8(self) = NULL;
15169 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020015170 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015171
15172 share_utf8 = 0;
15173 share_wstr = 0;
15174 if (kind == PyUnicode_1BYTE_KIND) {
15175 char_size = 1;
15176 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
15177 share_utf8 = 1;
15178 }
15179 else if (kind == PyUnicode_2BYTE_KIND) {
15180 char_size = 2;
15181 if (sizeof(wchar_t) == 2)
15182 share_wstr = 1;
15183 }
15184 else {
15185 assert(kind == PyUnicode_4BYTE_KIND);
15186 char_size = 4;
15187 if (sizeof(wchar_t) == 4)
15188 share_wstr = 1;
15189 }
15190
15191 /* Ensure we won't overflow the length. */
15192 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
15193 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015194 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015195 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015196 data = PyObject_MALLOC((length + 1) * char_size);
15197 if (data == NULL) {
15198 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015199 goto onError;
15200 }
15201
Victor Stinnerc3c74152011-10-02 20:39:55 +020015202 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015203 if (share_utf8) {
15204 _PyUnicode_UTF8_LENGTH(self) = length;
15205 _PyUnicode_UTF8(self) = data;
15206 }
15207 if (share_wstr) {
15208 _PyUnicode_WSTR_LENGTH(self) = length;
15209 _PyUnicode_WSTR(self) = (wchar_t *)data;
15210 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015211
Christian Heimesf051e432016-09-13 20:22:02 +020015212 memcpy(data, PyUnicode_DATA(unicode),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020015213 kind * (length + 1));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020015214 assert(_PyUnicode_CheckConsistency(self, 1));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020015215#ifdef Py_DEBUG
15216 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
15217#endif
Victor Stinnerdd18d3a2011-10-22 11:08:10 +020015218 Py_DECREF(unicode);
Victor Stinner7931d9a2011-11-04 00:22:48 +010015219 return self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015220
15221onError:
15222 Py_DECREF(unicode);
15223 Py_DECREF(self);
15224 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000015225}
15226
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000015227PyDoc_STRVAR(unicode_doc,
Chris Jerdonek83fe2e12012-10-07 14:48:36 -070015228"str(object='') -> str\n\
15229str(bytes_or_buffer[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +000015230\n\
Nick Coghlan573b1fd2012-08-16 14:13:07 +100015231Create a new string object from the given object. If encoding or\n\
15232errors is specified, then the object must expose a data buffer\n\
15233that will be decoded using the given encoding and error handler.\n\
15234Otherwise, returns the result of object.__str__() (if defined)\n\
15235or repr(object).\n\
15236encoding defaults to sys.getdefaultencoding().\n\
15237errors defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000015238
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015239static PyObject *unicode_iter(PyObject *seq);
15240
Guido van Rossumd57fd912000-03-10 22:53:23 +000015241PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +000015242 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Bupfc93bd42018-06-19 03:59:55 -050015243 "str", /* tp_name */
15244 sizeof(PyUnicodeObject), /* tp_basicsize */
15245 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +000015246 /* Slots */
Bupfc93bd42018-06-19 03:59:55 -050015247 (destructor)unicode_dealloc, /* tp_dealloc */
Jeroen Demeyer530f5062019-05-31 04:13:39 +020015248 0, /* tp_vectorcall_offset */
Bupfc93bd42018-06-19 03:59:55 -050015249 0, /* tp_getattr */
15250 0, /* tp_setattr */
Jeroen Demeyer530f5062019-05-31 04:13:39 +020015251 0, /* tp_as_async */
Bupfc93bd42018-06-19 03:59:55 -050015252 unicode_repr, /* tp_repr */
15253 &unicode_as_number, /* tp_as_number */
15254 &unicode_as_sequence, /* tp_as_sequence */
15255 &unicode_as_mapping, /* tp_as_mapping */
15256 (hashfunc) unicode_hash, /* tp_hash*/
15257 0, /* tp_call*/
15258 (reprfunc) unicode_str, /* tp_str */
15259 PyObject_GenericGetAttr, /* tp_getattro */
15260 0, /* tp_setattro */
15261 0, /* tp_as_buffer */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015262 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Bupfc93bd42018-06-19 03:59:55 -050015263 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
15264 unicode_doc, /* tp_doc */
15265 0, /* tp_traverse */
15266 0, /* tp_clear */
15267 PyUnicode_RichCompare, /* tp_richcompare */
15268 0, /* tp_weaklistoffset */
15269 unicode_iter, /* tp_iter */
15270 0, /* tp_iternext */
15271 unicode_methods, /* tp_methods */
15272 0, /* tp_members */
15273 0, /* tp_getset */
15274 &PyBaseObject_Type, /* tp_base */
15275 0, /* tp_dict */
15276 0, /* tp_descr_get */
15277 0, /* tp_descr_set */
15278 0, /* tp_dictoffset */
15279 0, /* tp_init */
15280 0, /* tp_alloc */
15281 unicode_new, /* tp_new */
15282 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +000015283};
15284
15285/* Initialize the Unicode implementation */
15286
Victor Stinner331a6a52019-05-27 16:39:22 +020015287PyStatus
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015288_PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000015289{
Thomas Wouters477c8d52006-05-27 19:21:47 +000015290 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015291 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000015292 0x000A, /* LINE FEED */
15293 0x000D, /* CARRIAGE RETURN */
15294 0x001C, /* FILE SEPARATOR */
15295 0x001D, /* GROUP SEPARATOR */
15296 0x001E, /* RECORD SEPARATOR */
15297 0x0085, /* NEXT LINE */
15298 0x2028, /* LINE SEPARATOR */
15299 0x2029, /* PARAGRAPH SEPARATOR */
15300 };
15301
Fred Drakee4315f52000-05-09 19:53:39 +000015302 /* Init the implementation */
Serhiy Storchaka678db842013-01-26 12:16:36 +020015303 _Py_INCREF_UNICODE_EMPTY();
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015304 if (!unicode_empty) {
Victor Stinner331a6a52019-05-27 16:39:22 +020015305 return _PyStatus_ERR("Can't create empty string");
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015306 }
Serhiy Storchaka678db842013-01-26 12:16:36 +020015307 Py_DECREF(unicode_empty);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000015308
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015309 if (PyType_Ready(&PyUnicode_Type) < 0) {
Victor Stinner331a6a52019-05-27 16:39:22 +020015310 return _PyStatus_ERR("Can't initialize unicode type");
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015311 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000015312
15313 /* initialize the linebreak bloom filter */
15314 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015315 PyUnicode_2BYTE_KIND, linebreak,
Victor Stinner63941882011-09-29 00:42:28 +020015316 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000015317
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015318 if (PyType_Ready(&EncodingMapType) < 0) {
Victor Stinner331a6a52019-05-27 16:39:22 +020015319 return _PyStatus_ERR("Can't initialize encoding map type");
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015320 }
15321 if (PyType_Ready(&PyFieldNameIter_Type) < 0) {
Victor Stinner331a6a52019-05-27 16:39:22 +020015322 return _PyStatus_ERR("Can't initialize field name iterator type");
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015323 }
15324 if (PyType_Ready(&PyFormatterIter_Type) < 0) {
Victor Stinner331a6a52019-05-27 16:39:22 +020015325 return _PyStatus_ERR("Can't initialize formatter iter type");
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015326 }
Victor Stinner331a6a52019-05-27 16:39:22 +020015327 return _PyStatus_OK();
Guido van Rossumd57fd912000-03-10 22:53:23 +000015328}
15329
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000015330
Walter Dörwald16807132007-05-25 13:52:07 +000015331void
15332PyUnicode_InternInPlace(PyObject **p)
15333{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020015334 PyObject *s = *p;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015335 PyObject *t;
Victor Stinner4fae54c2011-10-03 02:01:52 +020015336#ifdef Py_DEBUG
15337 assert(s != NULL);
15338 assert(_PyUnicode_CHECK(s));
15339#else
Benjamin Peterson14339b62009-01-31 16:36:08 +000015340 if (s == NULL || !PyUnicode_Check(s))
Victor Stinner4fae54c2011-10-03 02:01:52 +020015341 return;
15342#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +000015343 /* If it's a subclass, we don't really know what putting
15344 it in the interned dict might do. */
15345 if (!PyUnicode_CheckExact(s))
15346 return;
15347 if (PyUnicode_CHECK_INTERNED(s))
15348 return;
15349 if (interned == NULL) {
15350 interned = PyDict_New();
15351 if (interned == NULL) {
15352 PyErr_Clear(); /* Don't leave an exception */
15353 return;
15354 }
15355 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000015356 Py_ALLOW_RECURSION
Berker Peksagced8d4c2016-07-25 04:40:39 +030015357 t = PyDict_SetDefault(interned, s, s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015358 Py_END_ALLOW_RECURSION
Berker Peksagced8d4c2016-07-25 04:40:39 +030015359 if (t == NULL) {
15360 PyErr_Clear();
15361 return;
15362 }
15363 if (t != s) {
Victor Stinnerf0335102013-04-14 19:13:03 +020015364 Py_INCREF(t);
Serhiy Storchaka57a01d32016-04-10 18:05:40 +030015365 Py_SETREF(*p, t);
Victor Stinnerf0335102013-04-14 19:13:03 +020015366 return;
15367 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000015368 /* The two references in interned are not counted by refcnt.
15369 The deallocator will take care of this */
15370 Py_REFCNT(s) -= 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015371 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000015372}
15373
15374void
15375PyUnicode_InternImmortal(PyObject **p)
15376{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015377 PyUnicode_InternInPlace(p);
15378 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Victor Stinneraf9e4b82011-10-23 20:07:00 +020015379 _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015380 Py_INCREF(*p);
15381 }
Walter Dörwald16807132007-05-25 13:52:07 +000015382}
15383
15384PyObject *
15385PyUnicode_InternFromString(const char *cp)
15386{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015387 PyObject *s = PyUnicode_FromString(cp);
15388 if (s == NULL)
15389 return NULL;
15390 PyUnicode_InternInPlace(&s);
15391 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000015392}
15393
Victor Stinnerfecc4f22019-03-19 14:20:29 +010015394
#if defined(WITH_VALGRIND) || defined(__INSURE__)
/* Undo interning for every string in the `interned` dict, then clear and
   release the dict.  Only compiled for Valgrind / Insure++ builds. */
static void
unicode_release_interned(void)
{
    if (interned == NULL || !PyDict_Check(interned)) {
        return;
    }
    PyObject *keys = PyDict_Keys(interned);
    if (keys == NULL || !PyList_Check(keys)) {
        PyErr_Clear();
        return;
    }

    /* Since unicode_release_interned() is intended to help a leak
       detector, interned unicode strings are not forcibly deallocated;
       rather, we give them their stolen references back, and then clear
       and DECREF the interned dict. */

    const Py_ssize_t count = PyList_GET_SIZE(keys);
#ifdef INTERNED_STATS
    fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
            count);

    Py_ssize_t immortal_size = 0, mortal_size = 0;
#endif
    for (Py_ssize_t idx = 0; idx < count; idx++) {
        PyObject *str = PyList_GET_ITEM(keys, idx);
        if (PyUnicode_READY(str) == -1) {
            Py_UNREACHABLE();
        }

        const int state = PyUnicode_CHECK_INTERNED(str);
        if (state == SSTATE_INTERNED_IMMORTAL) {
            Py_REFCNT(str) += 1;
#ifdef INTERNED_STATS
            immortal_size += PyUnicode_GET_LENGTH(str);
#endif
        }
        else if (state == SSTATE_INTERNED_MORTAL) {
            Py_REFCNT(str) += 2;
#ifdef INTERNED_STATS
            mortal_size += PyUnicode_GET_LENGTH(str);
#endif
        }
        else {
            /* SSTATE_NOT_INTERNED or any other value */
            Py_UNREACHABLE();
        }
        _PyUnicode_STATE(str).interned = SSTATE_NOT_INTERNED;
    }
#ifdef INTERNED_STATS
    fprintf(stderr, "total size of all interned strings: "
            "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
            "mortal/immortal\n", mortal_size, immortal_size);
#endif
    Py_DECREF(keys);
    PyDict_Clear(interned);
    Py_CLEAR(interned);
}
#endif
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015455
15456
15457/********************* Unicode Iterator **************************/
15458
15459typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015460 PyObject_HEAD
15461 Py_ssize_t it_index;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015462 PyObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015463} unicodeiterobject;
15464
15465static void
15466unicodeiter_dealloc(unicodeiterobject *it)
15467{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015468 _PyObject_GC_UNTRACK(it);
15469 Py_XDECREF(it->it_seq);
15470 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015471}
15472
15473static int
15474unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
15475{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015476 Py_VISIT(it->it_seq);
15477 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015478}
15479
15480static PyObject *
15481unicodeiter_next(unicodeiterobject *it)
15482{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015483 PyObject *seq, *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015484
Benjamin Peterson14339b62009-01-31 16:36:08 +000015485 assert(it != NULL);
15486 seq = it->it_seq;
15487 if (seq == NULL)
15488 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020015489 assert(_PyUnicode_CHECK(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015490
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015491 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
15492 int kind = PyUnicode_KIND(seq);
15493 void *data = PyUnicode_DATA(seq);
15494 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
15495 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015496 if (item != NULL)
15497 ++it->it_index;
15498 return item;
15499 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015500
Benjamin Peterson14339b62009-01-31 16:36:08 +000015501 it->it_seq = NULL;
Serhiy Storchakafbb1c5e2016-03-30 20:40:02 +030015502 Py_DECREF(seq);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015503 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015504}
15505
15506static PyObject *
Siddhesh Poyarekar55edd0c2018-04-30 00:29:33 +053015507unicodeiter_len(unicodeiterobject *it, PyObject *Py_UNUSED(ignored))
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015508{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015509 Py_ssize_t len = 0;
15510 if (it->it_seq)
Victor Stinnerc4f281e2011-10-11 22:11:42 +020015511 len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015512 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015513}
15514
15515PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
15516
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015517static PyObject *
Siddhesh Poyarekar55edd0c2018-04-30 00:29:33 +053015518unicodeiter_reduce(unicodeiterobject *it, PyObject *Py_UNUSED(ignored))
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015519{
Serhiy Storchakabb86bf42018-12-11 08:28:18 +020015520 _Py_IDENTIFIER(iter);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015521 if (it->it_seq != NULL) {
Serhiy Storchakabb86bf42018-12-11 08:28:18 +020015522 return Py_BuildValue("N(O)n", _PyEval_GetBuiltinId(&PyId_iter),
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015523 it->it_seq, it->it_index);
15524 } else {
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +020015525 PyObject *u = (PyObject *)_PyUnicode_New(0);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015526 if (u == NULL)
15527 return NULL;
Serhiy Storchakabb86bf42018-12-11 08:28:18 +020015528 return Py_BuildValue("N(N)", _PyEval_GetBuiltinId(&PyId_iter), u);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015529 }
15530}
15531
15532PyDoc_STRVAR(reduce_doc, "Return state information for pickling.");
15533
15534static PyObject *
15535unicodeiter_setstate(unicodeiterobject *it, PyObject *state)
15536{
15537 Py_ssize_t index = PyLong_AsSsize_t(state);
15538 if (index == -1 && PyErr_Occurred())
15539 return NULL;
Kristján Valur Jónsson25dded02014-03-05 13:47:57 +000015540 if (it->it_seq != NULL) {
15541 if (index < 0)
15542 index = 0;
15543 else if (index > PyUnicode_GET_LENGTH(it->it_seq))
15544 index = PyUnicode_GET_LENGTH(it->it_seq); /* iterator truncated */
15545 it->it_index = index;
15546 }
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015547 Py_RETURN_NONE;
15548}
15549
15550PyDoc_STRVAR(setstate_doc, "Set state information for unpickling.");
15551
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015552static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015553 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000015554 length_hint_doc},
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015555 {"__reduce__", (PyCFunction)unicodeiter_reduce, METH_NOARGS,
15556 reduce_doc},
15557 {"__setstate__", (PyCFunction)unicodeiter_setstate, METH_O,
15558 setstate_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000015559 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015560};
15561
15562PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015563 PyVarObject_HEAD_INIT(&PyType_Type, 0)
15564 "str_iterator", /* tp_name */
15565 sizeof(unicodeiterobject), /* tp_basicsize */
15566 0, /* tp_itemsize */
15567 /* methods */
15568 (destructor)unicodeiter_dealloc, /* tp_dealloc */
Jeroen Demeyer530f5062019-05-31 04:13:39 +020015569 0, /* tp_vectorcall_offset */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015570 0, /* tp_getattr */
15571 0, /* tp_setattr */
Jeroen Demeyer530f5062019-05-31 04:13:39 +020015572 0, /* tp_as_async */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015573 0, /* tp_repr */
15574 0, /* tp_as_number */
15575 0, /* tp_as_sequence */
15576 0, /* tp_as_mapping */
15577 0, /* tp_hash */
15578 0, /* tp_call */
15579 0, /* tp_str */
15580 PyObject_GenericGetAttr, /* tp_getattro */
15581 0, /* tp_setattro */
15582 0, /* tp_as_buffer */
15583 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
15584 0, /* tp_doc */
15585 (traverseproc)unicodeiter_traverse, /* tp_traverse */
15586 0, /* tp_clear */
15587 0, /* tp_richcompare */
15588 0, /* tp_weaklistoffset */
15589 PyObject_SelfIter, /* tp_iter */
15590 (iternextfunc)unicodeiter_next, /* tp_iternext */
15591 unicodeiter_methods, /* tp_methods */
15592 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015593};
15594
15595static PyObject *
15596unicode_iter(PyObject *seq)
15597{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015598 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015599
Benjamin Peterson14339b62009-01-31 16:36:08 +000015600 if (!PyUnicode_Check(seq)) {
15601 PyErr_BadInternalCall();
15602 return NULL;
15603 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015604 if (PyUnicode_READY(seq) == -1)
15605 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015606 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
15607 if (it == NULL)
15608 return NULL;
15609 it->it_index = 0;
15610 Py_INCREF(seq);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015611 it->it_seq = seq;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015612 _PyObject_GC_TRACK(it);
15613 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015614}
15615
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015616
15617size_t
15618Py_UNICODE_strlen(const Py_UNICODE *u)
15619{
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +020015620 return wcslen(u);
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015621}
15622
15623Py_UNICODE*
15624Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
15625{
15626 Py_UNICODE *u = s1;
15627 while ((*u++ = *s2++));
15628 return s1;
15629}
15630
15631Py_UNICODE*
15632Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
15633{
15634 Py_UNICODE *u = s1;
15635 while ((*u++ = *s2++))
15636 if (n-- == 0)
15637 break;
15638 return s1;
15639}
15640
15641Py_UNICODE*
15642Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
15643{
15644 Py_UNICODE *u1 = s1;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +020015645 u1 += wcslen(u1);
15646 while ((*u1++ = *s2++));
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015647 return s1;
15648}
15649
15650int
15651Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
15652{
15653 while (*s1 && *s2 && *s1 == *s2)
15654 s1++, s2++;
15655 if (*s1 && *s2)
15656 return (*s1 < *s2) ? -1 : +1;
15657 if (*s1)
15658 return 1;
15659 if (*s2)
15660 return -1;
15661 return 0;
15662}
15663
15664int
15665Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
15666{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020015667 Py_UNICODE u1, u2;
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015668 for (; n != 0; n--) {
15669 u1 = *s1;
15670 u2 = *s2;
15671 if (u1 != u2)
15672 return (u1 < u2) ? -1 : +1;
15673 if (u1 == '\0')
15674 return 0;
15675 s1++;
15676 s2++;
15677 }
15678 return 0;
15679}
15680
15681Py_UNICODE*
15682Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
15683{
15684 const Py_UNICODE *p;
15685 for (p = s; *p; p++)
15686 if (*p == c)
15687 return (Py_UNICODE*)p;
15688 return NULL;
15689}
15690
15691Py_UNICODE*
15692Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
15693{
15694 const Py_UNICODE *p;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +020015695 p = s + wcslen(s);
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015696 while (p != s) {
15697 p--;
15698 if (*p == c)
15699 return (Py_UNICODE*)p;
15700 }
15701 return NULL;
15702}
Victor Stinner331ea922010-08-10 16:37:20 +000015703
Victor Stinner71133ff2010-09-01 23:43:53 +000015704Py_UNICODE*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015705PyUnicode_AsUnicodeCopy(PyObject *unicode)
Victor Stinner71133ff2010-09-01 23:43:53 +000015706{
Victor Stinner577db2c2011-10-11 22:12:48 +020015707 Py_UNICODE *u, *copy;
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015708 Py_ssize_t len, size;
Victor Stinner71133ff2010-09-01 23:43:53 +000015709
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015710 if (!PyUnicode_Check(unicode)) {
15711 PyErr_BadArgument();
15712 return NULL;
15713 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015714 u = PyUnicode_AsUnicodeAndSize(unicode, &len);
Victor Stinner577db2c2011-10-11 22:12:48 +020015715 if (u == NULL)
15716 return NULL;
Victor Stinner71133ff2010-09-01 23:43:53 +000015717 /* Ensure we won't overflow the size. */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -070015718 if (len > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
Victor Stinner71133ff2010-09-01 23:43:53 +000015719 PyErr_NoMemory();
15720 return NULL;
15721 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015722 size = len + 1; /* copy the null character */
Victor Stinner71133ff2010-09-01 23:43:53 +000015723 size *= sizeof(Py_UNICODE);
15724 copy = PyMem_Malloc(size);
15725 if (copy == NULL) {
15726 PyErr_NoMemory();
15727 return NULL;
15728 }
Victor Stinner577db2c2011-10-11 22:12:48 +020015729 memcpy(copy, u, size);
Victor Stinner71133ff2010-09-01 23:43:53 +000015730 return copy;
15731}
Martin v. Löwis5b222132007-06-10 09:51:05 +000015732
Victor Stinnerfecc4f22019-03-19 14:20:29 +010015733
Victor Stinner709d23d2019-05-02 14:56:30 -040015734static int
15735encode_wstr_utf8(wchar_t *wstr, char **str, const char *name)
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015736{
Victor Stinner709d23d2019-05-02 14:56:30 -040015737 int res;
15738 res = _Py_EncodeUTF8Ex(wstr, str, NULL, NULL, 1, _Py_ERROR_STRICT);
15739 if (res == -2) {
15740 PyErr_Format(PyExc_RuntimeWarning, "cannot decode %s", name);
15741 return -1;
15742 }
15743 if (res < 0) {
15744 PyErr_NoMemory();
15745 return -1;
15746 }
15747 return 0;
15748}
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015749
Victor Stinner709d23d2019-05-02 14:56:30 -040015750
15751static int
15752config_get_codec_name(wchar_t **config_encoding)
15753{
15754 char *encoding;
15755 if (encode_wstr_utf8(*config_encoding, &encoding, "stdio_encoding") < 0) {
15756 return -1;
15757 }
15758
15759 PyObject *name_obj = NULL;
15760 PyObject *codec = _PyCodec_Lookup(encoding);
15761 PyMem_RawFree(encoding);
15762
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015763 if (!codec)
15764 goto error;
15765
15766 name_obj = PyObject_GetAttrString(codec, "name");
15767 Py_CLEAR(codec);
15768 if (!name_obj) {
15769 goto error;
15770 }
15771
Victor Stinner709d23d2019-05-02 14:56:30 -040015772 wchar_t *wname = PyUnicode_AsWideCharString(name_obj, NULL);
15773 Py_DECREF(name_obj);
15774 if (wname == NULL) {
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015775 goto error;
15776 }
15777
Victor Stinner709d23d2019-05-02 14:56:30 -040015778 wchar_t *raw_wname = _PyMem_RawWcsdup(wname);
15779 if (raw_wname == NULL) {
15780 PyMem_Free(wname);
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015781 PyErr_NoMemory();
Victor Stinner709d23d2019-05-02 14:56:30 -040015782 goto error;
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015783 }
Victor Stinner709d23d2019-05-02 14:56:30 -040015784
15785 PyMem_RawFree(*config_encoding);
15786 *config_encoding = raw_wname;
15787
15788 PyMem_Free(wname);
15789 return 0;
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015790
15791error:
15792 Py_XDECREF(codec);
15793 Py_XDECREF(name_obj);
Victor Stinner709d23d2019-05-02 14:56:30 -040015794 return -1;
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015795}
15796
15797
Victor Stinner331a6a52019-05-27 16:39:22 +020015798static PyStatus
Victor Stinnerfcdb0272019-09-23 14:45:47 +020015799init_stdio_encoding(PyThreadState *tstate)
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015800{
Victor Stinner709d23d2019-05-02 14:56:30 -040015801 /* Update the stdio encoding to the normalized Python codec name. */
Victor Stinnerfcdb0272019-09-23 14:45:47 +020015802 PyConfig *config = &tstate->interp->config;
Victor Stinner709d23d2019-05-02 14:56:30 -040015803 if (config_get_codec_name(&config->stdio_encoding) < 0) {
Victor Stinner331a6a52019-05-27 16:39:22 +020015804 return _PyStatus_ERR("failed to get the Python codec name "
Victor Stinnerfcdb0272019-09-23 14:45:47 +020015805 "of the stdio encoding");
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015806 }
Victor Stinner331a6a52019-05-27 16:39:22 +020015807 return _PyStatus_OK();
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015808}
15809
15810
Victor Stinner709d23d2019-05-02 14:56:30 -040015811static int
15812init_fs_codec(PyInterpreterState *interp)
15813{
Victor Stinner331a6a52019-05-27 16:39:22 +020015814 PyConfig *config = &interp->config;
Victor Stinner709d23d2019-05-02 14:56:30 -040015815
15816 _Py_error_handler error_handler;
15817 error_handler = get_error_handler_wide(config->filesystem_errors);
15818 if (error_handler == _Py_ERROR_UNKNOWN) {
15819 PyErr_SetString(PyExc_RuntimeError, "unknow filesystem error handler");
15820 return -1;
15821 }
15822
15823 char *encoding, *errors;
15824 if (encode_wstr_utf8(config->filesystem_encoding,
15825 &encoding,
15826 "filesystem_encoding") < 0) {
15827 return -1;
15828 }
15829
15830 if (encode_wstr_utf8(config->filesystem_errors,
15831 &errors,
15832 "filesystem_errors") < 0) {
15833 PyMem_RawFree(encoding);
15834 return -1;
15835 }
15836
15837 PyMem_RawFree(interp->fs_codec.encoding);
15838 interp->fs_codec.encoding = encoding;
Victor Stinnerbf305cc2020-02-05 17:39:57 +010015839 /* encoding has been normalized by init_fs_encoding() */
15840 interp->fs_codec.utf8 = (strcmp(encoding, "utf-8") == 0);
Victor Stinner709d23d2019-05-02 14:56:30 -040015841 PyMem_RawFree(interp->fs_codec.errors);
15842 interp->fs_codec.errors = errors;
15843 interp->fs_codec.error_handler = error_handler;
15844
Victor Stinnerbf305cc2020-02-05 17:39:57 +010015845#ifdef _Py_FORCE_UTF8_FS_ENCODING
15846 assert(interp->fs_codec.utf8 == 1);
15847#endif
15848
Victor Stinner709d23d2019-05-02 14:56:30 -040015849 /* At this point, PyUnicode_EncodeFSDefault() and
15850 PyUnicode_DecodeFSDefault() can now use the Python codec rather than
15851 the C implementation of the filesystem encoding. */
15852
15853 /* Set Py_FileSystemDefaultEncoding and Py_FileSystemDefaultEncodeErrors
15854 global configuration variables. */
15855 if (_Py_SetFileSystemEncoding(interp->fs_codec.encoding,
15856 interp->fs_codec.errors) < 0) {
15857 PyErr_NoMemory();
15858 return -1;
15859 }
15860 return 0;
15861}
15862
15863
Victor Stinner331a6a52019-05-27 16:39:22 +020015864static PyStatus
Victor Stinnerfcdb0272019-09-23 14:45:47 +020015865init_fs_encoding(PyThreadState *tstate)
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015866{
Victor Stinnerfcdb0272019-09-23 14:45:47 +020015867 PyInterpreterState *interp = tstate->interp;
15868
Victor Stinner709d23d2019-05-02 14:56:30 -040015869 /* Update the filesystem encoding to the normalized Python codec name.
15870 For example, replace "ANSI_X3.4-1968" (locale encoding) with "ascii"
15871 (Python codec name). */
Victor Stinner331a6a52019-05-27 16:39:22 +020015872 PyConfig *config = &interp->config;
Victor Stinner709d23d2019-05-02 14:56:30 -040015873 if (config_get_codec_name(&config->filesystem_encoding) < 0) {
Victor Stinnerfcdb0272019-09-23 14:45:47 +020015874 _Py_DumpPathConfig(tstate);
Victor Stinner331a6a52019-05-27 16:39:22 +020015875 return _PyStatus_ERR("failed to get the Python codec "
Victor Stinnerfcdb0272019-09-23 14:45:47 +020015876 "of the filesystem encoding");
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015877 }
15878
Victor Stinner709d23d2019-05-02 14:56:30 -040015879 if (init_fs_codec(interp) < 0) {
Victor Stinner331a6a52019-05-27 16:39:22 +020015880 return _PyStatus_ERR("cannot initialize filesystem codec");
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015881 }
Victor Stinner331a6a52019-05-27 16:39:22 +020015882 return _PyStatus_OK();
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015883}
15884
15885
Victor Stinner331a6a52019-05-27 16:39:22 +020015886PyStatus
Victor Stinnerb45d2592019-06-20 00:05:23 +020015887_PyUnicode_InitEncodings(PyThreadState *tstate)
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015888{
Victor Stinnerfcdb0272019-09-23 14:45:47 +020015889 PyStatus status = init_fs_encoding(tstate);
Victor Stinner331a6a52019-05-27 16:39:22 +020015890 if (_PyStatus_EXCEPTION(status)) {
15891 return status;
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015892 }
15893
Victor Stinnerfcdb0272019-09-23 14:45:47 +020015894 return init_stdio_encoding(tstate);
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015895}
15896
15897
Victor Stinnerbf305cc2020-02-05 17:39:57 +010015898static void
15899_PyUnicode_FiniEncodings(PyThreadState *tstate)
15900{
15901 PyInterpreterState *interp = tstate->interp;
15902 PyMem_RawFree(interp->fs_codec.encoding);
15903 interp->fs_codec.encoding = NULL;
15904 interp->fs_codec.utf8 = 0;
15905 PyMem_RawFree(interp->fs_codec.errors);
15906 interp->fs_codec.errors = NULL;
15907 interp->fs_codec.error_handler = _Py_ERROR_UNKNOWN;
15908}
15909
15910
#ifdef MS_WINDOWS
/* Switch the filesystem encoding of the interpreter returned by
   _PyInterpreterState_GET_UNSAFE() to "mbcs" with the "replace" error
   handler (PEP 529), then rebuild the filesystem codec state from the
   updated configuration.

   Return -1 with a MemoryError set if the new configuration strings cannot
   be allocated; in that case the configuration is left untouched.  Otherwise
   return the result of init_fs_codec(). */
int
_PyUnicode_EnableLegacyWindowsFSEncoding(void)
{
    PyInterpreterState *interp = _PyInterpreterState_GET_UNSAFE();

    /* Allocate both replacement strings up front so that the configuration
       is only modified once nothing can fail anymore. */
    wchar_t *new_encoding = _PyMem_RawWcsdup(L"mbcs");
    wchar_t *new_errors = _PyMem_RawWcsdup(L"replace");

    if (new_encoding != NULL && new_errors != NULL) {
        PyConfig *config = &interp->config;

        /* The configuration takes ownership of the new strings. */
        PyMem_RawFree(config->filesystem_encoding);
        config->filesystem_encoding = new_encoding;
        PyMem_RawFree(config->filesystem_errors);
        config->filesystem_errors = new_errors;

        return init_fs_codec(interp);
    }

    /* At least one allocation failed: release whichever one succeeded. */
    PyMem_RawFree(new_encoding);
    PyMem_RawFree(new_errors);
    PyErr_NoMemory();
    return -1;
}
#endif
15936
15937
Victor Stinnerfecc4f22019-03-19 14:20:29 +010015938void
Victor Stinner3d483342019-11-22 12:27:50 +010015939_PyUnicode_Fini(PyThreadState *tstate)
Victor Stinnerfecc4f22019-03-19 14:20:29 +010015940{
Victor Stinner3d483342019-11-22 12:27:50 +010015941 if (_Py_IsMainInterpreter(tstate)) {
Victor Stinnerfecc4f22019-03-19 14:20:29 +010015942#if defined(WITH_VALGRIND) || defined(__INSURE__)
Victor Stinner3d483342019-11-22 12:27:50 +010015943 /* Insure++ is a memory analysis tool that aids in discovering
15944 * memory leaks and other memory problems. On Python exit, the
15945 * interned string dictionaries are flagged as being in use at exit
15946 * (which it is). Under normal circumstances, this is fine because
15947 * the memory will be automatically reclaimed by the system. Under
15948 * memory debugging, it's a huge source of useless noise, so we
15949 * trade off slower shutdown for less distraction in the memory
15950 * reports. -baw
15951 */
15952 unicode_release_interned();
Victor Stinnerfecc4f22019-03-19 14:20:29 +010015953#endif /* __INSURE__ */
15954
Victor Stinner3d483342019-11-22 12:27:50 +010015955 Py_CLEAR(unicode_empty);
Victor Stinnerfecc4f22019-03-19 14:20:29 +010015956
Victor Stinner3d483342019-11-22 12:27:50 +010015957 for (Py_ssize_t i = 0; i < 256; i++) {
15958 Py_CLEAR(unicode_latin1[i]);
15959 }
15960 _PyUnicode_ClearStaticStrings();
Victor Stinnerfecc4f22019-03-19 14:20:29 +010015961 }
Victor Stinner709d23d2019-05-02 14:56:30 -040015962
Victor Stinnerbf305cc2020-02-05 17:39:57 +010015963 _PyUnicode_FiniEncodings(tstate);
Victor Stinnerfecc4f22019-03-19 14:20:29 +010015964}
15965
15966
Georg Brandl66c221e2010-10-14 07:04:07 +000015967/* A _string module, to export formatter_parser and formatter_field_name_split
15968 to the string.Formatter class implemented in Python. */
15969
15970static PyMethodDef _string_methods[] = {
15971 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
15972 METH_O, PyDoc_STR("split the argument as a field name")},
15973 {"formatter_parser", (PyCFunction) formatter_parser,
15974 METH_O, PyDoc_STR("parse the argument as a format string")},
15975 {NULL, NULL}
15976};
15977
15978static struct PyModuleDef _string_module = {
15979 PyModuleDef_HEAD_INIT,
15980 "_string",
15981 PyDoc_STR("string helper module"),
15982 0,
15983 _string_methods,
15984 NULL,
15985 NULL,
15986 NULL,
15987 NULL
15988};
15989
15990PyMODINIT_FUNC
15991PyInit__string(void)
15992{
15993 return PyModule_Create(&_string_module);
15994}
15995
15996
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000015997#ifdef __cplusplus
15998}
15999#endif