/*

Unicode implementation based on original code by Fredrik Lundh,
modified by Marc-Andre Lemburg <mal@lemburg.com>.

Major speed upgrades to the method implementations at the Reykjavik
NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.

Copyright (c) Corporation for National Research Initiatives.

--------------------------------------------------------------------
The original string type implementation is:

  Copyright (c) 1999 by Secret Labs AB
  Copyright (c) 1999 by Fredrik Lundh

By obtaining, using, and/or copying this software and/or its
associated documentation, you agree that you have read, understood,
and will comply with the following terms and conditions:

Permission to use, copy, modify, and distribute this software and its
associated documentation for any purpose and without fee is hereby
granted, provided that the above copyright notice appears in all
copies, and that both that copyright notice and this permission notice
appear in supporting documentation, and that the name of Secret Labs
AB or the author not be used in advertising or publicity pertaining to
distribution of the software without specific, written prior
permission.

SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
--------------------------------------------------------------------

*/

Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Victor Stinner9fc57a32018-11-07 00:44:03 +010043#include "pycore_fileutils.h"
Victor Stinner61691d82019-10-02 23:51:20 +020044#include "pycore_initconfig.h"
Victor Stinnerbcda8f12018-11-21 22:27:47 +010045#include "pycore_object.h"
Victor Stinner61691d82019-10-02 23:51:20 +020046#include "pycore_pathconfig.h"
Victor Stinner43fc3bb2019-05-02 11:54:20 -040047#include "pycore_pylifecycle.h"
Victor Stinner621cebe2018-11-12 16:53:38 +010048#include "pycore_pystate.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000049#include "ucnhash.h"
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050050#include "bytes_methods.h"
Raymond Hettingerac2ef652015-07-04 16:04:44 -070051#include "stringlib/eq.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000052
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000053#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000054#include <windows.h>
55#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000056
/* Uncomment to display statistics on interned strings at exit when
   using Valgrind or Insure++. */
/* #define INTERNED_STATS 1 */


/* Argument Clinic input for this file.

   The first section declares the "str" class backed by PyUnicode_Type.
   The second section defines a converter for Py_UCS4 parameters that uses
   convert_uc(); a default value is rendered as ascii(default) when that is
   a short quoted literal and as hex(ord(default)) otherwise.

   The text between the markers is processed by the generator and is
   whitespace sensitive (it is Python code). */

/*[clinic input]
class str "PyObject *" "&PyUnicode_Type"
[clinic start generated code]*/
/*[clinic end generated code: output=da39a3ee5e6b4b0d input=4884c934de622cf6]*/

/*[python input]
class Py_UCS4_converter(CConverter):
    type = 'Py_UCS4'
    converter = 'convert_uc'

    def converter_init(self):
        if self.default is not unspecified:
            self.c_default = ascii(self.default)
            if len(self.c_default) > 4 or self.c_default[0] != "'":
                self.c_default = hex(ord(self.default))

[python start generated code]*/
/*[python end generated code: output=da39a3ee5e6b4b0d input=88f5dd06cd8e7a61]*/

/* --- Globals ------------------------------------------------------------

NOTE: In the interpreter's initialization phase, some globals are currently
      initialized dynamically as needed. In the process Unicode objects may
      be created before the Unicode type is ready.

*/


/* Give the definitions that follow C linkage when built as C++.
   NOTE(review): the matching closing brace is not in this part of the
   file; it is expected further down. */
#ifdef __cplusplus
extern "C" {
#endif

/* Maximum code point of Unicode 6.0: 0x10ffff (1,114,111) */
#define MAX_UNICODE 0x10ffff

/* Type check used inside assert() by the accessor macros of this file.
   Debug builds run the full consistency checker (without the O(n) content
   scan); other builds only check the object type. */
#ifdef Py_DEBUG
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
#else
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#endif

/* Direct accessors for the fields of the string structs.

   The macros that only cast and dereference (_PyUnicode_UTF8,
   _PyUnicode_UTF8_LENGTH, _PyUnicode_WSTR, _PyUnicode_WSTR_LENGTH,
   _PyUnicode_LENGTH, _PyUnicode_STATE, _PyUnicode_HASH,
   _PyUnicode_DATA_ANY) expand to the struct member itself and perform no
   checks.

   PyUnicode_UTF8() and PyUnicode_UTF8_LENGTH() assert that the object is a
   ready string; for a compact ASCII string they use the character data
   stored right after the PyASCIIObject header (and its length) instead of
   the separate utf8 / utf8_length fields.

   _PyUnicode_KIND() and _PyUnicode_GET_LENGTH() are asserting read
   accessors for state.kind and length. */
#define _PyUnicode_UTF8(op)                             \
    (((PyCompactUnicodeObject*)(op))->utf8)
#define PyUnicode_UTF8(op)                              \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((char*)((PyASCIIObject*)(op) + 1)) :          \
         _PyUnicode_UTF8(op))
#define _PyUnicode_UTF8_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->utf8_length)
#define PyUnicode_UTF8_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((PyASCIIObject*)(op))->length :               \
         _PyUnicode_UTF8_LENGTH(op))
#define _PyUnicode_WSTR(op)                             \
    (((PyASCIIObject*)(op))->wstr)
#define _PyUnicode_WSTR_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->wstr_length)
#define _PyUnicode_LENGTH(op)                           \
    (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op)                            \
    (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op)                             \
    (((PyASCIIObject *)(op))->hash)
#define _PyUnicode_KIND(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->state.kind)
#define _PyUnicode_GET_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->length)
#define _PyUnicode_DATA_ANY(op)                         \
    (((PyUnicodeObject*)(op))->data.any)

/* File-local replacement for the public PyUnicode_READY() macro: asserts
   that `op` is a string, evaluates to 0 when it is already ready and
   otherwise to the result of _PyUnicode_Ready(op). */
#undef PyUnicode_READY
#define PyUnicode_READY(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     (PyUnicode_IS_READY(op) ?                          \
      0 :                                               \
      _PyUnicode_Ready(op)))

/* True if the utf8 pointer of `op` is the same pointer as its character
   data.  Must not be used on compact ASCII strings (asserted). */
#define _PyUnicode_SHARE_UTF8(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)),           \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))

/* True if the wstr pointer of `op` is the same pointer as its character
   data.  The comparison uses the macro argument `op`; it previously read a
   variable named `unicode` from the calling scope, which only worked when
   the caller happened to have such a variable referring to the same
   object. */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))

/* true if the Unicode object has an allocated UTF-8 memory block
   (not shared with other data): the string is not compact ASCII, its utf8
   pointer is set, and that pointer differs from the character data. */
#define _PyUnicode_HAS_UTF8_MEMORY(op)                  \
    ((!PyUnicode_IS_COMPACT_ASCII(op)                   \
      && _PyUnicode_UTF8(op)                            \
      && _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))

/* true if the Unicode object has an allocated wstr memory block
   (not shared with other data): the wstr pointer is set and either the
   string is not ready yet or wstr differs from the character data. */
#define _PyUnicode_HAS_WSTR_MEMORY(op)                  \
    ((_PyUnicode_WSTR(op) &&                            \
      (!PyUnicode_IS_READY(op) ||                       \
       _PyUnicode_WSTR(op) != PyUnicode_DATA(op))))

/* Generic helper macro to convert characters of different types.
   from_type and to_type have to be valid type names, begin and end
   are pointers to the source characters which should be of type
   "from_type *".  to is a pointer of type "to_type *" and points to the
   buffer where the result characters are written to.

   Each source character is converted with a plain cast to to_type.  The
   main loop handles four characters per iteration; the second loop copies
   the remaining 0-3 characters.  The source pointers are cast to
   "const from_type *" so that passing const data does not discard the
   qualifier. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                \
        to_type *_to = (to_type *)(to);                 \
        const from_type *_iter = (const from_type *)(begin); \
        const from_type *_end = (const from_type *)(end); \
        Py_ssize_t n = (_end) - (_iter);                \
        const from_type *_unrolled_end =                \
            _iter + _Py_SIZE_ROUND_DOWN(n, 4);          \
        while (_iter < (_unrolled_end)) {               \
            _to[0] = (to_type) _iter[0];                \
            _to[1] = (to_type) _iter[1];                \
            _to[2] = (to_type) _iter[2];                \
            _to[3] = (to_type) _iter[3];                \
            _iter += 4; _to += 4;                       \
        }                                               \
        while (_iter < (_end))                          \
            *_to++ = (to_type) *_iter++;                \
    } while (0)

/* Divisor that controls how much extra space is reserved when a buffer is
   over-allocated: a factor of 2 corresponds to 50% extra, 4 to 25%. */
#ifdef MS_WINDOWS
   /* On Windows, overallocating by 50% is the best factor */
#  define OVERALLOCATE_FACTOR 2
#else
   /* On Linux, overallocating by 25% is the best factor */
#  define OVERALLOCATE_FACTOR 4
#endif

Walter Dörwald16807132007-05-25 13:52:07 +0000199/* This dictionary holds all interned unicode strings. Note that references
200 to strings in this dictionary are *not* counted in the string's ob_refcnt.
201 When the interned string reaches a refcnt of 0 the string deallocation
202 function will delete the reference from this dictionary.
203
204 Another way to look at this is that to say that the actual reference
Guido van Rossum98297ee2007-11-06 21:34:58 +0000205 count of a string is: s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000206*/
Serhiy Storchaka05997252013-01-26 12:14:02 +0200207static PyObject *interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +0000208
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000209/* The empty Unicode object is shared to improve performance. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200210static PyObject *unicode_empty = NULL;
Serhiy Storchaka05997252013-01-26 12:14:02 +0200211
Serhiy Storchaka678db842013-01-26 12:16:36 +0200212#define _Py_INCREF_UNICODE_EMPTY() \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200213 do { \
214 if (unicode_empty != NULL) \
215 Py_INCREF(unicode_empty); \
216 else { \
Serhiy Storchaka678db842013-01-26 12:16:36 +0200217 unicode_empty = PyUnicode_New(0, 0); \
218 if (unicode_empty != NULL) { \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200219 Py_INCREF(unicode_empty); \
Serhiy Storchaka678db842013-01-26 12:16:36 +0200220 assert(_PyUnicode_CheckConsistency(unicode_empty, 1)); \
221 } \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200222 } \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200223 } while (0)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000224
Serhiy Storchaka678db842013-01-26 12:16:36 +0200225#define _Py_RETURN_UNICODE_EMPTY() \
226 do { \
227 _Py_INCREF_UNICODE_EMPTY(); \
228 return unicode_empty; \
229 } while (0)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000230
Victor Stinner59423e32018-11-26 13:40:01 +0100231static inline void
232unicode_fill(enum PyUnicode_Kind kind, void *data, Py_UCS4 value,
233 Py_ssize_t start, Py_ssize_t length)
234{
235 assert(0 <= start);
236 assert(kind != PyUnicode_WCHAR_KIND);
237 switch (kind) {
238 case PyUnicode_1BYTE_KIND: {
Victor Stinner163403a2018-11-27 12:41:17 +0100239 assert(value <= 0xff);
Victor Stinner59423e32018-11-26 13:40:01 +0100240 Py_UCS1 ch = (unsigned char)value;
241 Py_UCS1 *to = (Py_UCS1 *)data + start;
242 memset(to, ch, length);
243 break;
244 }
245 case PyUnicode_2BYTE_KIND: {
Victor Stinner163403a2018-11-27 12:41:17 +0100246 assert(value <= 0xffff);
Victor Stinner59423e32018-11-26 13:40:01 +0100247 Py_UCS2 ch = (Py_UCS2)value;
248 Py_UCS2 *to = (Py_UCS2 *)data + start;
249 const Py_UCS2 *end = to + length;
250 for (; to < end; ++to) *to = ch;
251 break;
252 }
253 case PyUnicode_4BYTE_KIND: {
Victor Stinner163403a2018-11-27 12:41:17 +0100254 assert(value <= MAX_UNICODE);
Victor Stinner59423e32018-11-26 13:40:01 +0100255 Py_UCS4 ch = value;
256 Py_UCS4 * to = (Py_UCS4 *)data + start;
257 const Py_UCS4 *end = to + length;
258 for (; to < end; ++to) *to = ch;
259 break;
260 }
261 default: Py_UNREACHABLE();
262 }
263}
264
265
Victor Stinner8a1a6cf2013-04-14 02:35:33 +0200266/* Forward declaration */
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700267static inline int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +0200268_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch);
Inada Naoki770847a2019-06-24 12:30:24 +0900269static inline void
270_PyUnicodeWriter_InitWithBuffer(_PyUnicodeWriter *writer, PyObject *buffer);
Victor Stinner709d23d2019-05-02 14:56:30 -0400271static PyObject *
272unicode_encode_utf8(PyObject *unicode, _Py_error_handler error_handler,
273 const char *errors);
274static PyObject *
275unicode_decode_utf8(const char *s, Py_ssize_t size,
276 _Py_error_handler error_handler, const char *errors,
277 Py_ssize_t *consumed);
Victor Stinner8a1a6cf2013-04-14 02:35:33 +0200278
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200279/* List of static strings. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200280static _Py_Identifier *static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200281
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000282/* Single character Unicode strings in the Latin-1 range are being
283 shared as well. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200284static PyObject *unicode_latin1[256] = {NULL};
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000285
/* Fast detection of the most frequent whitespace characters.
   Lookup table with one entry per 7-bit code (128 entries, eight per
   row): an entry is 1 if the character is whitespace and 0 otherwise. */
const unsigned char _Py_ascii_whitespace[] = {
    0, 0, 0, 0, 0, 0, 0, 0,
/*     case 0x0009: * CHARACTER TABULATION */
/*     case 0x000A: * LINE FEED */
/*     case 0x000B: * LINE TABULATION */
/*     case 0x000C: * FORM FEED */
/*     case 0x000D: * CARRIAGE RETURN */
    0, 1, 1, 1, 1, 1, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
/*     case 0x001C: * FILE SEPARATOR */
/*     case 0x001D: * GROUP SEPARATOR */
/*     case 0x001E: * RECORD SEPARATOR */
/*     case 0x001F: * UNIT SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 1,
/*     case 0x0020: * SPACE */
    1, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};

Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200317/* forward */
Victor Stinnerfe226c02011-10-03 03:52:20 +0200318static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200319static PyObject* get_latin1_char(unsigned char ch);
Victor Stinner488fa492011-12-12 00:01:39 +0100320static int unicode_modifiable(PyObject *unicode);
321
Victor Stinnerfe226c02011-10-03 03:52:20 +0200322
Alexander Belopolsky40018472011-02-26 01:02:56 +0000323static PyObject *
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100324_PyUnicode_FromUCS1(const Py_UCS1 *s, Py_ssize_t size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200325static PyObject *
326_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
327static PyObject *
328_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
329
330static PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +0000331unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000332 PyObject **errorHandler,const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +0100333 PyObject *unicode, PyObject **exceptionObject,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000334 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
335
Alexander Belopolsky40018472011-02-26 01:02:56 +0000336static void
337raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300338 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +0100339 PyObject *unicode,
340 Py_ssize_t startpos, Py_ssize_t endpos,
341 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000342
/* Same for linebreaks: lookup table with one entry per 7-bit code
   (128 entries, eight per row); an entry is 1 if the character is a line
   break and 0 otherwise. */
static const unsigned char ascii_linebreak[] = {
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x000A, * LINE FEED */
/*         0x000B, * LINE TABULATION */
/*         0x000C, * FORM FEED */
/*         0x000D, * CARRIAGE RETURN */
    0, 0, 1, 1, 1, 1, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x001C, * FILE SEPARATOR */
/*         0x001D, * GROUP SEPARATOR */
/*         0x001E, * RECORD SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};

INADA Naoki3ae20562017-01-16 20:41:20 +0900371static int convert_uc(PyObject *obj, void *addr);
372
Serhiy Storchaka1009bf12015-04-03 23:53:51 +0300373#include "clinic/unicodeobject.c.h"
374
Victor Stinner3d4226a2018-08-29 22:21:32 +0200375_Py_error_handler
376_Py_GetErrorHandler(const char *errors)
Victor Stinner50149202015-09-22 00:26:54 +0200377{
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200378 if (errors == NULL || strcmp(errors, "strict") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200379 return _Py_ERROR_STRICT;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200380 }
381 if (strcmp(errors, "surrogateescape") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200382 return _Py_ERROR_SURROGATEESCAPE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200383 }
384 if (strcmp(errors, "replace") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200385 return _Py_ERROR_REPLACE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200386 }
387 if (strcmp(errors, "ignore") == 0) {
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200388 return _Py_ERROR_IGNORE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200389 }
390 if (strcmp(errors, "backslashreplace") == 0) {
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200391 return _Py_ERROR_BACKSLASHREPLACE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200392 }
393 if (strcmp(errors, "surrogatepass") == 0) {
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200394 return _Py_ERROR_SURROGATEPASS;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200395 }
396 if (strcmp(errors, "xmlcharrefreplace") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200397 return _Py_ERROR_XMLCHARREFREPLACE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200398 }
Victor Stinner50149202015-09-22 00:26:54 +0200399 return _Py_ERROR_OTHER;
400}
401
Victor Stinner709d23d2019-05-02 14:56:30 -0400402
403static _Py_error_handler
404get_error_handler_wide(const wchar_t *errors)
405{
406 if (errors == NULL || wcscmp(errors, L"strict") == 0) {
407 return _Py_ERROR_STRICT;
408 }
409 if (wcscmp(errors, L"surrogateescape") == 0) {
410 return _Py_ERROR_SURROGATEESCAPE;
411 }
412 if (wcscmp(errors, L"replace") == 0) {
413 return _Py_ERROR_REPLACE;
414 }
415 if (wcscmp(errors, L"ignore") == 0) {
416 return _Py_ERROR_IGNORE;
417 }
418 if (wcscmp(errors, L"backslashreplace") == 0) {
419 return _Py_ERROR_BACKSLASHREPLACE;
420 }
421 if (wcscmp(errors, L"surrogatepass") == 0) {
422 return _Py_ERROR_SURROGATEPASS;
423 }
424 if (wcscmp(errors, L"xmlcharrefreplace") == 0) {
425 return _Py_ERROR_XMLCHARREFREPLACE;
426 }
427 return _Py_ERROR_OTHER;
428}
429
430
Victor Stinner22eb6892019-06-26 00:51:05 +0200431static inline int
432unicode_check_encoding_errors(const char *encoding, const char *errors)
433{
434 if (encoding == NULL && errors == NULL) {
435 return 0;
436 }
437
438 PyInterpreterState *interp = _PyInterpreterState_GET_UNSAFE();
439#ifndef Py_DEBUG
440 /* In release mode, only check in development mode (-X dev) */
441 if (!interp->config.dev_mode) {
442 return 0;
443 }
444#else
445 /* Always check in debug mode */
446#endif
447
448 /* Avoid calling _PyCodec_Lookup() and PyCodec_LookupError() before the
449 codec registry is ready: before_PyUnicode_InitEncodings() is called. */
450 if (!interp->fs_codec.encoding) {
451 return 0;
452 }
453
454 if (encoding != NULL) {
455 PyObject *handler = _PyCodec_Lookup(encoding);
456 if (handler == NULL) {
457 return -1;
458 }
459 Py_DECREF(handler);
460 }
461
462 if (errors != NULL) {
463 PyObject *handler = PyCodec_LookupError(errors);
464 if (handler == NULL) {
465 return -1;
466 }
467 Py_DECREF(handler);
468 }
469 return 0;
470}
471
472
Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300473/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
474 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000475Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000476PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000477{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000478#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000479 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000480#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000481 /* This is actually an illegal character, so it should
482 not be passed to unichr. */
483 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000484#endif
485}
486
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +0200487int
Victor Stinner7931d9a2011-11-04 00:22:48 +0100488_PyUnicode_CheckConsistency(PyObject *op, int check_content)
Victor Stinner910337b2011-10-03 03:20:16 +0200489{
Victor Stinner68762572019-10-07 18:42:01 +0200490#define CHECK(expr) \
491 do { if (!(expr)) { _PyObject_ASSERT_FAILED_MSG(op, Py_STRINGIFY(expr)); } } while (0)
492
Victor Stinner910337b2011-10-03 03:20:16 +0200493 PyASCIIObject *ascii;
494 unsigned int kind;
495
Victor Stinner68762572019-10-07 18:42:01 +0200496 assert(op != NULL);
497 CHECK(PyUnicode_Check(op));
Victor Stinner910337b2011-10-03 03:20:16 +0200498
499 ascii = (PyASCIIObject *)op;
500 kind = ascii->state.kind;
501
Victor Stinnera3b334d2011-10-03 13:53:37 +0200502 if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
Victor Stinner68762572019-10-07 18:42:01 +0200503 CHECK(kind == PyUnicode_1BYTE_KIND);
504 CHECK(ascii->state.ready == 1);
Victor Stinner910337b2011-10-03 03:20:16 +0200505 }
Victor Stinnera41463c2011-10-04 01:05:08 +0200506 else {
Victor Stinner85041a52011-10-03 14:42:39 +0200507 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
Victor Stinner7f11ad42011-10-04 00:00:20 +0200508 void *data;
Victor Stinner910337b2011-10-03 03:20:16 +0200509
Victor Stinnera41463c2011-10-04 01:05:08 +0200510 if (ascii->state.compact == 1) {
511 data = compact + 1;
Victor Stinner68762572019-10-07 18:42:01 +0200512 CHECK(kind == PyUnicode_1BYTE_KIND
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200513 || kind == PyUnicode_2BYTE_KIND
514 || kind == PyUnicode_4BYTE_KIND);
Victor Stinner68762572019-10-07 18:42:01 +0200515 CHECK(ascii->state.ascii == 0);
516 CHECK(ascii->state.ready == 1);
517 CHECK(compact->utf8 != data);
Victor Stinnere30c0a12011-11-04 20:54:05 +0100518 }
519 else {
Victor Stinnera41463c2011-10-04 01:05:08 +0200520 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
521
522 data = unicode->data.any;
523 if (kind == PyUnicode_WCHAR_KIND) {
Victor Stinner68762572019-10-07 18:42:01 +0200524 CHECK(ascii->length == 0);
525 CHECK(ascii->hash == -1);
526 CHECK(ascii->state.compact == 0);
527 CHECK(ascii->state.ascii == 0);
528 CHECK(ascii->state.ready == 0);
529 CHECK(ascii->state.interned == SSTATE_NOT_INTERNED);
530 CHECK(ascii->wstr != NULL);
531 CHECK(data == NULL);
532 CHECK(compact->utf8 == NULL);
Victor Stinnera41463c2011-10-04 01:05:08 +0200533 }
534 else {
Victor Stinner68762572019-10-07 18:42:01 +0200535 CHECK(kind == PyUnicode_1BYTE_KIND
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200536 || kind == PyUnicode_2BYTE_KIND
537 || kind == PyUnicode_4BYTE_KIND);
Victor Stinner68762572019-10-07 18:42:01 +0200538 CHECK(ascii->state.compact == 0);
539 CHECK(ascii->state.ready == 1);
540 CHECK(data != NULL);
Victor Stinnera41463c2011-10-04 01:05:08 +0200541 if (ascii->state.ascii) {
Victor Stinner68762572019-10-07 18:42:01 +0200542 CHECK(compact->utf8 == data);
543 CHECK(compact->utf8_length == ascii->length);
Victor Stinnera41463c2011-10-04 01:05:08 +0200544 }
545 else
Victor Stinner68762572019-10-07 18:42:01 +0200546 CHECK(compact->utf8 != data);
Victor Stinnera41463c2011-10-04 01:05:08 +0200547 }
548 }
549 if (kind != PyUnicode_WCHAR_KIND) {
Victor Stinner7f11ad42011-10-04 00:00:20 +0200550 if (
551#if SIZEOF_WCHAR_T == 2
552 kind == PyUnicode_2BYTE_KIND
553#else
554 kind == PyUnicode_4BYTE_KIND
555#endif
556 )
Victor Stinnera41463c2011-10-04 01:05:08 +0200557 {
Victor Stinner68762572019-10-07 18:42:01 +0200558 CHECK(ascii->wstr == data);
559 CHECK(compact->wstr_length == ascii->length);
Victor Stinnera41463c2011-10-04 01:05:08 +0200560 } else
Victor Stinner68762572019-10-07 18:42:01 +0200561 CHECK(ascii->wstr != data);
Victor Stinner910337b2011-10-03 03:20:16 +0200562 }
Victor Stinnera41463c2011-10-04 01:05:08 +0200563
564 if (compact->utf8 == NULL)
Victor Stinner68762572019-10-07 18:42:01 +0200565 CHECK(compact->utf8_length == 0);
Victor Stinnera41463c2011-10-04 01:05:08 +0200566 if (ascii->wstr == NULL)
Victor Stinner68762572019-10-07 18:42:01 +0200567 CHECK(compact->wstr_length == 0);
Victor Stinner910337b2011-10-03 03:20:16 +0200568 }
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200569
570 /* check that the best kind is used: O(n) operation */
571 if (check_content && kind != PyUnicode_WCHAR_KIND) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200572 Py_ssize_t i;
573 Py_UCS4 maxchar = 0;
Victor Stinner718fbf02012-04-26 00:39:37 +0200574 void *data;
575 Py_UCS4 ch;
576
577 data = PyUnicode_DATA(ascii);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200578 for (i=0; i < ascii->length; i++)
579 {
Victor Stinner718fbf02012-04-26 00:39:37 +0200580 ch = PyUnicode_READ(kind, data, i);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200581 if (ch > maxchar)
582 maxchar = ch;
583 }
584 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinner77faf692011-11-20 18:56:05 +0100585 if (ascii->state.ascii == 0) {
Victor Stinner68762572019-10-07 18:42:01 +0200586 CHECK(maxchar >= 128);
587 CHECK(maxchar <= 255);
Victor Stinner77faf692011-11-20 18:56:05 +0100588 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200589 else
Victor Stinner68762572019-10-07 18:42:01 +0200590 CHECK(maxchar < 128);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200591 }
Victor Stinner77faf692011-11-20 18:56:05 +0100592 else if (kind == PyUnicode_2BYTE_KIND) {
Victor Stinner68762572019-10-07 18:42:01 +0200593 CHECK(maxchar >= 0x100);
594 CHECK(maxchar <= 0xFFFF);
Victor Stinner77faf692011-11-20 18:56:05 +0100595 }
596 else {
Victor Stinner68762572019-10-07 18:42:01 +0200597 CHECK(maxchar >= 0x10000);
598 CHECK(maxchar <= MAX_UNICODE);
Victor Stinner77faf692011-11-20 18:56:05 +0100599 }
Victor Stinner68762572019-10-07 18:42:01 +0200600 CHECK(PyUnicode_READ(kind, data, ascii->length) == 0);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200601 }
Benjamin Petersonccc51c12011-10-03 19:34:12 -0400602 return 1;
Victor Stinner68762572019-10-07 18:42:01 +0200603
604#undef CHECK
Benjamin Petersonccc51c12011-10-03 19:34:12 -0400605}
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200606
Victor Stinner910337b2011-10-03 03:20:16 +0200607
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100608static PyObject*
609unicode_result_wchar(PyObject *unicode)
610{
611#ifndef Py_DEBUG
612 Py_ssize_t len;
613
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100614 len = _PyUnicode_WSTR_LENGTH(unicode);
615 if (len == 0) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100616 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200617 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100618 }
619
620 if (len == 1) {
621 wchar_t ch = _PyUnicode_WSTR(unicode)[0];
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100622 if ((Py_UCS4)ch < 256) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100623 PyObject *latin1_char = get_latin1_char((unsigned char)ch);
624 Py_DECREF(unicode);
625 return latin1_char;
626 }
627 }
628
629 if (_PyUnicode_Ready(unicode) < 0) {
Victor Stinneraa771272012-10-04 02:32:58 +0200630 Py_DECREF(unicode);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100631 return NULL;
632 }
633#else
Victor Stinneraa771272012-10-04 02:32:58 +0200634 assert(Py_REFCNT(unicode) == 1);
635
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100636 /* don't make the result ready in debug mode to ensure that the caller
637 makes the string ready before using it */
638 assert(_PyUnicode_CheckConsistency(unicode, 1));
639#endif
640 return unicode;
641}
642
643static PyObject*
644unicode_result_ready(PyObject *unicode)
645{
646 Py_ssize_t length;
647
648 length = PyUnicode_GET_LENGTH(unicode);
649 if (length == 0) {
650 if (unicode != unicode_empty) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100651 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200652 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100653 }
654 return unicode_empty;
655 }
656
657 if (length == 1) {
Victor Stinner69ed0f42013-04-09 21:48:24 +0200658 void *data = PyUnicode_DATA(unicode);
659 int kind = PyUnicode_KIND(unicode);
660 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100661 if (ch < 256) {
662 PyObject *latin1_char = unicode_latin1[ch];
663 if (latin1_char != NULL) {
664 if (unicode != latin1_char) {
665 Py_INCREF(latin1_char);
666 Py_DECREF(unicode);
667 }
668 return latin1_char;
669 }
670 else {
671 assert(_PyUnicode_CheckConsistency(unicode, 1));
672 Py_INCREF(unicode);
673 unicode_latin1[ch] = unicode;
674 return unicode;
675 }
676 }
677 }
678
679 assert(_PyUnicode_CheckConsistency(unicode, 1));
680 return unicode;
681}
682
683static PyObject*
684unicode_result(PyObject *unicode)
685{
686 assert(_PyUnicode_CHECK(unicode));
687 if (PyUnicode_IS_READY(unicode))
688 return unicode_result_ready(unicode);
689 else
690 return unicode_result_wchar(unicode);
691}
692
Victor Stinnerc4b49542011-12-11 22:44:26 +0100693static PyObject*
694unicode_result_unchanged(PyObject *unicode)
695{
696 if (PyUnicode_CheckExact(unicode)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -0500697 if (PyUnicode_READY(unicode) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +0100698 return NULL;
699 Py_INCREF(unicode);
700 return unicode;
701 }
702 else
703 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +0100704 return _PyUnicode_Copy(unicode);
Victor Stinnerc4b49542011-12-11 22:44:26 +0100705}
706
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200707/* Implementation of the "backslashreplace" error handler for 8-bit encodings:
708 ASCII, Latin1, UTF-8, etc. */
709static char*
Victor Stinnerad771582015-10-09 12:38:53 +0200710backslashreplace(_PyBytesWriter *writer, char *str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200711 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
712{
Victor Stinnerad771582015-10-09 12:38:53 +0200713 Py_ssize_t size, i;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200714 Py_UCS4 ch;
715 enum PyUnicode_Kind kind;
716 void *data;
717
718 assert(PyUnicode_IS_READY(unicode));
719 kind = PyUnicode_KIND(unicode);
720 data = PyUnicode_DATA(unicode);
721
722 size = 0;
723 /* determine replacement size */
724 for (i = collstart; i < collend; ++i) {
725 Py_ssize_t incr;
726
727 ch = PyUnicode_READ(kind, data, i);
728 if (ch < 0x100)
729 incr = 2+2;
730 else if (ch < 0x10000)
731 incr = 2+4;
732 else {
733 assert(ch <= MAX_UNICODE);
Victor Stinner3fa36ff2015-10-09 03:37:11 +0200734 incr = 2+8;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200735 }
736 if (size > PY_SSIZE_T_MAX - incr) {
737 PyErr_SetString(PyExc_OverflowError,
738 "encoded result is too long for a Python string");
739 return NULL;
740 }
741 size += incr;
742 }
743
Victor Stinnerad771582015-10-09 12:38:53 +0200744 str = _PyBytesWriter_Prepare(writer, str, size);
745 if (str == NULL)
746 return NULL;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200747
748 /* generate replacement */
749 for (i = collstart; i < collend; ++i) {
750 ch = PyUnicode_READ(kind, data, i);
Victor Stinner797485e2015-10-09 03:17:30 +0200751 *str++ = '\\';
752 if (ch >= 0x00010000) {
753 *str++ = 'U';
754 *str++ = Py_hexdigits[(ch>>28)&0xf];
755 *str++ = Py_hexdigits[(ch>>24)&0xf];
756 *str++ = Py_hexdigits[(ch>>20)&0xf];
757 *str++ = Py_hexdigits[(ch>>16)&0xf];
758 *str++ = Py_hexdigits[(ch>>12)&0xf];
759 *str++ = Py_hexdigits[(ch>>8)&0xf];
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200760 }
Victor Stinner797485e2015-10-09 03:17:30 +0200761 else if (ch >= 0x100) {
762 *str++ = 'u';
763 *str++ = Py_hexdigits[(ch>>12)&0xf];
764 *str++ = Py_hexdigits[(ch>>8)&0xf];
765 }
766 else
767 *str++ = 'x';
768 *str++ = Py_hexdigits[(ch>>4)&0xf];
769 *str++ = Py_hexdigits[ch&0xf];
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200770 }
771 return str;
772}
773
774/* Implementation of the "xmlcharrefreplace" error handler for 8-bit encodings:
775 ASCII, Latin1, UTF-8, etc. */
776static char*
Victor Stinnerad771582015-10-09 12:38:53 +0200777xmlcharrefreplace(_PyBytesWriter *writer, char *str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200778 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
779{
Victor Stinnerad771582015-10-09 12:38:53 +0200780 Py_ssize_t size, i;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200781 Py_UCS4 ch;
782 enum PyUnicode_Kind kind;
783 void *data;
784
785 assert(PyUnicode_IS_READY(unicode));
786 kind = PyUnicode_KIND(unicode);
787 data = PyUnicode_DATA(unicode);
788
789 size = 0;
790 /* determine replacement size */
791 for (i = collstart; i < collend; ++i) {
792 Py_ssize_t incr;
793
794 ch = PyUnicode_READ(kind, data, i);
795 if (ch < 10)
796 incr = 2+1+1;
797 else if (ch < 100)
798 incr = 2+2+1;
799 else if (ch < 1000)
800 incr = 2+3+1;
801 else if (ch < 10000)
802 incr = 2+4+1;
803 else if (ch < 100000)
804 incr = 2+5+1;
805 else if (ch < 1000000)
806 incr = 2+6+1;
807 else {
808 assert(ch <= MAX_UNICODE);
809 incr = 2+7+1;
810 }
811 if (size > PY_SSIZE_T_MAX - incr) {
812 PyErr_SetString(PyExc_OverflowError,
813 "encoded result is too long for a Python string");
814 return NULL;
815 }
816 size += incr;
817 }
818
Victor Stinnerad771582015-10-09 12:38:53 +0200819 str = _PyBytesWriter_Prepare(writer, str, size);
820 if (str == NULL)
821 return NULL;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200822
823 /* generate replacement */
824 for (i = collstart; i < collend; ++i) {
825 str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
826 }
827 return str;
828}
829
Thomas Wouters477c8d52006-05-27 19:21:47 +0000830/* --- Bloom Filters ----------------------------------------------------- */
831
832/* stuff to implement simple "bloom filters" for Unicode characters.
833 to keep things simple, we use a single bitmask, using the least 5
834 bits from each unicode characters as the bit index. */
835
836/* the linebreak mask is set up by Unicode_Init below */
837
Antoine Pitrouf068f942010-01-13 14:19:12 +0000838#if LONG_BIT >= 128
839#define BLOOM_WIDTH 128
840#elif LONG_BIT >= 64
841#define BLOOM_WIDTH 64
842#elif LONG_BIT >= 32
843#define BLOOM_WIDTH 32
844#else
845#error "LONG_BIT is smaller than 32"
846#endif
847
Thomas Wouters477c8d52006-05-27 19:21:47 +0000848#define BLOOM_MASK unsigned long
849
Serhiy Storchaka05997252013-01-26 12:14:02 +0200850static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000851
Antoine Pitrouf068f942010-01-13 14:19:12 +0000852#define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000853
Benjamin Peterson29060642009-01-31 22:14:21 +0000854#define BLOOM_LINEBREAK(ch) \
855 ((ch) < 128U ? ascii_linebreak[(ch)] : \
856 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000857
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700858static inline BLOOM_MASK
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200859make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000860{
Victor Stinnera85af502013-04-09 21:53:54 +0200861#define BLOOM_UPDATE(TYPE, MASK, PTR, LEN) \
862 do { \
863 TYPE *data = (TYPE *)PTR; \
864 TYPE *end = data + LEN; \
865 Py_UCS4 ch; \
866 for (; data != end; data++) { \
867 ch = *data; \
868 MASK |= (1UL << (ch & (BLOOM_WIDTH - 1))); \
869 } \
870 break; \
871 } while (0)
872
Thomas Wouters477c8d52006-05-27 19:21:47 +0000873 /* calculate simple bloom-style bitmask for a given unicode string */
874
Antoine Pitrouf068f942010-01-13 14:19:12 +0000875 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000876
877 mask = 0;
Victor Stinnera85af502013-04-09 21:53:54 +0200878 switch (kind) {
879 case PyUnicode_1BYTE_KIND:
880 BLOOM_UPDATE(Py_UCS1, mask, ptr, len);
881 break;
882 case PyUnicode_2BYTE_KIND:
883 BLOOM_UPDATE(Py_UCS2, mask, ptr, len);
884 break;
885 case PyUnicode_4BYTE_KIND:
886 BLOOM_UPDATE(Py_UCS4, mask, ptr, len);
887 break;
888 default:
Barry Warsawb2e57942017-09-14 18:13:16 -0700889 Py_UNREACHABLE();
Victor Stinnera85af502013-04-09 21:53:54 +0200890 }
Thomas Wouters477c8d52006-05-27 19:21:47 +0000891 return mask;
Victor Stinnera85af502013-04-09 21:53:54 +0200892
893#undef BLOOM_UPDATE
Thomas Wouters477c8d52006-05-27 19:21:47 +0000894}
895
Serhiy Storchaka21a663e2016-04-13 15:37:23 +0300896static int
897ensure_unicode(PyObject *obj)
898{
899 if (!PyUnicode_Check(obj)) {
900 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +0200901 "must be str, not %.100s",
902 Py_TYPE(obj)->tp_name);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +0300903 return -1;
904 }
905 return PyUnicode_READY(obj);
906}
907
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200908/* Compilation of templated routines */
909
910#include "stringlib/asciilib.h"
911#include "stringlib/fastsearch.h"
912#include "stringlib/partition.h"
913#include "stringlib/split.h"
914#include "stringlib/count.h"
915#include "stringlib/find.h"
916#include "stringlib/find_max_char.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200917#include "stringlib/undef.h"
918
919#include "stringlib/ucs1lib.h"
920#include "stringlib/fastsearch.h"
921#include "stringlib/partition.h"
922#include "stringlib/split.h"
923#include "stringlib/count.h"
924#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300925#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200926#include "stringlib/find_max_char.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200927#include "stringlib/undef.h"
928
929#include "stringlib/ucs2lib.h"
930#include "stringlib/fastsearch.h"
931#include "stringlib/partition.h"
932#include "stringlib/split.h"
933#include "stringlib/count.h"
934#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300935#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200936#include "stringlib/find_max_char.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200937#include "stringlib/undef.h"
938
939#include "stringlib/ucs4lib.h"
940#include "stringlib/fastsearch.h"
941#include "stringlib/partition.h"
942#include "stringlib/split.h"
943#include "stringlib/count.h"
944#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300945#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200946#include "stringlib/find_max_char.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200947#include "stringlib/undef.h"
948
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200949#include "stringlib/unicodedefs.h"
950#include "stringlib/fastsearch.h"
951#include "stringlib/count.h"
952#include "stringlib/find.h"
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100953#include "stringlib/undef.h"
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200954
Guido van Rossumd57fd912000-03-10 22:53:23 +0000955/* --- Unicode Object ----------------------------------------------------- */
956
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700957static inline Py_ssize_t
958findchar(const void *s, int kind,
959 Py_ssize_t size, Py_UCS4 ch,
960 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200961{
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200962 switch (kind) {
963 case PyUnicode_1BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200964 if ((Py_UCS1) ch != ch)
965 return -1;
966 if (direction > 0)
967 return ucs1lib_find_char((Py_UCS1 *) s, size, (Py_UCS1) ch);
968 else
969 return ucs1lib_rfind_char((Py_UCS1 *) s, size, (Py_UCS1) ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200970 case PyUnicode_2BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200971 if ((Py_UCS2) ch != ch)
972 return -1;
973 if (direction > 0)
974 return ucs2lib_find_char((Py_UCS2 *) s, size, (Py_UCS2) ch);
975 else
976 return ucs2lib_rfind_char((Py_UCS2 *) s, size, (Py_UCS2) ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200977 case PyUnicode_4BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200978 if (direction > 0)
979 return ucs4lib_find_char((Py_UCS4 *) s, size, ch);
980 else
981 return ucs4lib_rfind_char((Py_UCS4 *) s, size, ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200982 default:
Barry Warsawb2e57942017-09-14 18:13:16 -0700983 Py_UNREACHABLE();
Victor Stinner9e7a1bc2011-10-13 00:18:12 +0200984 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200985}
986
#ifdef Py_DEBUG
/* Fill the data of a Unicode string with invalid characters to detect bugs
   earlier.

   Only the characters in [old_length, current length) are overwritten, each
   byte with 0xff; nothing is done when the string did not grow.

   _PyUnicode_CheckConsistency(str, 1) detects invalid characters, at least for
   ASCII and UCS-4 strings. U+00FF is invalid in ASCII and U+FFFFFFFF is an
   invalid character in Unicode 6.0. */
static void
unicode_fill_invalid(PyObject *unicode, Py_ssize_t old_length)
{
    int char_size = PyUnicode_KIND(unicode);    /* bytes per character */
    Py_UCS1 *base = PyUnicode_1BYTE_DATA(unicode);
    Py_ssize_t new_length = _PyUnicode_LENGTH(unicode);

    if (new_length > old_length) {
        memset(base + old_length * char_size, 0xff,
               (new_length - old_length) * char_size);
    }
}
#endif
1005
Victor Stinnerfe226c02011-10-03 03:52:20 +02001006static PyObject*
1007resize_compact(PyObject *unicode, Py_ssize_t length)
1008{
1009 Py_ssize_t char_size;
1010 Py_ssize_t struct_size;
1011 Py_ssize_t new_size;
1012 int share_wstr;
Victor Stinner84def372011-12-11 20:04:56 +01001013 PyObject *new_unicode;
Victor Stinnerafffce42012-10-03 23:03:17 +02001014#ifdef Py_DEBUG
1015 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
1016#endif
1017
Victor Stinner79891572012-05-03 13:43:07 +02001018 assert(unicode_modifiable(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001019 assert(PyUnicode_IS_READY(unicode));
Victor Stinner488fa492011-12-12 00:01:39 +01001020 assert(PyUnicode_IS_COMPACT(unicode));
1021
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001022 char_size = PyUnicode_KIND(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +01001023 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerfe226c02011-10-03 03:52:20 +02001024 struct_size = sizeof(PyASCIIObject);
1025 else
1026 struct_size = sizeof(PyCompactUnicodeObject);
Victor Stinnerc379ead2011-10-03 12:52:27 +02001027 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001028
Victor Stinnerfe226c02011-10-03 03:52:20 +02001029 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
1030 PyErr_NoMemory();
1031 return NULL;
1032 }
1033 new_size = (struct_size + (length + 1) * char_size);
1034
Serhiy Storchaka7aa69082015-12-03 01:02:03 +02001035 if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) {
1036 PyObject_DEL(_PyUnicode_UTF8(unicode));
1037 _PyUnicode_UTF8(unicode) = NULL;
1038 _PyUnicode_UTF8_LENGTH(unicode) = 0;
1039 }
Victor Stinner49932fe2020-02-03 17:55:05 +01001040#ifdef Py_REF_DEBUG
1041 _Py_RefTotal--;
1042#endif
1043#ifdef Py_TRACE_REFS
Victor Stinner84def372011-12-11 20:04:56 +01001044 _Py_ForgetReference(unicode);
Victor Stinner49932fe2020-02-03 17:55:05 +01001045#endif
Victor Stinner84def372011-12-11 20:04:56 +01001046
Serhiy Storchaka20b39b22014-09-28 11:27:24 +03001047 new_unicode = (PyObject *)PyObject_REALLOC(unicode, new_size);
Victor Stinner84def372011-12-11 20:04:56 +01001048 if (new_unicode == NULL) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001049 _Py_NewReference(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001050 PyErr_NoMemory();
1051 return NULL;
1052 }
Victor Stinner84def372011-12-11 20:04:56 +01001053 unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001054 _Py_NewReference(unicode);
Victor Stinner84def372011-12-11 20:04:56 +01001055
Victor Stinnerfe226c02011-10-03 03:52:20 +02001056 _PyUnicode_LENGTH(unicode) = length;
Victor Stinnerc379ead2011-10-03 12:52:27 +02001057 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001058 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +01001059 if (!PyUnicode_IS_ASCII(unicode))
Victor Stinnerc379ead2011-10-03 12:52:27 +02001060 _PyUnicode_WSTR_LENGTH(unicode) = length;
1061 }
Victor Stinnerbbbac2e2013-02-07 23:12:46 +01001062 else if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) {
1063 PyObject_DEL(_PyUnicode_WSTR(unicode));
1064 _PyUnicode_WSTR(unicode) = NULL;
Victor Stinner5bc03a62016-01-27 16:56:53 +01001065 if (!PyUnicode_IS_ASCII(unicode))
1066 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Victor Stinnerbbbac2e2013-02-07 23:12:46 +01001067 }
Victor Stinnerafffce42012-10-03 23:03:17 +02001068#ifdef Py_DEBUG
1069 unicode_fill_invalid(unicode, old_length);
1070#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001071 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
1072 length, 0);
Victor Stinner79891572012-05-03 13:43:07 +02001073 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001074 return unicode;
1075}
1076
Alexander Belopolsky40018472011-02-26 01:02:56 +00001077static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001078resize_inplace(PyObject *unicode, Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001079{
Victor Stinner95663112011-10-04 01:03:50 +02001080 wchar_t *wstr;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001081 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001082 assert(!PyUnicode_IS_COMPACT(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001083 assert(Py_REFCNT(unicode) == 1);
Tim Petersced69f82003-09-16 20:30:58 +00001084
Victor Stinnerfe226c02011-10-03 03:52:20 +02001085 if (PyUnicode_IS_READY(unicode)) {
1086 Py_ssize_t char_size;
Victor Stinner1c8d0c72011-10-03 12:11:00 +02001087 int share_wstr, share_utf8;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001088 void *data;
Victor Stinnerafffce42012-10-03 23:03:17 +02001089#ifdef Py_DEBUG
1090 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
1091#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001092
1093 data = _PyUnicode_DATA_ANY(unicode);
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001094 char_size = PyUnicode_KIND(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +02001095 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
1096 share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001097
1098 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
1099 PyErr_NoMemory();
1100 return -1;
1101 }
1102 new_size = (length + 1) * char_size;
1103
Victor Stinner7a9105a2011-12-12 00:13:42 +01001104 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
1105 {
1106 PyObject_DEL(_PyUnicode_UTF8(unicode));
1107 _PyUnicode_UTF8(unicode) = NULL;
1108 _PyUnicode_UTF8_LENGTH(unicode) = 0;
1109 }
1110
Victor Stinnerfe226c02011-10-03 03:52:20 +02001111 data = (PyObject *)PyObject_REALLOC(data, new_size);
1112 if (data == NULL) {
1113 PyErr_NoMemory();
1114 return -1;
1115 }
1116 _PyUnicode_DATA_ANY(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +02001117 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001118 _PyUnicode_WSTR(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +02001119 _PyUnicode_WSTR_LENGTH(unicode) = length;
1120 }
1121 if (share_utf8) {
Victor Stinner1c8d0c72011-10-03 12:11:00 +02001122 _PyUnicode_UTF8(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +02001123 _PyUnicode_UTF8_LENGTH(unicode) = length;
1124 }
Victor Stinnerfe226c02011-10-03 03:52:20 +02001125 _PyUnicode_LENGTH(unicode) = length;
1126 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
Victor Stinnerafffce42012-10-03 23:03:17 +02001127#ifdef Py_DEBUG
1128 unicode_fill_invalid(unicode, old_length);
1129#endif
Victor Stinner95663112011-10-04 01:03:50 +02001130 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001131 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001132 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001133 }
Victor Stinnerfe226c02011-10-03 03:52:20 +02001134 }
Victor Stinner95663112011-10-04 01:03:50 +02001135 assert(_PyUnicode_WSTR(unicode) != NULL);
1136
1137 /* check for integer overflow */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -07001138 if (length > PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) - 1) {
Victor Stinner95663112011-10-04 01:03:50 +02001139 PyErr_NoMemory();
1140 return -1;
1141 }
Victor Stinner7a9105a2011-12-12 00:13:42 +01001142 new_size = sizeof(wchar_t) * (length + 1);
Victor Stinner95663112011-10-04 01:03:50 +02001143 wstr = _PyUnicode_WSTR(unicode);
Victor Stinner7a9105a2011-12-12 00:13:42 +01001144 wstr = PyObject_REALLOC(wstr, new_size);
Victor Stinner95663112011-10-04 01:03:50 +02001145 if (!wstr) {
1146 PyErr_NoMemory();
1147 return -1;
1148 }
1149 _PyUnicode_WSTR(unicode) = wstr;
1150 _PyUnicode_WSTR(unicode)[length] = 0;
1151 _PyUnicode_WSTR_LENGTH(unicode) = length;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001152 assert(_PyUnicode_CheckConsistency(unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001153 return 0;
1154}
1155
Victor Stinnerfe226c02011-10-03 03:52:20 +02001156static PyObject*
1157resize_copy(PyObject *unicode, Py_ssize_t length)
1158{
1159 Py_ssize_t copy_length;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001160 if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001161 PyObject *copy;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001162
Serhiy Storchaka2e58f1a2016-10-09 23:44:48 +03001163 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001164
1165 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
1166 if (copy == NULL)
1167 return NULL;
1168
1169 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
Victor Stinnerd3f08822012-05-29 12:57:52 +02001170 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, copy_length);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001171 return copy;
Victor Stinner8cfcbed2011-10-03 23:19:21 +02001172 }
1173 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001174 PyObject *w;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001175
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001176 w = (PyObject*)_PyUnicode_New(length);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001177 if (w == NULL)
1178 return NULL;
1179 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
1180 copy_length = Py_MIN(copy_length, length);
Christian Heimesf051e432016-09-13 20:22:02 +02001181 memcpy(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
Victor Stinnerc6cf1ba2012-10-23 02:54:47 +02001182 copy_length * sizeof(wchar_t));
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001183 return w;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001184 }
1185}
1186
Guido van Rossumd57fd912000-03-10 22:53:23 +00001187/* We allocate one more byte to make sure the string is
Martin v. Löwis47383402007-08-15 07:32:56 +00001188 Ux0000 terminated; some code (e.g. new_identifier)
1189 relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +00001190
1191 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +00001192 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +00001193
1194*/
1195
Alexander Belopolsky40018472011-02-26 01:02:56 +00001196static PyUnicodeObject *
1197_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001198{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001199 PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001200 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001201
Thomas Wouters477c8d52006-05-27 19:21:47 +00001202 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001203 if (length == 0 && unicode_empty != NULL) {
1204 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001205 return (PyUnicodeObject*)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001206 }
1207
Neal Norwitz3ce5d922008-08-24 07:08:55 +00001208 /* Ensure we won't overflow the size. */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -07001209 if (length > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00001210 return (PyUnicodeObject *)PyErr_NoMemory();
1211 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001212 if (length < 0) {
1213 PyErr_SetString(PyExc_SystemError,
1214 "Negative size passed to _PyUnicode_New");
1215 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001216 }
1217
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001218 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
1219 if (unicode == NULL)
1220 return NULL;
1221 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
Victor Stinner68b674c2013-10-29 19:31:43 +01001222
1223 _PyUnicode_WSTR_LENGTH(unicode) = length;
1224 _PyUnicode_HASH(unicode) = -1;
1225 _PyUnicode_STATE(unicode).interned = 0;
1226 _PyUnicode_STATE(unicode).kind = 0;
1227 _PyUnicode_STATE(unicode).compact = 0;
1228 _PyUnicode_STATE(unicode).ready = 0;
1229 _PyUnicode_STATE(unicode).ascii = 0;
1230 _PyUnicode_DATA_ANY(unicode) = NULL;
1231 _PyUnicode_LENGTH(unicode) = 0;
1232 _PyUnicode_UTF8(unicode) = NULL;
1233 _PyUnicode_UTF8_LENGTH(unicode) = 0;
1234
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001235 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
1236 if (!_PyUnicode_WSTR(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001237 Py_DECREF(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00001238 PyErr_NoMemory();
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001239 return NULL;
Guido van Rossum3c1bb802000-04-27 20:13:50 +00001240 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001241
Jeremy Hyltond8082792003-09-16 19:41:39 +00001242 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +00001243 * the caller fails before initializing str -- unicode_resize()
1244 * reads str[0], and the Keep-Alive optimization can keep memory
1245 * allocated for str alive across a call to unicode_dealloc(unicode).
1246 * We don't want unicode_resize to read uninitialized memory in
1247 * that case.
1248 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001249 _PyUnicode_WSTR(unicode)[0] = 0;
1250 _PyUnicode_WSTR(unicode)[length] = 0;
Victor Stinner68b674c2013-10-29 19:31:43 +01001251
Victor Stinner7931d9a2011-11-04 00:22:48 +01001252 assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001253 return unicode;
1254}
1255
Victor Stinnerf42dc442011-10-02 23:33:16 +02001256static const char*
1257unicode_kind_name(PyObject *unicode)
1258{
Victor Stinner42dfd712011-10-03 14:41:45 +02001259 /* don't check consistency: unicode_kind_name() is called from
1260 _PyUnicode_Dump() */
Victor Stinnerf42dc442011-10-02 23:33:16 +02001261 if (!PyUnicode_IS_COMPACT(unicode))
1262 {
1263 if (!PyUnicode_IS_READY(unicode))
1264 return "wstr";
Benjamin Petersonead6b532011-12-20 17:23:42 -06001265 switch (PyUnicode_KIND(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001266 {
1267 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001268 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001269 return "legacy ascii";
1270 else
1271 return "legacy latin1";
1272 case PyUnicode_2BYTE_KIND:
1273 return "legacy UCS2";
1274 case PyUnicode_4BYTE_KIND:
1275 return "legacy UCS4";
1276 default:
1277 return "<legacy invalid kind>";
1278 }
1279 }
1280 assert(PyUnicode_IS_READY(unicode));
Benjamin Petersonead6b532011-12-20 17:23:42 -06001281 switch (PyUnicode_KIND(unicode)) {
Victor Stinnerf42dc442011-10-02 23:33:16 +02001282 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001283 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001284 return "ascii";
1285 else
Victor Stinnera3b334d2011-10-03 13:53:37 +02001286 return "latin1";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001287 case PyUnicode_2BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001288 return "UCS2";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001289 case PyUnicode_4BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001290 return "UCS4";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001291 default:
1292 return "<invalid compact kind>";
1293 }
1294}
1295
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001296#ifdef Py_DEBUG
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001297/* Functions wrapping macros for use in debugger */
Victor Stinnera42de742018-11-22 10:25:22 +01001298char *_PyUnicode_utf8(void *unicode_raw){
1299 PyObject *unicode = _PyObject_CAST(unicode_raw);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001300 return PyUnicode_UTF8(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001301}
1302
Victor Stinnera42de742018-11-22 10:25:22 +01001303void *_PyUnicode_compact_data(void *unicode_raw) {
1304 PyObject *unicode = _PyObject_CAST(unicode_raw);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001305 return _PyUnicode_COMPACT_DATA(unicode);
1306}
Victor Stinnera42de742018-11-22 10:25:22 +01001307void *_PyUnicode_data(void *unicode_raw) {
1308 PyObject *unicode = _PyObject_CAST(unicode_raw);
Zackery Spytz1a2252e2019-05-06 10:56:51 -06001309 printf("obj %p\n", (void*)unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001310 printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
1311 printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
1312 printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
1313 printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
1314 printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
1315 return PyUnicode_DATA(unicode);
1316}
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001317
1318void
1319_PyUnicode_Dump(PyObject *op)
1320{
1321 PyASCIIObject *ascii = (PyASCIIObject *)op;
Victor Stinnera849a4b2011-10-03 12:12:11 +02001322 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
1323 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
1324 void *data;
Victor Stinner0d60e872011-10-23 19:47:19 +02001325
Victor Stinnera849a4b2011-10-03 12:12:11 +02001326 if (ascii->state.compact)
Victor Stinner0d60e872011-10-23 19:47:19 +02001327 {
1328 if (ascii->state.ascii)
1329 data = (ascii + 1);
1330 else
1331 data = (compact + 1);
1332 }
Victor Stinnera849a4b2011-10-03 12:12:11 +02001333 else
1334 data = unicode->data.any;
Victor Stinner293f3f52014-07-01 08:57:10 +02001335 printf("%s: len=%" PY_FORMAT_SIZE_T "u, ",
1336 unicode_kind_name(op), ascii->length);
Victor Stinner0d60e872011-10-23 19:47:19 +02001337
Victor Stinnera849a4b2011-10-03 12:12:11 +02001338 if (ascii->wstr == data)
1339 printf("shared ");
Zackery Spytz1a2252e2019-05-06 10:56:51 -06001340 printf("wstr=%p", (void *)ascii->wstr);
Victor Stinner0d60e872011-10-23 19:47:19 +02001341
Victor Stinnera3b334d2011-10-03 13:53:37 +02001342 if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
Victor Stinner293f3f52014-07-01 08:57:10 +02001343 printf(" (%" PY_FORMAT_SIZE_T "u), ", compact->wstr_length);
Victor Stinnera849a4b2011-10-03 12:12:11 +02001344 if (!ascii->state.compact && compact->utf8 == unicode->data.any)
1345 printf("shared ");
Victor Stinner293f3f52014-07-01 08:57:10 +02001346 printf("utf8=%p (%" PY_FORMAT_SIZE_T "u)",
Zackery Spytz1a2252e2019-05-06 10:56:51 -06001347 (void *)compact->utf8, compact->utf8_length);
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001348 }
Victor Stinnera849a4b2011-10-03 12:12:11 +02001349 printf(", data=%p\n", data);
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001350}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001351#endif
1352
1353PyObject *
1354PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
1355{
1356 PyObject *obj;
1357 PyCompactUnicodeObject *unicode;
1358 void *data;
Victor Stinner8f825062012-04-27 13:55:39 +02001359 enum PyUnicode_Kind kind;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001360 int is_sharing, is_ascii;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001361 Py_ssize_t char_size;
1362 Py_ssize_t struct_size;
1363
1364 /* Optimization for empty strings */
1365 if (size == 0 && unicode_empty != NULL) {
1366 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001367 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001368 }
1369
Victor Stinner9e9d6892011-10-04 01:02:02 +02001370 is_ascii = 0;
1371 is_sharing = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001372 struct_size = sizeof(PyCompactUnicodeObject);
1373 if (maxchar < 128) {
Victor Stinner8f825062012-04-27 13:55:39 +02001374 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001375 char_size = 1;
1376 is_ascii = 1;
1377 struct_size = sizeof(PyASCIIObject);
1378 }
1379 else if (maxchar < 256) {
Victor Stinner8f825062012-04-27 13:55:39 +02001380 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001381 char_size = 1;
1382 }
1383 else if (maxchar < 65536) {
Victor Stinner8f825062012-04-27 13:55:39 +02001384 kind = PyUnicode_2BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001385 char_size = 2;
1386 if (sizeof(wchar_t) == 2)
1387 is_sharing = 1;
1388 }
1389 else {
Victor Stinnerc9590ad2012-03-04 01:34:37 +01001390 if (maxchar > MAX_UNICODE) {
1391 PyErr_SetString(PyExc_SystemError,
1392 "invalid maximum character passed to PyUnicode_New");
1393 return NULL;
1394 }
Victor Stinner8f825062012-04-27 13:55:39 +02001395 kind = PyUnicode_4BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001396 char_size = 4;
1397 if (sizeof(wchar_t) == 4)
1398 is_sharing = 1;
1399 }
1400
1401 /* Ensure we won't overflow the size. */
1402 if (size < 0) {
1403 PyErr_SetString(PyExc_SystemError,
1404 "Negative size passed to PyUnicode_New");
1405 return NULL;
1406 }
1407 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1408 return PyErr_NoMemory();
1409
1410 /* Duplicated allocation code from _PyObject_New() instead of a call to
1411 * PyObject_New() so we are able to allocate space for the object and
1412 * it's data buffer.
1413 */
1414 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
1415 if (obj == NULL)
1416 return PyErr_NoMemory();
1417 obj = PyObject_INIT(obj, &PyUnicode_Type);
1418 if (obj == NULL)
1419 return NULL;
1420
1421 unicode = (PyCompactUnicodeObject *)obj;
1422 if (is_ascii)
1423 data = ((PyASCIIObject*)obj) + 1;
1424 else
1425 data = unicode + 1;
1426 _PyUnicode_LENGTH(unicode) = size;
1427 _PyUnicode_HASH(unicode) = -1;
1428 _PyUnicode_STATE(unicode).interned = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001429 _PyUnicode_STATE(unicode).kind = kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001430 _PyUnicode_STATE(unicode).compact = 1;
1431 _PyUnicode_STATE(unicode).ready = 1;
1432 _PyUnicode_STATE(unicode).ascii = is_ascii;
1433 if (is_ascii) {
1434 ((char*)data)[size] = 0;
1435 _PyUnicode_WSTR(unicode) = NULL;
1436 }
Victor Stinner8f825062012-04-27 13:55:39 +02001437 else if (kind == PyUnicode_1BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001438 ((char*)data)[size] = 0;
1439 _PyUnicode_WSTR(unicode) = NULL;
1440 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001441 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001442 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001443 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001444 else {
1445 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001446 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001447 if (kind == PyUnicode_2BYTE_KIND)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001448 ((Py_UCS2*)data)[size] = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001449 else /* kind == PyUnicode_4BYTE_KIND */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001450 ((Py_UCS4*)data)[size] = 0;
1451 if (is_sharing) {
1452 _PyUnicode_WSTR_LENGTH(unicode) = size;
1453 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
1454 }
1455 else {
1456 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1457 _PyUnicode_WSTR(unicode) = NULL;
1458 }
1459 }
Victor Stinner8f825062012-04-27 13:55:39 +02001460#ifdef Py_DEBUG
Victor Stinnerafffce42012-10-03 23:03:17 +02001461 unicode_fill_invalid((PyObject*)unicode, 0);
Victor Stinner8f825062012-04-27 13:55:39 +02001462#endif
Victor Stinner7931d9a2011-11-04 00:22:48 +01001463 assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001464 return obj;
1465}
1466
#if SIZEOF_WCHAR_T == 2
/* Convert the 16-bit wchar_t range [begin, end) to UCS4, writing the result
   into the 4-byte data buffer of `unicode`.  A high surrogate immediately
   followed by a low surrogate is joined into one code point; every other
   unit (lone surrogates included) is copied through unchanged.  The other
   conversions are implemented as macros for efficiency.

   This function assumes that unicode can hold one more code point than wstr
   characters for a terminating null character. */
static void
unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
                              PyObject *unicode)
{
    const wchar_t *src = begin;
    Py_UCS4 *dst;

    assert(unicode != NULL);
    assert(_PyUnicode_CHECK(unicode));
    assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
    dst = PyUnicode_4BYTE_DATA(unicode);

    while (src < end) {
        assert(dst < (PyUnicode_4BYTE_DATA(unicode) +
                      _PyUnicode_GET_LENGTH(unicode)));
        if ((src + 1) < end
            && Py_UNICODE_IS_HIGH_SURROGATE(src[0])
            && Py_UNICODE_IS_LOW_SURROGATE(src[1]))
        {
            /* Well-formed pair: two input units become one code point. */
            *dst = Py_UNICODE_JOIN_SURROGATES(src[0], src[1]);
            src += 2;
        }
        else {
            *dst = *src;
            src += 1;
        }
        dst++;
    }
    /* The output must have been filled exactly. */
    assert(dst == (PyUnicode_4BYTE_DATA(unicode) +
                   _PyUnicode_GET_LENGTH(unicode)));
}
#endif
1506
Victor Stinnercd9950f2011-10-02 00:34:53 +02001507static int
Victor Stinner488fa492011-12-12 00:01:39 +01001508unicode_check_modifiable(PyObject *unicode)
Victor Stinnercd9950f2011-10-02 00:34:53 +02001509{
Victor Stinner488fa492011-12-12 00:01:39 +01001510 if (!unicode_modifiable(unicode)) {
Victor Stinner01698042011-10-04 00:04:26 +02001511 PyErr_SetString(PyExc_SystemError,
Victor Stinner488fa492011-12-12 00:01:39 +01001512 "Cannot modify a string currently used");
Victor Stinnercd9950f2011-10-02 00:34:53 +02001513 return -1;
1514 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02001515 return 0;
1516}
1517
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001518static int
1519_copy_characters(PyObject *to, Py_ssize_t to_start,
1520 PyObject *from, Py_ssize_t from_start,
1521 Py_ssize_t how_many, int check_maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001522{
Victor Stinnera0702ab2011-09-29 14:14:38 +02001523 unsigned int from_kind, to_kind;
1524 void *from_data, *to_data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001525
Victor Stinneree4544c2012-05-09 22:24:08 +02001526 assert(0 <= how_many);
1527 assert(0 <= from_start);
1528 assert(0 <= to_start);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001529 assert(PyUnicode_Check(from));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001530 assert(PyUnicode_IS_READY(from));
Victor Stinneree4544c2012-05-09 22:24:08 +02001531 assert(from_start + how_many <= PyUnicode_GET_LENGTH(from));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001532
Victor Stinnerd3f08822012-05-29 12:57:52 +02001533 assert(PyUnicode_Check(to));
1534 assert(PyUnicode_IS_READY(to));
1535 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1536
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001537 if (how_many == 0)
1538 return 0;
1539
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001540 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001541 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001542 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001543 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001544
Victor Stinnerf1852262012-06-16 16:38:26 +02001545#ifdef Py_DEBUG
1546 if (!check_maxchar
1547 && PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to))
1548 {
1549 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1550 Py_UCS4 ch;
1551 Py_ssize_t i;
1552 for (i=0; i < how_many; i++) {
1553 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1554 assert(ch <= to_maxchar);
1555 }
1556 }
1557#endif
1558
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001559 if (from_kind == to_kind) {
Victor Stinnerf1852262012-06-16 16:38:26 +02001560 if (check_maxchar
1561 && !PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))
1562 {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001563 /* Writing Latin-1 characters into an ASCII string requires to
1564 check that all written characters are pure ASCII */
Victor Stinnerf1852262012-06-16 16:38:26 +02001565 Py_UCS4 max_char;
1566 max_char = ucs1lib_find_max_char(from_data,
1567 (Py_UCS1*)from_data + how_many);
1568 if (max_char >= 128)
1569 return -1;
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001570 }
Christian Heimesf051e432016-09-13 20:22:02 +02001571 memcpy((char*)to_data + to_kind * to_start,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001572 (char*)from_data + from_kind * from_start,
1573 to_kind * how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001574 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001575 else if (from_kind == PyUnicode_1BYTE_KIND
1576 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001577 {
1578 _PyUnicode_CONVERT_BYTES(
1579 Py_UCS1, Py_UCS2,
1580 PyUnicode_1BYTE_DATA(from) + from_start,
1581 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1582 PyUnicode_2BYTE_DATA(to) + to_start
1583 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001584 }
Victor Stinner157f83f2011-09-28 21:41:31 +02001585 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001586 && to_kind == PyUnicode_4BYTE_KIND)
1587 {
1588 _PyUnicode_CONVERT_BYTES(
1589 Py_UCS1, Py_UCS4,
1590 PyUnicode_1BYTE_DATA(from) + from_start,
1591 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1592 PyUnicode_4BYTE_DATA(to) + to_start
1593 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001594 }
1595 else if (from_kind == PyUnicode_2BYTE_KIND
1596 && to_kind == PyUnicode_4BYTE_KIND)
1597 {
1598 _PyUnicode_CONVERT_BYTES(
1599 Py_UCS2, Py_UCS4,
1600 PyUnicode_2BYTE_DATA(from) + from_start,
1601 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1602 PyUnicode_4BYTE_DATA(to) + to_start
1603 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001604 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001605 else {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001606 assert (PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to));
1607
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001608 if (!check_maxchar) {
1609 if (from_kind == PyUnicode_2BYTE_KIND
1610 && to_kind == PyUnicode_1BYTE_KIND)
1611 {
1612 _PyUnicode_CONVERT_BYTES(
1613 Py_UCS2, Py_UCS1,
1614 PyUnicode_2BYTE_DATA(from) + from_start,
1615 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1616 PyUnicode_1BYTE_DATA(to) + to_start
1617 );
1618 }
1619 else if (from_kind == PyUnicode_4BYTE_KIND
1620 && to_kind == PyUnicode_1BYTE_KIND)
1621 {
1622 _PyUnicode_CONVERT_BYTES(
1623 Py_UCS4, Py_UCS1,
1624 PyUnicode_4BYTE_DATA(from) + from_start,
1625 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1626 PyUnicode_1BYTE_DATA(to) + to_start
1627 );
1628 }
1629 else if (from_kind == PyUnicode_4BYTE_KIND
1630 && to_kind == PyUnicode_2BYTE_KIND)
1631 {
1632 _PyUnicode_CONVERT_BYTES(
1633 Py_UCS4, Py_UCS2,
1634 PyUnicode_4BYTE_DATA(from) + from_start,
1635 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1636 PyUnicode_2BYTE_DATA(to) + to_start
1637 );
1638 }
1639 else {
Barry Warsawb2e57942017-09-14 18:13:16 -07001640 Py_UNREACHABLE();
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001641 }
1642 }
Victor Stinnerf1852262012-06-16 16:38:26 +02001643 else {
Victor Stinnera0702ab2011-09-29 14:14:38 +02001644 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001645 Py_UCS4 ch;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001646 Py_ssize_t i;
1647
Victor Stinnera0702ab2011-09-29 14:14:38 +02001648 for (i=0; i < how_many; i++) {
1649 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001650 if (ch > to_maxchar)
1651 return -1;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001652 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1653 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001654 }
1655 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001656 return 0;
1657}
1658
Victor Stinnerd3f08822012-05-29 12:57:52 +02001659void
1660_PyUnicode_FastCopyCharacters(
1661 PyObject *to, Py_ssize_t to_start,
1662 PyObject *from, Py_ssize_t from_start, Py_ssize_t how_many)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001663{
1664 (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1665}
1666
1667Py_ssize_t
1668PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1669 PyObject *from, Py_ssize_t from_start,
1670 Py_ssize_t how_many)
1671{
1672 int err;
1673
1674 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1675 PyErr_BadInternalCall();
1676 return -1;
1677 }
1678
Benjamin Petersonbac79492012-01-14 13:34:47 -05001679 if (PyUnicode_READY(from) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001680 return -1;
Benjamin Petersonbac79492012-01-14 13:34:47 -05001681 if (PyUnicode_READY(to) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001682 return -1;
1683
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001684 if ((size_t)from_start > (size_t)PyUnicode_GET_LENGTH(from)) {
Victor Stinnerd3f08822012-05-29 12:57:52 +02001685 PyErr_SetString(PyExc_IndexError, "string index out of range");
1686 return -1;
1687 }
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001688 if ((size_t)to_start > (size_t)PyUnicode_GET_LENGTH(to)) {
Victor Stinnerd3f08822012-05-29 12:57:52 +02001689 PyErr_SetString(PyExc_IndexError, "string index out of range");
1690 return -1;
1691 }
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001692 if (how_many < 0) {
1693 PyErr_SetString(PyExc_SystemError, "how_many cannot be negative");
1694 return -1;
1695 }
1696 how_many = Py_MIN(PyUnicode_GET_LENGTH(from)-from_start, how_many);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001697 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1698 PyErr_Format(PyExc_SystemError,
Victor Stinnera33bce02014-07-04 22:47:46 +02001699 "Cannot write %zi characters at %zi "
1700 "in a string of %zi characters",
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001701 how_many, to_start, PyUnicode_GET_LENGTH(to));
1702 return -1;
1703 }
1704
1705 if (how_many == 0)
1706 return 0;
1707
Victor Stinner488fa492011-12-12 00:01:39 +01001708 if (unicode_check_modifiable(to))
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001709 return -1;
1710
1711 err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1712 if (err) {
1713 PyErr_Format(PyExc_SystemError,
1714 "Cannot copy %s characters "
1715 "into a string of %s characters",
1716 unicode_kind_name(from),
1717 unicode_kind_name(to));
1718 return -1;
1719 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001720 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001721}
1722
Victor Stinner17222162011-09-28 22:15:37 +02001723/* Find the maximum code point and count the number of surrogate pairs so a
1724 correct string length can be computed before converting a string to UCS4.
1725 This function counts single surrogates as a character and not as a pair.
1726
1727 Return 0 on success, or -1 on error. */
1728static int
1729find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1730 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001731{
1732 const wchar_t *iter;
Victor Stinner8faf8212011-12-08 22:14:11 +01001733 Py_UCS4 ch;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001734
Victor Stinnerc53be962011-10-02 21:33:54 +02001735 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001736 *num_surrogates = 0;
1737 *maxchar = 0;
1738
1739 for (iter = begin; iter < end; ) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001740#if SIZEOF_WCHAR_T == 2
Victor Stinnercf77da92013-03-06 01:09:24 +01001741 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1742 && (iter+1) < end
1743 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
1744 {
1745 ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
1746 ++(*num_surrogates);
1747 iter += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001748 }
1749 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001750#endif
Victor Stinner8faf8212011-12-08 22:14:11 +01001751 {
1752 ch = *iter;
1753 iter++;
1754 }
1755 if (ch > *maxchar) {
1756 *maxchar = ch;
1757 if (*maxchar > MAX_UNICODE) {
1758 PyErr_Format(PyExc_ValueError,
1759 "character U+%x is not in range [U+0000; U+10ffff]",
1760 ch);
1761 return -1;
1762 }
1763 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001764 }
1765 return 0;
1766}
1767
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001768int
1769_PyUnicode_Ready(PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001770{
1771 wchar_t *end;
1772 Py_UCS4 maxchar = 0;
1773 Py_ssize_t num_surrogates;
1774#if SIZEOF_WCHAR_T == 2
1775 Py_ssize_t length_wo_surrogates;
1776#endif
1777
Georg Brandl7597add2011-10-05 16:36:47 +02001778 /* _PyUnicode_Ready() is only intended for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +02001779 strings were created using _PyObject_New() and where no canonical
1780 representation (the str field) has been set yet aka strings
1781 which are not yet ready. */
Victor Stinner910337b2011-10-03 03:20:16 +02001782 assert(_PyUnicode_CHECK(unicode));
1783 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001784 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerc3c74152011-10-02 20:39:55 +02001785 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001786 assert(_PyUnicode_UTF8(unicode) == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +02001787 /* Actually, it should neither be interned nor be anything else: */
1788 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001789
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001790 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +02001791 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +02001792 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001793 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001794
1795 if (maxchar < 256) {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001796 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1797 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001798 PyErr_NoMemory();
1799 return -1;
1800 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001801 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001802 _PyUnicode_WSTR(unicode), end,
1803 PyUnicode_1BYTE_DATA(unicode));
1804 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1805 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1806 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1807 if (maxchar < 128) {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001808 _PyUnicode_STATE(unicode).ascii = 1;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001809 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001810 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001811 }
1812 else {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001813 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001814 _PyUnicode_UTF8(unicode) = NULL;
1815 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001816 }
1817 PyObject_FREE(_PyUnicode_WSTR(unicode));
1818 _PyUnicode_WSTR(unicode) = NULL;
1819 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1820 }
1821 /* In this case we might have to convert down from 4-byte native
1822 wchar_t to 2-byte unicode. */
1823 else if (maxchar < 65536) {
1824 assert(num_surrogates == 0 &&
1825 "FindMaxCharAndNumSurrogatePairs() messed up");
1826
Victor Stinner506f5922011-09-28 22:34:18 +02001827#if SIZEOF_WCHAR_T == 2
1828 /* We can share representations and are done. */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001829 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Victor Stinner506f5922011-09-28 22:34:18 +02001830 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1831 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1832 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001833 _PyUnicode_UTF8(unicode) = NULL;
1834 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001835#else
1836 /* sizeof(wchar_t) == 4 */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001837 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
Victor Stinner506f5922011-09-28 22:34:18 +02001838 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
Victor Stinnerc3c74152011-10-02 20:39:55 +02001839 if (!_PyUnicode_DATA_ANY(unicode)) {
Victor Stinner506f5922011-09-28 22:34:18 +02001840 PyErr_NoMemory();
1841 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001842 }
Victor Stinner506f5922011-09-28 22:34:18 +02001843 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1844 _PyUnicode_WSTR(unicode), end,
1845 PyUnicode_2BYTE_DATA(unicode));
1846 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1847 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1848 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001849 _PyUnicode_UTF8(unicode) = NULL;
1850 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001851 PyObject_FREE(_PyUnicode_WSTR(unicode));
1852 _PyUnicode_WSTR(unicode) = NULL;
1853 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1854#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001855 }
1856 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
1857 else {
1858#if SIZEOF_WCHAR_T == 2
1859 /* in case the native representation is 2-bytes, we need to allocate a
1860 new normalized 4-byte version. */
1861 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
Serhiy Storchakae55181f2015-02-20 21:34:06 +02001862 if (length_wo_surrogates > PY_SSIZE_T_MAX / 4 - 1) {
1863 PyErr_NoMemory();
1864 return -1;
1865 }
Victor Stinnerc3c74152011-10-02 20:39:55 +02001866 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1867 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001868 PyErr_NoMemory();
1869 return -1;
1870 }
1871 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1872 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001873 _PyUnicode_UTF8(unicode) = NULL;
1874 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner126c5592011-10-03 04:17:10 +02001875 /* unicode_convert_wchar_to_ucs4() requires a ready string */
1876 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerc53be962011-10-02 21:33:54 +02001877 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001878 PyObject_FREE(_PyUnicode_WSTR(unicode));
1879 _PyUnicode_WSTR(unicode) = NULL;
1880 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1881#else
1882 assert(num_surrogates == 0);
1883
Victor Stinnerc3c74152011-10-02 20:39:55 +02001884 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001885 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001886 _PyUnicode_UTF8(unicode) = NULL;
1887 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001888 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1889#endif
1890 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1891 }
1892 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001893 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001894 return 0;
1895}
1896
Alexander Belopolsky40018472011-02-26 01:02:56 +00001897static void
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001898unicode_dealloc(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001899{
Walter Dörwald16807132007-05-25 13:52:07 +00001900 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001901 case SSTATE_NOT_INTERNED:
1902 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001903
Benjamin Peterson29060642009-01-31 22:14:21 +00001904 case SSTATE_INTERNED_MORTAL:
1905 /* revive dead object temporarily for DelItem */
Victor Stinnerc86a1122020-02-07 01:24:29 +01001906 Py_SET_REFCNT(unicode, 3);
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001907 if (PyDict_DelItem(interned, unicode) != 0) {
1908 _PyErr_WriteUnraisableMsg("deletion of interned string failed",
1909 NULL);
1910 }
Benjamin Peterson29060642009-01-31 22:14:21 +00001911 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001912
Benjamin Peterson29060642009-01-31 22:14:21 +00001913 case SSTATE_INTERNED_IMMORTAL:
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001914 _PyObject_ASSERT_FAILED_MSG(unicode, "Immortal interned string died");
1915 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001916
Benjamin Peterson29060642009-01-31 22:14:21 +00001917 default:
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001918 Py_UNREACHABLE();
Walter Dörwald16807132007-05-25 13:52:07 +00001919 }
1920
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001921 if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001922 PyObject_DEL(_PyUnicode_WSTR(unicode));
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001923 }
1924 if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) {
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001925 PyObject_DEL(_PyUnicode_UTF8(unicode));
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001926 }
1927 if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001928 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001929 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001930
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001931 Py_TYPE(unicode)->tp_free(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001932}
1933
#ifdef Py_DEBUG
/* Return 1 if `unicode` is one of the shared singletons -- the empty string
   or a one-character string stored in the unicode_latin1 table -- and 0
   otherwise. */
static int
unicode_is_singleton(PyObject *unicode)
{
    PyASCIIObject *ascii = (PyASCIIObject *)unicode;
    Py_UCS4 ch;

    if (unicode == unicode_empty) {
        return 1;
    }
    /* Only a ready string of exactly one character can be a cached one. */
    if (ascii->state.kind == PyUnicode_WCHAR_KIND || ascii->length != 1) {
        return 0;
    }
    ch = PyUnicode_READ_CHAR(unicode, 0);
    return (ch < 256 && unicode_latin1[ch] == unicode);
}
#endif
1950
Alexander Belopolsky40018472011-02-26 01:02:56 +00001951static int
Victor Stinner488fa492011-12-12 00:01:39 +01001952unicode_modifiable(PyObject *unicode)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001953{
Victor Stinner488fa492011-12-12 00:01:39 +01001954 assert(_PyUnicode_CHECK(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001955 if (Py_REFCNT(unicode) != 1)
1956 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001957 if (_PyUnicode_HASH(unicode) != -1)
1958 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001959 if (PyUnicode_CHECK_INTERNED(unicode))
1960 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001961 if (!PyUnicode_CheckExact(unicode))
1962 return 0;
Victor Stinner77bb47b2011-10-03 20:06:05 +02001963#ifdef Py_DEBUG
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001964 /* singleton refcount is greater than 1 */
1965 assert(!unicode_is_singleton(unicode));
Victor Stinner77bb47b2011-10-03 20:06:05 +02001966#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001967 return 1;
1968}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001969
Victor Stinnerfe226c02011-10-03 03:52:20 +02001970static int
1971unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1972{
1973 PyObject *unicode;
1974 Py_ssize_t old_length;
1975
1976 assert(p_unicode != NULL);
1977 unicode = *p_unicode;
1978
1979 assert(unicode != NULL);
1980 assert(PyUnicode_Check(unicode));
1981 assert(0 <= length);
1982
Victor Stinner910337b2011-10-03 03:20:16 +02001983 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001984 old_length = PyUnicode_WSTR_LENGTH(unicode);
1985 else
1986 old_length = PyUnicode_GET_LENGTH(unicode);
1987 if (old_length == length)
1988 return 0;
1989
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001990 if (length == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +02001991 _Py_INCREF_UNICODE_EMPTY();
1992 if (!unicode_empty)
Benjamin Peterson29060642009-01-31 22:14:21 +00001993 return -1;
Serhiy Storchaka57a01d32016-04-10 18:05:40 +03001994 Py_SETREF(*p_unicode, unicode_empty);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001995 return 0;
1996 }
1997
Victor Stinner488fa492011-12-12 00:01:39 +01001998 if (!unicode_modifiable(unicode)) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001999 PyObject *copy = resize_copy(unicode, length);
2000 if (copy == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002001 return -1;
Serhiy Storchaka57a01d32016-04-10 18:05:40 +03002002 Py_SETREF(*p_unicode, copy);
Benjamin Peterson29060642009-01-31 22:14:21 +00002003 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002004 }
2005
Victor Stinnerfe226c02011-10-03 03:52:20 +02002006 if (PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01002007 PyObject *new_unicode = resize_compact(unicode, length);
2008 if (new_unicode == NULL)
Victor Stinnerfe226c02011-10-03 03:52:20 +02002009 return -1;
Victor Stinnerb0a82a62011-12-12 13:08:33 +01002010 *p_unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +02002011 return 0;
Benjamin Peterson4bfce8f2011-10-03 19:35:07 -04002012 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002013 return resize_inplace(unicode, length);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002014}
2015
Alexander Belopolsky40018472011-02-26 01:02:56 +00002016int
Victor Stinnerfe226c02011-10-03 03:52:20 +02002017PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00002018{
Victor Stinnerfe226c02011-10-03 03:52:20 +02002019 PyObject *unicode;
2020 if (p_unicode == NULL) {
2021 PyErr_BadInternalCall();
2022 return -1;
2023 }
2024 unicode = *p_unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01002025 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
Victor Stinnerfe226c02011-10-03 03:52:20 +02002026 {
2027 PyErr_BadInternalCall();
2028 return -1;
2029 }
2030 return unicode_resize(p_unicode, length);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00002031}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002032
Serhiy Storchakad65c9492015-11-02 14:10:23 +02002033/* Copy an ASCII or latin1 char* string into a Python Unicode string.
Victor Stinnerc5166102012-02-22 13:55:02 +01002034
Victor Stinnerb429d3b2012-02-22 21:22:20 +01002035 WARNING: The function doesn't copy the terminating null character and
2036 doesn't check the maximum character (may write a latin1 character in an
2037 ASCII string). */
Victor Stinner184252a2012-06-16 02:57:41 +02002038static void
2039unicode_write_cstr(PyObject *unicode, Py_ssize_t index,
2040 const char *str, Py_ssize_t len)
Victor Stinnerc5166102012-02-22 13:55:02 +01002041{
2042 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
2043 void *data = PyUnicode_DATA(unicode);
Victor Stinner184252a2012-06-16 02:57:41 +02002044 const char *end = str + len;
Victor Stinnerc5166102012-02-22 13:55:02 +01002045
2046 switch (kind) {
2047 case PyUnicode_1BYTE_KIND: {
Victor Stinnerc5166102012-02-22 13:55:02 +01002048 assert(index + len <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner8c6db452012-10-06 00:40:45 +02002049#ifdef Py_DEBUG
2050 if (PyUnicode_IS_ASCII(unicode)) {
2051 Py_UCS4 maxchar = ucs1lib_find_max_char(
2052 (const Py_UCS1*)str,
2053 (const Py_UCS1*)str + len);
2054 assert(maxchar < 128);
2055 }
2056#endif
Antoine Pitrouba6bafc2012-02-22 16:41:50 +01002057 memcpy((char *) data + index, str, len);
Victor Stinner184252a2012-06-16 02:57:41 +02002058 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01002059 }
2060 case PyUnicode_2BYTE_KIND: {
2061 Py_UCS2 *start = (Py_UCS2 *)data + index;
2062 Py_UCS2 *ucs2 = start;
2063 assert(index <= PyUnicode_GET_LENGTH(unicode));
2064
Victor Stinner184252a2012-06-16 02:57:41 +02002065 for (; str < end; ++ucs2, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01002066 *ucs2 = (Py_UCS2)*str;
2067
2068 assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner184252a2012-06-16 02:57:41 +02002069 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01002070 }
2071 default: {
2072 Py_UCS4 *start = (Py_UCS4 *)data + index;
2073 Py_UCS4 *ucs4 = start;
2074 assert(kind == PyUnicode_4BYTE_KIND);
2075 assert(index <= PyUnicode_GET_LENGTH(unicode));
2076
Victor Stinner184252a2012-06-16 02:57:41 +02002077 for (; str < end; ++ucs4, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01002078 *ucs4 = (Py_UCS4)*str;
2079
2080 assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinnerc5166102012-02-22 13:55:02 +01002081 }
2082 }
2083}
2084
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002085static PyObject*
2086get_latin1_char(unsigned char ch)
2087{
Victor Stinnera464fc12011-10-02 20:39:30 +02002088 PyObject *unicode = unicode_latin1[ch];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002089 if (!unicode) {
Victor Stinnera464fc12011-10-02 20:39:30 +02002090 unicode = PyUnicode_New(1, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002091 if (!unicode)
2092 return NULL;
2093 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002094 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002095 unicode_latin1[ch] = unicode;
2096 }
2097 Py_INCREF(unicode);
Victor Stinnera464fc12011-10-02 20:39:30 +02002098 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002099}
2100
Victor Stinner985a82a2014-01-03 12:53:47 +01002101static PyObject*
2102unicode_char(Py_UCS4 ch)
2103{
2104 PyObject *unicode;
2105
2106 assert(ch <= MAX_UNICODE);
2107
Victor Stinnerf3b46b42014-01-03 13:16:00 +01002108 if (ch < 256)
2109 return get_latin1_char(ch);
2110
Victor Stinner985a82a2014-01-03 12:53:47 +01002111 unicode = PyUnicode_New(1, ch);
2112 if (unicode == NULL)
2113 return NULL;
Serhiy Storchaka2e58f1a2016-10-09 23:44:48 +03002114
2115 assert(PyUnicode_KIND(unicode) != PyUnicode_1BYTE_KIND);
2116 if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
Victor Stinner985a82a2014-01-03 12:53:47 +01002117 PyUnicode_2BYTE_DATA(unicode)[0] = (Py_UCS2)ch;
Serhiy Storchaka2e58f1a2016-10-09 23:44:48 +03002118 } else {
Victor Stinner985a82a2014-01-03 12:53:47 +01002119 assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
2120 PyUnicode_4BYTE_DATA(unicode)[0] = ch;
2121 }
2122 assert(_PyUnicode_CheckConsistency(unicode, 1));
2123 return unicode;
2124}
2125
Alexander Belopolsky40018472011-02-26 01:02:56 +00002126PyObject *
2127PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002128{
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02002129 if (u == NULL)
2130 return (PyObject*)_PyUnicode_New(size);
2131
2132 if (size < 0) {
2133 PyErr_BadInternalCall();
2134 return NULL;
2135 }
2136
2137 return PyUnicode_FromWideChar(u, size);
2138}
2139
2140PyObject *
2141PyUnicode_FromWideChar(const wchar_t *u, Py_ssize_t size)
2142{
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002143 PyObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002144 Py_UCS4 maxchar = 0;
2145 Py_ssize_t num_surrogates;
2146
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02002147 if (u == NULL && size != 0) {
2148 PyErr_BadInternalCall();
2149 return NULL;
2150 }
2151
2152 if (size == -1) {
2153 size = wcslen(u);
2154 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002155
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002156 /* If the Unicode data is known at construction time, we can apply
2157 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002158
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002159 /* Optimization for empty strings */
Serhiy Storchaka678db842013-01-26 12:16:36 +02002160 if (size == 0)
2161 _Py_RETURN_UNICODE_EMPTY();
Tim Petersced69f82003-09-16 20:30:58 +00002162
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002163 /* Single character Unicode objects in the Latin-1 range are
2164 shared when using this constructor */
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002165 if (size == 1 && (Py_UCS4)*u < 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002166 return get_latin1_char((unsigned char)*u);
2167
2168 /* If not empty and not single character, copy the Unicode data
2169 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02002170 if (find_maxchar_surrogates(u, u + size,
2171 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002172 return NULL;
2173
Victor Stinner8faf8212011-12-08 22:14:11 +01002174 unicode = PyUnicode_New(size - num_surrogates, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002175 if (!unicode)
2176 return NULL;
2177
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002178 switch (PyUnicode_KIND(unicode)) {
2179 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02002180 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002181 u, u + size, PyUnicode_1BYTE_DATA(unicode));
2182 break;
2183 case PyUnicode_2BYTE_KIND:
2184#if Py_UNICODE_SIZE == 2
Christian Heimesf051e432016-09-13 20:22:02 +02002185 memcpy(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002186#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02002187 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002188 u, u + size, PyUnicode_2BYTE_DATA(unicode));
2189#endif
2190 break;
2191 case PyUnicode_4BYTE_KIND:
2192#if SIZEOF_WCHAR_T == 2
2193 /* This is the only case which has to process surrogates, thus
2194 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02002195 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002196#else
2197 assert(num_surrogates == 0);
Christian Heimesf051e432016-09-13 20:22:02 +02002198 memcpy(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002199#endif
2200 break;
2201 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07002202 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002203 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002204
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002205 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002206}
2207
Alexander Belopolsky40018472011-02-26 01:02:56 +00002208PyObject *
2209PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00002210{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002211 if (size < 0) {
2212 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00002213 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00002214 return NULL;
2215 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002216 if (u != NULL)
2217 return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
2218 else
2219 return (PyObject *)_PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00002220}
2221
Alexander Belopolsky40018472011-02-26 01:02:56 +00002222PyObject *
2223PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00002224{
2225 size_t size = strlen(u);
2226 if (size > PY_SSIZE_T_MAX) {
2227 PyErr_SetString(PyExc_OverflowError, "input too long");
2228 return NULL;
2229 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002230 return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002231}
2232
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002233PyObject *
2234_PyUnicode_FromId(_Py_Identifier *id)
2235{
2236 if (!id->object) {
Victor Stinnerd1cd99b2012-02-07 23:05:55 +01002237 id->object = PyUnicode_DecodeUTF8Stateful(id->string,
2238 strlen(id->string),
2239 NULL, NULL);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002240 if (!id->object)
2241 return NULL;
2242 PyUnicode_InternInPlace(&id->object);
2243 assert(!id->next);
2244 id->next = static_strings;
2245 static_strings = id;
2246 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002247 return id->object;
2248}
2249
2250void
2251_PyUnicode_ClearStaticStrings()
2252{
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002253 _Py_Identifier *tmp, *s = static_strings;
2254 while (s) {
Serhiy Storchaka505ff752014-02-09 13:33:53 +02002255 Py_CLEAR(s->object);
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002256 tmp = s->next;
2257 s->next = NULL;
2258 s = tmp;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002259 }
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002260 static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002261}
2262
Benjamin Peterson0df54292012-03-26 14:50:32 -04002263/* Internal function, doesn't check maximum character */
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002264
Victor Stinnerd3f08822012-05-29 12:57:52 +02002265PyObject*
2266_PyUnicode_FromASCII(const char *buffer, Py_ssize_t size)
Victor Stinner702c7342011-10-05 13:50:52 +02002267{
Victor Stinnerd3f08822012-05-29 12:57:52 +02002268 const unsigned char *s = (const unsigned char *)buffer;
Victor Stinner785938e2011-12-11 20:09:03 +01002269 PyObject *unicode;
Victor Stinnere6b2d442011-12-11 21:54:30 +01002270 if (size == 1) {
Victor Stinner0617b6e2011-10-05 23:26:01 +02002271#ifdef Py_DEBUG
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002272 assert((unsigned char)s[0] < 128);
Victor Stinner0617b6e2011-10-05 23:26:01 +02002273#endif
Antoine Pitrou7c46da72011-10-06 22:07:51 +02002274 return get_latin1_char(s[0]);
Victor Stinnere6b2d442011-12-11 21:54:30 +01002275 }
Victor Stinner785938e2011-12-11 20:09:03 +01002276 unicode = PyUnicode_New(size, 127);
2277 if (!unicode)
Victor Stinner702c7342011-10-05 13:50:52 +02002278 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01002279 memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
2280 assert(_PyUnicode_CheckConsistency(unicode, 1));
2281 return unicode;
Victor Stinner702c7342011-10-05 13:50:52 +02002282}
2283
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002284static Py_UCS4
2285kind_maxchar_limit(unsigned int kind)
2286{
Benjamin Petersonead6b532011-12-20 17:23:42 -06002287 switch (kind) {
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002288 case PyUnicode_1BYTE_KIND:
2289 return 0x80;
2290 case PyUnicode_2BYTE_KIND:
2291 return 0x100;
2292 case PyUnicode_4BYTE_KIND:
2293 return 0x10000;
2294 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07002295 Py_UNREACHABLE();
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002296 }
2297}
2298
Victor Stinner702c7342011-10-05 13:50:52 +02002299static PyObject*
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002300_PyUnicode_FromUCS1(const Py_UCS1* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00002301{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002302 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002303 unsigned char max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002304
Serhiy Storchaka678db842013-01-26 12:16:36 +02002305 if (size == 0)
2306 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002307 assert(size > 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02002308 if (size == 1)
2309 return get_latin1_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002310
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002311 max_char = ucs1lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002312 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002313 if (!res)
2314 return NULL;
2315 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002316 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002317 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00002318}
2319
Victor Stinnere57b1c02011-09-28 22:20:48 +02002320static PyObject*
2321_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002322{
2323 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002324 Py_UCS2 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002325
Serhiy Storchaka678db842013-01-26 12:16:36 +02002326 if (size == 0)
2327 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002328 assert(size > 0);
Victor Stinner985a82a2014-01-03 12:53:47 +01002329 if (size == 1)
2330 return unicode_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002331
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002332 max_char = ucs2lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002333 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002334 if (!res)
2335 return NULL;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002336 if (max_char >= 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002337 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002338 else {
2339 _PyUnicode_CONVERT_BYTES(
2340 Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
2341 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002342 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002343 return res;
2344}
2345
Victor Stinnere57b1c02011-09-28 22:20:48 +02002346static PyObject*
2347_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002348{
2349 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002350 Py_UCS4 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002351
Serhiy Storchaka678db842013-01-26 12:16:36 +02002352 if (size == 0)
2353 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002354 assert(size > 0);
Victor Stinner985a82a2014-01-03 12:53:47 +01002355 if (size == 1)
2356 return unicode_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002357
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002358 max_char = ucs4lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002359 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002360 if (!res)
2361 return NULL;
Antoine Pitrou950468e2011-10-11 22:45:48 +02002362 if (max_char < 256)
2363 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
2364 PyUnicode_1BYTE_DATA(res));
2365 else if (max_char < 0x10000)
2366 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
2367 PyUnicode_2BYTE_DATA(res));
2368 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002369 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002370 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002371 return res;
2372}
2373
2374PyObject*
2375PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
2376{
Victor Stinnercfed46e2011-11-22 01:29:14 +01002377 if (size < 0) {
2378 PyErr_SetString(PyExc_ValueError, "size must be positive");
2379 return NULL;
2380 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002381 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002382 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002383 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002384 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002385 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002386 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002387 return _PyUnicode_FromUCS4(buffer, size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002388 default:
Victor Stinnerb9275c12011-10-05 14:01:42 +02002389 PyErr_SetString(PyExc_SystemError, "invalid kind");
2390 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002391 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002392}
2393
Victor Stinnerece58de2012-04-23 23:36:38 +02002394Py_UCS4
2395_PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end)
2396{
2397 enum PyUnicode_Kind kind;
2398 void *startptr, *endptr;
2399
2400 assert(PyUnicode_IS_READY(unicode));
2401 assert(0 <= start);
2402 assert(end <= PyUnicode_GET_LENGTH(unicode));
2403 assert(start <= end);
2404
2405 if (start == 0 && end == PyUnicode_GET_LENGTH(unicode))
2406 return PyUnicode_MAX_CHAR_VALUE(unicode);
2407
2408 if (start == end)
2409 return 127;
2410
Victor Stinner94d558b2012-04-27 22:26:58 +02002411 if (PyUnicode_IS_ASCII(unicode))
2412 return 127;
2413
Victor Stinnerece58de2012-04-23 23:36:38 +02002414 kind = PyUnicode_KIND(unicode);
Benjamin Petersonf3b7d862012-04-23 18:07:01 -04002415 startptr = PyUnicode_DATA(unicode);
Benjamin Petersonb9f4c9d2012-04-23 21:45:40 -04002416 endptr = (char *)startptr + end * kind;
2417 startptr = (char *)startptr + start * kind;
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002418 switch(kind) {
2419 case PyUnicode_1BYTE_KIND:
2420 return ucs1lib_find_max_char(startptr, endptr);
2421 case PyUnicode_2BYTE_KIND:
2422 return ucs2lib_find_max_char(startptr, endptr);
2423 case PyUnicode_4BYTE_KIND:
2424 return ucs4lib_find_max_char(startptr, endptr);
Victor Stinnerece58de2012-04-23 23:36:38 +02002425 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07002426 Py_UNREACHABLE();
Victor Stinnerece58de2012-04-23 23:36:38 +02002427 }
2428}
2429
Victor Stinner25a4b292011-10-06 12:31:55 +02002430/* Ensure that a string uses the most efficient storage, if it is not the
2431 case: create a new string with of the right kind. Write NULL into *p_unicode
2432 on error. */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02002433static void
Victor Stinner25a4b292011-10-06 12:31:55 +02002434unicode_adjust_maxchar(PyObject **p_unicode)
2435{
2436 PyObject *unicode, *copy;
2437 Py_UCS4 max_char;
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002438 Py_ssize_t len;
Victor Stinner25a4b292011-10-06 12:31:55 +02002439 unsigned int kind;
2440
2441 assert(p_unicode != NULL);
2442 unicode = *p_unicode;
2443 assert(PyUnicode_IS_READY(unicode));
2444 if (PyUnicode_IS_ASCII(unicode))
2445 return;
2446
2447 len = PyUnicode_GET_LENGTH(unicode);
2448 kind = PyUnicode_KIND(unicode);
2449 if (kind == PyUnicode_1BYTE_KIND) {
2450 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002451 max_char = ucs1lib_find_max_char(u, u + len);
2452 if (max_char >= 128)
2453 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002454 }
2455 else if (kind == PyUnicode_2BYTE_KIND) {
2456 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002457 max_char = ucs2lib_find_max_char(u, u + len);
2458 if (max_char >= 256)
2459 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002460 }
2461 else {
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002462 const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
Victor Stinner25a4b292011-10-06 12:31:55 +02002463 assert(kind == PyUnicode_4BYTE_KIND);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002464 max_char = ucs4lib_find_max_char(u, u + len);
2465 if (max_char >= 0x10000)
2466 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002467 }
Victor Stinner25a4b292011-10-06 12:31:55 +02002468 copy = PyUnicode_New(len, max_char);
Victor Stinnerca439ee2012-06-16 03:17:34 +02002469 if (copy != NULL)
2470 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, len);
Victor Stinner25a4b292011-10-06 12:31:55 +02002471 Py_DECREF(unicode);
2472 *p_unicode = copy;
2473}
2474
Victor Stinner034f6cf2011-09-30 02:26:44 +02002475PyObject*
Victor Stinnerbf6e5602011-12-12 01:53:47 +01002476_PyUnicode_Copy(PyObject *unicode)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002477{
Victor Stinner87af4f22011-11-21 23:03:47 +01002478 Py_ssize_t length;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002479 PyObject *copy;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002480
Victor Stinner034f6cf2011-09-30 02:26:44 +02002481 if (!PyUnicode_Check(unicode)) {
2482 PyErr_BadInternalCall();
2483 return NULL;
2484 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05002485 if (PyUnicode_READY(unicode) == -1)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002486 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002487
Victor Stinner87af4f22011-11-21 23:03:47 +01002488 length = PyUnicode_GET_LENGTH(unicode);
2489 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002490 if (!copy)
2491 return NULL;
2492 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
2493
Christian Heimesf051e432016-09-13 20:22:02 +02002494 memcpy(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
Victor Stinner87af4f22011-11-21 23:03:47 +01002495 length * PyUnicode_KIND(unicode));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002496 assert(_PyUnicode_CheckConsistency(copy, 1));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002497 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02002498}
2499
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002500
Victor Stinnerbc603d12011-10-02 01:00:40 +02002501/* Widen Unicode objects to larger buffers. Don't write terminating null
2502 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002503
2504void*
2505_PyUnicode_AsKind(PyObject *s, unsigned int kind)
2506{
Victor Stinnerbc603d12011-10-02 01:00:40 +02002507 Py_ssize_t len;
2508 void *result;
2509 unsigned int skind;
2510
Benjamin Petersonbac79492012-01-14 13:34:47 -05002511 if (PyUnicode_READY(s) == -1)
Victor Stinnerbc603d12011-10-02 01:00:40 +02002512 return NULL;
2513
2514 len = PyUnicode_GET_LENGTH(s);
2515 skind = PyUnicode_KIND(s);
2516 if (skind >= kind) {
Victor Stinner01698042011-10-04 00:04:26 +02002517 PyErr_SetString(PyExc_SystemError, "invalid widening attempt");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002518 return NULL;
2519 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002520 switch (kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02002521 case PyUnicode_2BYTE_KIND:
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002522 result = PyMem_New(Py_UCS2, len);
Victor Stinnerbc603d12011-10-02 01:00:40 +02002523 if (!result)
2524 return PyErr_NoMemory();
2525 assert(skind == PyUnicode_1BYTE_KIND);
2526 _PyUnicode_CONVERT_BYTES(
2527 Py_UCS1, Py_UCS2,
2528 PyUnicode_1BYTE_DATA(s),
2529 PyUnicode_1BYTE_DATA(s) + len,
2530 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002531 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002532 case PyUnicode_4BYTE_KIND:
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002533 result = PyMem_New(Py_UCS4, len);
Victor Stinnerbc603d12011-10-02 01:00:40 +02002534 if (!result)
2535 return PyErr_NoMemory();
2536 if (skind == PyUnicode_2BYTE_KIND) {
2537 _PyUnicode_CONVERT_BYTES(
2538 Py_UCS2, Py_UCS4,
2539 PyUnicode_2BYTE_DATA(s),
2540 PyUnicode_2BYTE_DATA(s) + len,
2541 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002542 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02002543 else {
2544 assert(skind == PyUnicode_1BYTE_KIND);
2545 _PyUnicode_CONVERT_BYTES(
2546 Py_UCS1, Py_UCS4,
2547 PyUnicode_1BYTE_DATA(s),
2548 PyUnicode_1BYTE_DATA(s) + len,
2549 result);
2550 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002551 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002552 default:
2553 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002554 }
Victor Stinner01698042011-10-04 00:04:26 +02002555 PyErr_SetString(PyExc_SystemError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002556 return NULL;
2557}
2558
2559static Py_UCS4*
2560as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2561 int copy_null)
2562{
2563 int kind;
2564 void *data;
2565 Py_ssize_t len, targetlen;
2566 if (PyUnicode_READY(string) == -1)
2567 return NULL;
2568 kind = PyUnicode_KIND(string);
2569 data = PyUnicode_DATA(string);
2570 len = PyUnicode_GET_LENGTH(string);
2571 targetlen = len;
2572 if (copy_null)
2573 targetlen++;
2574 if (!target) {
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002575 target = PyMem_New(Py_UCS4, targetlen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002576 if (!target) {
2577 PyErr_NoMemory();
2578 return NULL;
2579 }
2580 }
2581 else {
2582 if (targetsize < targetlen) {
2583 PyErr_Format(PyExc_SystemError,
2584 "string is longer than the buffer");
2585 if (copy_null && 0 < targetsize)
2586 target[0] = 0;
2587 return NULL;
2588 }
2589 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002590 if (kind == PyUnicode_1BYTE_KIND) {
2591 Py_UCS1 *start = (Py_UCS1 *) data;
2592 _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002593 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002594 else if (kind == PyUnicode_2BYTE_KIND) {
2595 Py_UCS2 *start = (Py_UCS2 *) data;
2596 _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2597 }
2598 else {
2599 assert(kind == PyUnicode_4BYTE_KIND);
Christian Heimesf051e432016-09-13 20:22:02 +02002600 memcpy(target, data, len * sizeof(Py_UCS4));
Antoine Pitrou950468e2011-10-11 22:45:48 +02002601 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002602 if (copy_null)
2603 target[len] = 0;
2604 return target;
2605}
2606
2607Py_UCS4*
2608PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2609 int copy_null)
2610{
Antoine Pitroude20b0b2011-11-10 21:47:38 +01002611 if (target == NULL || targetsize < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002612 PyErr_BadInternalCall();
2613 return NULL;
2614 }
2615 return as_ucs4(string, target, targetsize, copy_null);
2616}
2617
2618Py_UCS4*
2619PyUnicode_AsUCS4Copy(PyObject *string)
2620{
2621 return as_ucs4(string, NULL, 0, 1);
2622}
2623
/* Maximum number of characters required for the output of %lld or %p.
   At most ceil(log10(256) * SIZEOF_LONG_LONG) digits are needed, plus one
   character for the sign.  53/22 is used as an upper bound for log10(256);
   the "(x - 1) / 22" term below equals ceil(x / 22) - 1 in integer
   arithmetic, which is why the constant added in front is 2. */
#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG * 53 - 1) / 22)
Victor Stinner96865452011-03-01 23:44:09 +00002628
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002629static int
2630unicode_fromformat_write_str(_PyUnicodeWriter *writer, PyObject *str,
2631 Py_ssize_t width, Py_ssize_t precision)
2632{
2633 Py_ssize_t length, fill, arglen;
2634 Py_UCS4 maxchar;
2635
2636 if (PyUnicode_READY(str) == -1)
2637 return -1;
2638
2639 length = PyUnicode_GET_LENGTH(str);
2640 if ((precision == -1 || precision >= length)
2641 && width <= length)
2642 return _PyUnicodeWriter_WriteStr(writer, str);
2643
2644 if (precision != -1)
2645 length = Py_MIN(precision, length);
2646
2647 arglen = Py_MAX(length, width);
2648 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
2649 maxchar = _PyUnicode_FindMaxChar(str, 0, length);
2650 else
2651 maxchar = writer->maxchar;
2652
2653 if (_PyUnicodeWriter_Prepare(writer, arglen, maxchar) == -1)
2654 return -1;
2655
2656 if (width > length) {
2657 fill = width - length;
2658 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, ' ') == -1)
2659 return -1;
2660 writer->pos += fill;
2661 }
2662
2663 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
2664 str, 0, length);
2665 writer->pos += length;
2666 return 0;
2667}
2668
2669static int
Victor Stinner998b8062018-09-12 00:23:25 +02002670unicode_fromformat_write_cstr(_PyUnicodeWriter *writer, const char *str,
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002671 Py_ssize_t width, Py_ssize_t precision)
2672{
2673 /* UTF-8 */
2674 Py_ssize_t length;
2675 PyObject *unicode;
2676 int res;
2677
Serhiy Storchakad586ccb2019-01-12 10:30:35 +02002678 if (precision == -1) {
2679 length = strlen(str);
2680 }
2681 else {
2682 length = 0;
2683 while (length < precision && str[length]) {
2684 length++;
2685 }
2686 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002687 unicode = PyUnicode_DecodeUTF8Stateful(str, length, "replace", NULL);
2688 if (unicode == NULL)
2689 return -1;
2690
2691 res = unicode_fromformat_write_str(writer, unicode, width, -1);
2692 Py_DECREF(unicode);
2693 return res;
2694}
2695
Victor Stinner96865452011-03-01 23:44:09 +00002696static const char*
Victor Stinnere215d962012-10-06 23:03:36 +02002697unicode_fromformat_arg(_PyUnicodeWriter *writer,
2698 const char *f, va_list *vargs)
Victor Stinner96865452011-03-01 23:44:09 +00002699{
Victor Stinnere215d962012-10-06 23:03:36 +02002700 const char *p;
2701 Py_ssize_t len;
2702 int zeropad;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002703 Py_ssize_t width;
2704 Py_ssize_t precision;
Victor Stinnere215d962012-10-06 23:03:36 +02002705 int longflag;
2706 int longlongflag;
2707 int size_tflag;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002708 Py_ssize_t fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002709
2710 p = f;
2711 f++;
Victor Stinner4c63a972012-10-06 23:55:33 +02002712 zeropad = 0;
2713 if (*f == '0') {
2714 zeropad = 1;
2715 f++;
2716 }
Victor Stinner96865452011-03-01 23:44:09 +00002717
2718 /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002719 width = -1;
2720 if (Py_ISDIGIT((unsigned)*f)) {
2721 width = *f - '0';
Victor Stinner96865452011-03-01 23:44:09 +00002722 f++;
Victor Stinnere215d962012-10-06 23:03:36 +02002723 while (Py_ISDIGIT((unsigned)*f)) {
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002724 if (width > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
Victor Stinner3921e902012-10-06 23:05:00 +02002725 PyErr_SetString(PyExc_ValueError,
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002726 "width too big");
Victor Stinner3921e902012-10-06 23:05:00 +02002727 return NULL;
2728 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002729 width = (width * 10) + (*f - '0');
Victor Stinnere215d962012-10-06 23:03:36 +02002730 f++;
2731 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002732 }
2733 precision = -1;
2734 if (*f == '.') {
2735 f++;
2736 if (Py_ISDIGIT((unsigned)*f)) {
2737 precision = (*f - '0');
2738 f++;
2739 while (Py_ISDIGIT((unsigned)*f)) {
2740 if (precision > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
2741 PyErr_SetString(PyExc_ValueError,
2742 "precision too big");
2743 return NULL;
2744 }
2745 precision = (precision * 10) + (*f - '0');
2746 f++;
2747 }
2748 }
Victor Stinner96865452011-03-01 23:44:09 +00002749 if (*f == '%') {
2750 /* "%.3%s" => f points to "3" */
2751 f--;
2752 }
2753 }
2754 if (*f == '\0') {
Victor Stinnere215d962012-10-06 23:03:36 +02002755 /* bogus format "%.123" => go backward, f points to "3" */
Victor Stinner96865452011-03-01 23:44:09 +00002756 f--;
2757 }
Victor Stinner96865452011-03-01 23:44:09 +00002758
2759 /* Handle %ld, %lu, %lld and %llu. */
2760 longflag = 0;
2761 longlongflag = 0;
Victor Stinnere7faec12011-03-02 00:01:53 +00002762 size_tflag = 0;
Victor Stinner96865452011-03-01 23:44:09 +00002763 if (*f == 'l') {
Victor Stinner6d970f42011-03-02 00:04:25 +00002764 if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
Victor Stinner96865452011-03-01 23:44:09 +00002765 longflag = 1;
2766 ++f;
2767 }
Victor Stinner96865452011-03-01 23:44:09 +00002768 else if (f[1] == 'l' &&
Victor Stinner6d970f42011-03-02 00:04:25 +00002769 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002770 longlongflag = 1;
2771 f += 2;
2772 }
Victor Stinner96865452011-03-01 23:44:09 +00002773 }
2774 /* handle the size_t flag. */
Victor Stinner6d970f42011-03-02 00:04:25 +00002775 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002776 size_tflag = 1;
2777 ++f;
2778 }
Victor Stinnere215d962012-10-06 23:03:36 +02002779
2780 if (f[1] == '\0')
2781 writer->overallocate = 0;
2782
2783 switch (*f) {
2784 case 'c':
2785 {
2786 int ordinal = va_arg(*vargs, int);
Victor Stinnerff5a8482012-10-06 23:05:45 +02002787 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Serhiy Storchakac89533f2013-06-23 20:21:16 +03002788 PyErr_SetString(PyExc_OverflowError,
Victor Stinnerff5a8482012-10-06 23:05:45 +02002789 "character argument not in range(0x110000)");
2790 return NULL;
2791 }
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02002792 if (_PyUnicodeWriter_WriteCharInline(writer, ordinal) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002793 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002794 break;
2795 }
2796
2797 case 'i':
2798 case 'd':
2799 case 'u':
2800 case 'x':
2801 {
2802 /* used by sprintf */
Victor Stinner15a11362012-10-06 23:48:20 +02002803 char buffer[MAX_LONG_LONG_CHARS];
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002804 Py_ssize_t arglen;
Victor Stinnere215d962012-10-06 23:03:36 +02002805
2806 if (*f == 'u') {
Victor Stinnere215d962012-10-06 23:03:36 +02002807 if (longflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002808 len = sprintf(buffer, "%lu",
Victor Stinnere215d962012-10-06 23:03:36 +02002809 va_arg(*vargs, unsigned long));
Victor Stinnere215d962012-10-06 23:03:36 +02002810 else if (longlongflag)
Benjamin Peterson47ff0732016-09-08 09:15:54 -07002811 len = sprintf(buffer, "%llu",
Benjamin Petersonaf580df2016-09-06 10:46:49 -07002812 va_arg(*vargs, unsigned long long));
Victor Stinnere215d962012-10-06 23:03:36 +02002813 else if (size_tflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002814 len = sprintf(buffer, "%" PY_FORMAT_SIZE_T "u",
Victor Stinnere215d962012-10-06 23:03:36 +02002815 va_arg(*vargs, size_t));
2816 else
Victor Stinner3aa979e2014-11-18 21:40:51 +01002817 len = sprintf(buffer, "%u",
Victor Stinnere215d962012-10-06 23:03:36 +02002818 va_arg(*vargs, unsigned int));
2819 }
2820 else if (*f == 'x') {
Victor Stinner3aa979e2014-11-18 21:40:51 +01002821 len = sprintf(buffer, "%x", va_arg(*vargs, int));
Victor Stinnere215d962012-10-06 23:03:36 +02002822 }
2823 else {
Victor Stinnere215d962012-10-06 23:03:36 +02002824 if (longflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002825 len = sprintf(buffer, "%li",
Victor Stinnere215d962012-10-06 23:03:36 +02002826 va_arg(*vargs, long));
Victor Stinnere215d962012-10-06 23:03:36 +02002827 else if (longlongflag)
Benjamin Peterson47ff0732016-09-08 09:15:54 -07002828 len = sprintf(buffer, "%lli",
Benjamin Petersonaf580df2016-09-06 10:46:49 -07002829 va_arg(*vargs, long long));
Victor Stinnere215d962012-10-06 23:03:36 +02002830 else if (size_tflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002831 len = sprintf(buffer, "%" PY_FORMAT_SIZE_T "i",
Victor Stinnere215d962012-10-06 23:03:36 +02002832 va_arg(*vargs, Py_ssize_t));
2833 else
Victor Stinner3aa979e2014-11-18 21:40:51 +01002834 len = sprintf(buffer, "%i",
Victor Stinnere215d962012-10-06 23:03:36 +02002835 va_arg(*vargs, int));
2836 }
2837 assert(len >= 0);
2838
Victor Stinnere215d962012-10-06 23:03:36 +02002839 if (precision < len)
2840 precision = len;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002841
2842 arglen = Py_MAX(precision, width);
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002843 if (_PyUnicodeWriter_Prepare(writer, arglen, 127) == -1)
2844 return NULL;
2845
Victor Stinnere215d962012-10-06 23:03:36 +02002846 if (width > precision) {
2847 Py_UCS4 fillchar;
2848 fill = width - precision;
2849 fillchar = zeropad?'0':' ';
Victor Stinner15a11362012-10-06 23:48:20 +02002850 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, fillchar) == -1)
2851 return NULL;
2852 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002853 }
Victor Stinner15a11362012-10-06 23:48:20 +02002854 if (precision > len) {
Victor Stinnere215d962012-10-06 23:03:36 +02002855 fill = precision - len;
Victor Stinner15a11362012-10-06 23:48:20 +02002856 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, '0') == -1)
2857 return NULL;
2858 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002859 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002860
Victor Stinner4a587072013-11-19 12:54:53 +01002861 if (_PyUnicodeWriter_WriteASCIIString(writer, buffer, len) < 0)
2862 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002863 break;
2864 }
2865
2866 case 'p':
2867 {
2868 char number[MAX_LONG_LONG_CHARS];
2869
2870 len = sprintf(number, "%p", va_arg(*vargs, void*));
2871 assert(len >= 0);
2872
2873 /* %p is ill-defined: ensure leading 0x. */
2874 if (number[1] == 'X')
2875 number[1] = 'x';
2876 else if (number[1] != 'x') {
2877 memmove(number + 2, number,
2878 strlen(number) + 1);
2879 number[0] = '0';
2880 number[1] = 'x';
2881 len += 2;
2882 }
2883
Victor Stinner4a587072013-11-19 12:54:53 +01002884 if (_PyUnicodeWriter_WriteASCIIString(writer, number, len) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002885 return NULL;
2886 break;
2887 }
2888
2889 case 's':
2890 {
2891 /* UTF-8 */
2892 const char *s = va_arg(*vargs, const char*);
Victor Stinner998b8062018-09-12 00:23:25 +02002893 if (unicode_fromformat_write_cstr(writer, s, width, precision) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002894 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002895 break;
2896 }
2897
2898 case 'U':
2899 {
2900 PyObject *obj = va_arg(*vargs, PyObject *);
2901 assert(obj && _PyUnicode_CHECK(obj));
2902
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002903 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002904 return NULL;
2905 break;
2906 }
2907
2908 case 'V':
2909 {
2910 PyObject *obj = va_arg(*vargs, PyObject *);
2911 const char *str = va_arg(*vargs, const char *);
Victor Stinnere215d962012-10-06 23:03:36 +02002912 if (obj) {
2913 assert(_PyUnicode_CHECK(obj));
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002914 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002915 return NULL;
2916 }
2917 else {
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002918 assert(str != NULL);
Victor Stinner998b8062018-09-12 00:23:25 +02002919 if (unicode_fromformat_write_cstr(writer, str, width, precision) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002920 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002921 }
2922 break;
2923 }
2924
2925 case 'S':
2926 {
2927 PyObject *obj = va_arg(*vargs, PyObject *);
2928 PyObject *str;
2929 assert(obj);
2930 str = PyObject_Str(obj);
2931 if (!str)
2932 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002933 if (unicode_fromformat_write_str(writer, str, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002934 Py_DECREF(str);
2935 return NULL;
2936 }
2937 Py_DECREF(str);
2938 break;
2939 }
2940
2941 case 'R':
2942 {
2943 PyObject *obj = va_arg(*vargs, PyObject *);
2944 PyObject *repr;
2945 assert(obj);
2946 repr = PyObject_Repr(obj);
2947 if (!repr)
2948 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002949 if (unicode_fromformat_write_str(writer, repr, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002950 Py_DECREF(repr);
2951 return NULL;
2952 }
2953 Py_DECREF(repr);
2954 break;
2955 }
2956
2957 case 'A':
2958 {
2959 PyObject *obj = va_arg(*vargs, PyObject *);
2960 PyObject *ascii;
2961 assert(obj);
2962 ascii = PyObject_ASCII(obj);
2963 if (!ascii)
2964 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002965 if (unicode_fromformat_write_str(writer, ascii, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002966 Py_DECREF(ascii);
2967 return NULL;
2968 }
2969 Py_DECREF(ascii);
2970 break;
2971 }
2972
2973 case '%':
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02002974 if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002975 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002976 break;
2977
2978 default:
2979 /* if we stumble upon an unknown formatting code, copy the rest
2980 of the format string to the output string. (we cannot just
2981 skip the code, since there's no way to know what's in the
2982 argument list) */
2983 len = strlen(p);
Victor Stinner4a587072013-11-19 12:54:53 +01002984 if (_PyUnicodeWriter_WriteLatin1String(writer, p, len) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002985 return NULL;
2986 f = p+len;
2987 return f;
2988 }
2989
2990 f++;
Victor Stinner96865452011-03-01 23:44:09 +00002991 return f;
2992}
2993
Walter Dörwaldd2034312007-05-18 16:29:38 +00002994PyObject *
2995PyUnicode_FromFormatV(const char *format, va_list vargs)
2996{
Victor Stinnere215d962012-10-06 23:03:36 +02002997 va_list vargs2;
2998 const char *f;
2999 _PyUnicodeWriter writer;
Walter Dörwaldd2034312007-05-18 16:29:38 +00003000
Victor Stinner8f674cc2013-04-17 23:02:17 +02003001 _PyUnicodeWriter_Init(&writer);
3002 writer.min_length = strlen(format) + 100;
3003 writer.overallocate = 1;
Victor Stinnere215d962012-10-06 23:03:36 +02003004
Benjamin Peterson0c212142016-09-20 20:39:33 -07003005 // Copy varags to be able to pass a reference to a subfunction.
3006 va_copy(vargs2, vargs);
Victor Stinnere215d962012-10-06 23:03:36 +02003007
3008 for (f = format; *f; ) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00003009 if (*f == '%') {
Victor Stinnere215d962012-10-06 23:03:36 +02003010 f = unicode_fromformat_arg(&writer, f, &vargs2);
3011 if (f == NULL)
3012 goto fail;
Victor Stinner1205f272010-09-11 00:54:47 +00003013 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003014 else {
Victor Stinnere215d962012-10-06 23:03:36 +02003015 const char *p;
3016 Py_ssize_t len;
Walter Dörwaldd2034312007-05-18 16:29:38 +00003017
Victor Stinnere215d962012-10-06 23:03:36 +02003018 p = f;
3019 do
3020 {
3021 if ((unsigned char)*p > 127) {
3022 PyErr_Format(PyExc_ValueError,
3023 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
3024 "string, got a non-ASCII byte: 0x%02x",
3025 (unsigned char)*p);
Victor Stinner1ddf53d2016-09-21 14:13:14 +02003026 goto fail;
Victor Stinnere215d962012-10-06 23:03:36 +02003027 }
3028 p++;
3029 }
3030 while (*p != '\0' && *p != '%');
3031 len = p - f;
3032
3033 if (*p == '\0')
3034 writer.overallocate = 0;
Victor Stinner4a587072013-11-19 12:54:53 +01003035
3036 if (_PyUnicodeWriter_WriteASCIIString(&writer, f, len) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02003037 goto fail;
Victor Stinnere215d962012-10-06 23:03:36 +02003038
3039 f = p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00003040 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00003041 }
Christian Heimes2f2fee12016-09-21 11:37:27 +02003042 va_end(vargs2);
Victor Stinnere215d962012-10-06 23:03:36 +02003043 return _PyUnicodeWriter_Finish(&writer);
3044
3045 fail:
Christian Heimes2f2fee12016-09-21 11:37:27 +02003046 va_end(vargs2);
Victor Stinnere215d962012-10-06 23:03:36 +02003047 _PyUnicodeWriter_Dealloc(&writer);
Benjamin Peterson14339b62009-01-31 16:36:08 +00003048 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00003049}
3050
Walter Dörwaldd2034312007-05-18 16:29:38 +00003051PyObject *
3052PyUnicode_FromFormat(const char *format, ...)
3053{
Benjamin Peterson14339b62009-01-31 16:36:08 +00003054 PyObject* ret;
3055 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00003056
3057#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00003058 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00003059#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00003060 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00003061#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00003062 ret = PyUnicode_FromFormatV(format, vargs);
3063 va_end(vargs);
3064 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00003065}
3066
Serhiy Storchakac46db922018-10-23 22:58:24 +03003067static Py_ssize_t
3068unicode_get_widechar_size(PyObject *unicode)
3069{
3070 Py_ssize_t res;
3071
3072 assert(unicode != NULL);
3073 assert(_PyUnicode_CHECK(unicode));
3074
3075 if (_PyUnicode_WSTR(unicode) != NULL) {
3076 return PyUnicode_WSTR_LENGTH(unicode);
3077 }
3078 assert(PyUnicode_IS_READY(unicode));
3079
3080 res = _PyUnicode_LENGTH(unicode);
3081#if SIZEOF_WCHAR_T == 2
3082 if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
3083 const Py_UCS4 *s = PyUnicode_4BYTE_DATA(unicode);
3084 const Py_UCS4 *end = s + res;
3085 for (; s < end; ++s) {
3086 if (*s > 0xFFFF) {
3087 ++res;
3088 }
3089 }
3090 }
3091#endif
3092 return res;
3093}
3094
3095static void
3096unicode_copy_as_widechar(PyObject *unicode, wchar_t *w, Py_ssize_t size)
3097{
3098 const wchar_t *wstr;
3099
3100 assert(unicode != NULL);
3101 assert(_PyUnicode_CHECK(unicode));
3102
3103 wstr = _PyUnicode_WSTR(unicode);
3104 if (wstr != NULL) {
3105 memcpy(w, wstr, size * sizeof(wchar_t));
3106 return;
3107 }
3108 assert(PyUnicode_IS_READY(unicode));
3109
3110 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
3111 const Py_UCS1 *s = PyUnicode_1BYTE_DATA(unicode);
3112 for (; size--; ++s, ++w) {
3113 *w = *s;
3114 }
3115 }
3116 else {
3117#if SIZEOF_WCHAR_T == 4
3118 assert(PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND);
3119 const Py_UCS2 *s = PyUnicode_2BYTE_DATA(unicode);
3120 for (; size--; ++s, ++w) {
3121 *w = *s;
3122 }
3123#else
3124 assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
3125 const Py_UCS4 *s = PyUnicode_4BYTE_DATA(unicode);
3126 for (; size--; ++s, ++w) {
3127 Py_UCS4 ch = *s;
3128 if (ch > 0xFFFF) {
3129 assert(ch <= MAX_UNICODE);
3130 /* encode surrogate pair in this case */
3131 *w++ = Py_UNICODE_HIGH_SURROGATE(ch);
3132 if (!size--)
3133 break;
3134 *w = Py_UNICODE_LOW_SURROGATE(ch);
3135 }
3136 else {
3137 *w = ch;
3138 }
3139 }
3140#endif
3141 }
3142}
3143
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003144#ifdef HAVE_WCHAR_H
3145
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003146/* Convert a Unicode object to a wide character string.
Victor Stinner5593d8a2010-10-02 11:11:27 +00003147
Victor Stinnerd88d9832011-09-06 02:00:05 +02003148 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00003149 character) required to convert the unicode object. Ignore size argument.
3150
Victor Stinnerd88d9832011-09-06 02:00:05 +02003151 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00003152 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02003153 the null character). */
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003154Py_ssize_t
3155PyUnicode_AsWideChar(PyObject *unicode,
3156 wchar_t *w,
3157 Py_ssize_t size)
Victor Stinner137c34c2010-09-29 10:25:54 +00003158{
Victor Stinner5593d8a2010-10-02 11:11:27 +00003159 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003160
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003161 if (unicode == NULL) {
3162 PyErr_BadInternalCall();
3163 return -1;
3164 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003165 if (!PyUnicode_Check(unicode)) {
3166 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003167 return -1;
Victor Stinner5593d8a2010-10-02 11:11:27 +00003168 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003169
3170 res = unicode_get_widechar_size(unicode);
3171 if (w == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003172 return res + 1;
Serhiy Storchakac46db922018-10-23 22:58:24 +03003173 }
3174
3175 if (size > res) {
3176 size = res + 1;
3177 }
3178 else {
3179 res = size;
3180 }
3181 unicode_copy_as_widechar(unicode, w, size);
3182 return res;
Victor Stinner137c34c2010-09-29 10:25:54 +00003183}
3184
Victor Stinner137c34c2010-09-29 10:25:54 +00003185wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00003186PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00003187 Py_ssize_t *size)
3188{
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003189 wchar_t *buffer;
Victor Stinner137c34c2010-09-29 10:25:54 +00003190 Py_ssize_t buflen;
3191
3192 if (unicode == NULL) {
3193 PyErr_BadInternalCall();
3194 return NULL;
3195 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003196 if (!PyUnicode_Check(unicode)) {
3197 PyErr_BadArgument();
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003198 return NULL;
3199 }
3200
Serhiy Storchakac46db922018-10-23 22:58:24 +03003201 buflen = unicode_get_widechar_size(unicode);
3202 buffer = (wchar_t *) PyMem_NEW(wchar_t, (buflen + 1));
Victor Stinner137c34c2010-09-29 10:25:54 +00003203 if (buffer == NULL) {
3204 PyErr_NoMemory();
3205 return NULL;
3206 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003207 unicode_copy_as_widechar(unicode, buffer, buflen + 1);
3208 if (size != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00003209 *size = buflen;
Serhiy Storchakac46db922018-10-23 22:58:24 +03003210 }
3211 else if (wcslen(buffer) != (size_t)buflen) {
3212 PyMem_FREE(buffer);
3213 PyErr_SetString(PyExc_ValueError,
3214 "embedded null character");
3215 return NULL;
3216 }
Victor Stinner137c34c2010-09-29 10:25:54 +00003217 return buffer;
3218}
3219
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003220#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003221
Alexander Belopolsky40018472011-02-26 01:02:56 +00003222PyObject *
3223PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003224{
Victor Stinner8faf8212011-12-08 22:14:11 +01003225 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003226 PyErr_SetString(PyExc_ValueError,
3227 "chr() arg not in range(0x110000)");
3228 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003229 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00003230
Victor Stinner985a82a2014-01-03 12:53:47 +01003231 return unicode_char((Py_UCS4)ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003232}
3233
Alexander Belopolsky40018472011-02-26 01:02:56 +00003234PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02003235PyUnicode_FromObject(PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003236{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003237 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00003238 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003239 if (PyUnicode_CheckExact(obj)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05003240 if (PyUnicode_READY(obj) == -1)
Victor Stinnerd3a83d52011-10-01 03:09:33 +02003241 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00003242 Py_INCREF(obj);
3243 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003244 }
3245 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003246 /* For a Unicode subtype that's not a Unicode object,
3247 return a true Unicode object with the same data. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +01003248 return _PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003249 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003250 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003251 "Can't convert '%.100s' object to str implicitly",
3252 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00003253 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003254}
3255
Alexander Belopolsky40018472011-02-26 01:02:56 +00003256PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02003257PyUnicode_FromEncodedObject(PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003258 const char *encoding,
3259 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003260{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003261 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003262 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00003263
Guido van Rossumd57fd912000-03-10 22:53:23 +00003264 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003265 PyErr_BadInternalCall();
3266 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003267 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003268
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003269 /* Decoding bytes objects is the most common case and should be fast */
3270 if (PyBytes_Check(obj)) {
Victor Stinner22eb6892019-06-26 00:51:05 +02003271 if (PyBytes_GET_SIZE(obj) == 0) {
3272 if (unicode_check_encoding_errors(encoding, errors) < 0) {
3273 return NULL;
3274 }
Serhiy Storchaka05997252013-01-26 12:14:02 +02003275 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner22eb6892019-06-26 00:51:05 +02003276 }
3277 return PyUnicode_Decode(
Serhiy Storchaka05997252013-01-26 12:14:02 +02003278 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
3279 encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003280 }
3281
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003282 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003283 PyErr_SetString(PyExc_TypeError,
3284 "decoding str is not supported");
3285 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00003286 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003287
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003288 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
3289 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
3290 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003291 "decoding to str: need a bytes-like object, %.80s found",
3292 Py_TYPE(obj)->tp_name);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003293 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00003294 }
Tim Petersced69f82003-09-16 20:30:58 +00003295
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003296 if (buffer.len == 0) {
Serhiy Storchaka05997252013-01-26 12:14:02 +02003297 PyBuffer_Release(&buffer);
Victor Stinner22eb6892019-06-26 00:51:05 +02003298 if (unicode_check_encoding_errors(encoding, errors) < 0) {
3299 return NULL;
3300 }
Serhiy Storchaka05997252013-01-26 12:14:02 +02003301 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +00003302 }
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00003303
Serhiy Storchaka05997252013-01-26 12:14:02 +02003304 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003305 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003306 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003307}
3308
/* Normalize an encoding name: similar to encodings.normalize_encoding(), but
   also convert to lowercase.

   Alphanumeric characters and '.' are copied in lowercase; every run of
   other characters is replaced by a single '_', except at the very start
   (nothing written yet) and at the end of the name, where it is dropped.

   Return 1 on success, or 0 on error (encoding is
   longer than lower_len-1). */
int
_Py_normalize_encoding(const char *encoding,
                       char *lower,
                       size_t lower_len)
{
    const char *src;
    char *dst;
    char *dst_end;
    int pending_sep = 0;     /* a separator run was seen and not yet emitted */

    assert(encoding != NULL);

    dst = lower;
    dst_end = &lower[lower_len - 1];   /* slot reserved for the final '\0' */

    for (src = encoding; *src != 0; src++) {
        char c = *src;

        if (!(Py_ISALNUM(c) || c == '.')) {
            pending_sep = 1;
            continue;
        }

        if (pending_sep && dst != lower) {
            if (dst == dst_end) {
                return 0;
            }
            *dst++ = '_';
        }
        pending_sep = 0;

        if (dst == dst_end) {
            return 0;
        }
        *dst++ = Py_TOLOWER(c);
    }

    *dst = '\0';
    return 1;
}
3357
Alexander Belopolsky40018472011-02-26 01:02:56 +00003358PyObject *
3359PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003360 Py_ssize_t size,
3361 const char *encoding,
3362 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00003363{
3364 PyObject *buffer = NULL, *unicode;
3365 Py_buffer info;
Victor Stinner942889a2016-09-05 15:40:10 -07003366 char buflower[11]; /* strlen("iso-8859-1\0") == 11, longest shortcut */
3367
Victor Stinner22eb6892019-06-26 00:51:05 +02003368 if (unicode_check_encoding_errors(encoding, errors) < 0) {
3369 return NULL;
3370 }
3371
Victor Stinnered076ed2019-06-26 01:49:32 +02003372 if (size == 0) {
3373 _Py_RETURN_UNICODE_EMPTY();
3374 }
3375
Victor Stinner942889a2016-09-05 15:40:10 -07003376 if (encoding == NULL) {
3377 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3378 }
Victor Stinner600d3be2010-06-10 12:00:55 +00003379
Fred Drakee4315f52000-05-09 19:53:39 +00003380 /* Shortcuts for common default encodings */
Victor Stinner942889a2016-09-05 15:40:10 -07003381 if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3382 char *lower = buflower;
3383
3384 /* Fast paths */
3385 if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3386 lower += 3;
3387 if (*lower == '_') {
3388 /* Match "utf8" and "utf_8" */
3389 lower++;
3390 }
3391
3392 if (lower[0] == '8' && lower[1] == 0) {
3393 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3394 }
3395 else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3396 return PyUnicode_DecodeUTF16(s, size, errors, 0);
3397 }
3398 else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3399 return PyUnicode_DecodeUTF32(s, size, errors, 0);
3400 }
3401 }
3402 else {
3403 if (strcmp(lower, "ascii") == 0
3404 || strcmp(lower, "us_ascii") == 0) {
3405 return PyUnicode_DecodeASCII(s, size, errors);
3406 }
Steve Dowercc16be82016-09-08 10:35:16 -07003407 #ifdef MS_WINDOWS
Victor Stinner942889a2016-09-05 15:40:10 -07003408 else if (strcmp(lower, "mbcs") == 0) {
3409 return PyUnicode_DecodeMBCS(s, size, errors);
3410 }
3411 #endif
3412 else if (strcmp(lower, "latin1") == 0
3413 || strcmp(lower, "latin_1") == 0
3414 || strcmp(lower, "iso_8859_1") == 0
3415 || strcmp(lower, "iso8859_1") == 0) {
3416 return PyUnicode_DecodeLatin1(s, size, errors);
3417 }
3418 }
Victor Stinner37296e82010-06-10 13:36:23 +00003419 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003420
3421 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003422 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00003423 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003424 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00003425 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003426 if (buffer == NULL)
3427 goto onError;
Nick Coghlanc72e4e62013-11-22 22:39:36 +10003428 unicode = _PyCodec_DecodeText(buffer, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003429 if (unicode == NULL)
3430 goto onError;
3431 if (!PyUnicode_Check(unicode)) {
3432 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003433 "'%.400s' decoder returned '%.400s' instead of 'str'; "
Nick Coghlan8b097b42013-11-13 23:49:21 +10003434 "use codecs.decode() to decode to arbitrary types",
Victor Stinner998b8062018-09-12 00:23:25 +02003435 encoding,
3436 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003437 Py_DECREF(unicode);
3438 goto onError;
3439 }
3440 Py_DECREF(buffer);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003441 return unicode_result(unicode);
Tim Petersced69f82003-09-16 20:30:58 +00003442
Benjamin Peterson29060642009-01-31 22:14:21 +00003443 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003444 Py_XDECREF(buffer);
3445 return NULL;
3446}
3447
Alexander Belopolsky40018472011-02-26 01:02:56 +00003448PyObject *
3449PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003450 const char *encoding,
3451 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003452{
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003453 if (!PyUnicode_Check(unicode)) {
3454 PyErr_BadArgument();
Serhiy Storchaka77eede32016-10-25 10:07:51 +03003455 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003456 }
3457
Serhiy Storchaka00939072016-10-27 21:05:49 +03003458 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3459 "PyUnicode_AsDecodedObject() is deprecated; "
3460 "use PyCodec_Decode() to decode from str", 1) < 0)
3461 return NULL;
3462
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003463 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003464 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003465
3466 /* Decode via the codec registry */
Serhiy Storchaka77eede32016-10-25 10:07:51 +03003467 return PyCodec_Decode(unicode, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003468}
3469
Alexander Belopolsky40018472011-02-26 01:02:56 +00003470PyObject *
3471PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003472 const char *encoding,
3473 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003474{
3475 PyObject *v;
3476
3477 if (!PyUnicode_Check(unicode)) {
3478 PyErr_BadArgument();
3479 goto onError;
3480 }
3481
Serhiy Storchaka00939072016-10-27 21:05:49 +03003482 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3483 "PyUnicode_AsDecodedUnicode() is deprecated; "
3484 "use PyCodec_Decode() to decode from str to str", 1) < 0)
3485 return NULL;
3486
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003487 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003488 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003489
3490 /* Decode via the codec registry */
3491 v = PyCodec_Decode(unicode, encoding, errors);
3492 if (v == NULL)
3493 goto onError;
3494 if (!PyUnicode_Check(v)) {
3495 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003496 "'%.400s' decoder returned '%.400s' instead of 'str'; "
Nick Coghlan8b097b42013-11-13 23:49:21 +10003497 "use codecs.decode() to decode to arbitrary types",
Victor Stinner998b8062018-09-12 00:23:25 +02003498 encoding,
3499 Py_TYPE(unicode)->tp_name);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003500 Py_DECREF(v);
3501 goto onError;
3502 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003503 return unicode_result(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003504
Benjamin Peterson29060642009-01-31 22:14:21 +00003505 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003506 return NULL;
3507}
3508
Alexander Belopolsky40018472011-02-26 01:02:56 +00003509PyObject *
3510PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003511 Py_ssize_t size,
3512 const char *encoding,
3513 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003514{
3515 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00003516
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02003517 unicode = PyUnicode_FromWideChar(s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003518 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003519 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003520 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
3521 Py_DECREF(unicode);
3522 return v;
3523}
3524
Alexander Belopolsky40018472011-02-26 01:02:56 +00003525PyObject *
3526PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003527 const char *encoding,
3528 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003529{
3530 PyObject *v;
3531
3532 if (!PyUnicode_Check(unicode)) {
3533 PyErr_BadArgument();
3534 goto onError;
3535 }
3536
Serhiy Storchaka00939072016-10-27 21:05:49 +03003537 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3538 "PyUnicode_AsEncodedObject() is deprecated; "
3539 "use PyUnicode_AsEncodedString() to encode from str to bytes "
3540 "or PyCodec_Encode() for generic encoding", 1) < 0)
3541 return NULL;
3542
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003543 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003544 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003545
3546 /* Encode via the codec registry */
3547 v = PyCodec_Encode(unicode, encoding, errors);
3548 if (v == NULL)
3549 goto onError;
3550 return v;
3551
Benjamin Peterson29060642009-01-31 22:14:21 +00003552 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003553 return NULL;
3554}
3555
Victor Stinner1b579672011-12-17 05:47:23 +01003556
Victor Stinner2cba6b82018-01-10 22:46:15 +01003557static PyObject *
Victor Stinner709d23d2019-05-02 14:56:30 -04003558unicode_encode_locale(PyObject *unicode, _Py_error_handler error_handler,
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003559 int current_locale)
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003560{
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003561 Py_ssize_t wlen;
3562 wchar_t *wstr = PyUnicode_AsWideCharString(unicode, &wlen);
3563 if (wstr == NULL) {
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003564 return NULL;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003565 }
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003566
Victor Stinnerbde9d6b2018-11-28 10:26:20 +01003567 if ((size_t)wlen != wcslen(wstr)) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003568 PyErr_SetString(PyExc_ValueError, "embedded null character");
Victor Stinnerbde9d6b2018-11-28 10:26:20 +01003569 PyMem_Free(wstr);
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003570 return NULL;
3571 }
3572
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003573 char *str;
3574 size_t error_pos;
3575 const char *reason;
3576 int res = _Py_EncodeLocaleEx(wstr, &str, &error_pos, &reason,
Victor Stinner3d4226a2018-08-29 22:21:32 +02003577 current_locale, error_handler);
Victor Stinnerbde9d6b2018-11-28 10:26:20 +01003578 PyMem_Free(wstr);
3579
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003580 if (res != 0) {
3581 if (res == -2) {
3582 PyObject *exc;
3583 exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnns",
3584 "locale", unicode,
3585 (Py_ssize_t)error_pos,
3586 (Py_ssize_t)(error_pos+1),
3587 reason);
3588 if (exc != NULL) {
3589 PyCodec_StrictErrors(exc);
3590 Py_DECREF(exc);
3591 }
Victor Stinner2cba6b82018-01-10 22:46:15 +01003592 }
Victor Stinner3d4226a2018-08-29 22:21:32 +02003593 else if (res == -3) {
3594 PyErr_SetString(PyExc_ValueError, "unsupported error handler");
3595 }
Victor Stinner2cba6b82018-01-10 22:46:15 +01003596 else {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003597 PyErr_NoMemory();
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003598 }
Victor Stinnerbde9d6b2018-11-28 10:26:20 +01003599 return NULL;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003600 }
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003601
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003602 PyObject *bytes = PyBytes_FromString(str);
3603 PyMem_RawFree(str);
3604 return bytes;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003605}
3606
Victor Stinnerad158722010-10-27 00:25:46 +00003607PyObject *
Victor Stinner2cba6b82018-01-10 22:46:15 +01003608PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
3609{
Victor Stinner709d23d2019-05-02 14:56:30 -04003610 _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
3611 return unicode_encode_locale(unicode, error_handler, 1);
Victor Stinner2cba6b82018-01-10 22:46:15 +01003612}
3613
3614PyObject *
Victor Stinnerad158722010-10-27 00:25:46 +00003615PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00003616{
Victor Stinnercaba55b2018-08-03 15:33:52 +02003617 PyInterpreterState *interp = _PyInterpreterState_GET_UNSAFE();
Victor Stinnerbf305cc2020-02-05 17:39:57 +01003618 if (interp->fs_codec.utf8) {
Victor Stinner709d23d2019-05-02 14:56:30 -04003619 return unicode_encode_utf8(unicode,
3620 interp->fs_codec.error_handler,
3621 interp->fs_codec.errors);
3622 }
Victor Stinnerbf305cc2020-02-05 17:39:57 +01003623#ifndef _Py_FORCE_UTF8_FS_ENCODING
3624 else if (interp->fs_codec.encoding) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00003625 return PyUnicode_AsEncodedString(unicode,
Victor Stinner709d23d2019-05-02 14:56:30 -04003626 interp->fs_codec.encoding,
3627 interp->fs_codec.errors);
Victor Stinnerc39211f2010-09-29 16:35:47 +00003628 }
Victor Stinnerad158722010-10-27 00:25:46 +00003629#endif
Victor Stinnerbf305cc2020-02-05 17:39:57 +01003630 else {
3631 /* Before _PyUnicode_InitEncodings() is called, the Python codec
3632 machinery is not ready and so cannot be used:
3633 use wcstombs() in this case. */
3634 const wchar_t *filesystem_errors = interp->config.filesystem_errors;
3635 assert(filesystem_errors != NULL);
3636 _Py_error_handler errors = get_error_handler_wide(filesystem_errors);
3637 assert(errors != _Py_ERROR_UNKNOWN);
3638#ifdef _Py_FORCE_UTF8_FS_ENCODING
3639 return unicode_encode_utf8(unicode, errors, NULL);
3640#else
3641 return unicode_encode_locale(unicode, errors, 0);
3642#endif
3643 }
Victor Stinnerae6265f2010-05-15 16:27:27 +00003644}
3645
Alexander Belopolsky40018472011-02-26 01:02:56 +00003646PyObject *
3647PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003648 const char *encoding,
3649 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003650{
3651 PyObject *v;
Victor Stinner942889a2016-09-05 15:40:10 -07003652 char buflower[11]; /* strlen("iso_8859_1\0") == 11, longest shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00003653
Guido van Rossumd57fd912000-03-10 22:53:23 +00003654 if (!PyUnicode_Check(unicode)) {
3655 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003656 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003657 }
Fred Drakee4315f52000-05-09 19:53:39 +00003658
Victor Stinner22eb6892019-06-26 00:51:05 +02003659 if (unicode_check_encoding_errors(encoding, errors) < 0) {
3660 return NULL;
3661 }
3662
Victor Stinner942889a2016-09-05 15:40:10 -07003663 if (encoding == NULL) {
3664 return _PyUnicode_AsUTF8String(unicode, errors);
3665 }
3666
Fred Drakee4315f52000-05-09 19:53:39 +00003667 /* Shortcuts for common default encodings */
Victor Stinner942889a2016-09-05 15:40:10 -07003668 if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3669 char *lower = buflower;
3670
3671 /* Fast paths */
3672 if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3673 lower += 3;
3674 if (*lower == '_') {
3675 /* Match "utf8" and "utf_8" */
3676 lower++;
3677 }
3678
3679 if (lower[0] == '8' && lower[1] == 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003680 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinner942889a2016-09-05 15:40:10 -07003681 }
3682 else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3683 return _PyUnicode_EncodeUTF16(unicode, errors, 0);
3684 }
3685 else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3686 return _PyUnicode_EncodeUTF32(unicode, errors, 0);
3687 }
Victor Stinnera5c68c32011-03-02 01:03:14 +00003688 }
Victor Stinner942889a2016-09-05 15:40:10 -07003689 else {
3690 if (strcmp(lower, "ascii") == 0
3691 || strcmp(lower, "us_ascii") == 0) {
3692 return _PyUnicode_AsASCIIString(unicode, errors);
3693 }
Steve Dowercc16be82016-09-08 10:35:16 -07003694#ifdef MS_WINDOWS
Victor Stinner942889a2016-09-05 15:40:10 -07003695 else if (strcmp(lower, "mbcs") == 0) {
3696 return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
3697 }
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003698#endif
Victor Stinner942889a2016-09-05 15:40:10 -07003699 else if (strcmp(lower, "latin1") == 0 ||
3700 strcmp(lower, "latin_1") == 0 ||
3701 strcmp(lower, "iso_8859_1") == 0 ||
3702 strcmp(lower, "iso8859_1") == 0) {
3703 return _PyUnicode_AsLatin1String(unicode, errors);
3704 }
3705 }
Victor Stinner37296e82010-06-10 13:36:23 +00003706 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003707
3708 /* Encode via the codec registry */
Nick Coghlanc72e4e62013-11-22 22:39:36 +10003709 v = _PyCodec_EncodeText(unicode, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003710 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003711 return NULL;
3712
3713 /* The normal path */
3714 if (PyBytes_Check(v))
3715 return v;
3716
3717 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003718 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003719 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003720 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003721
3722 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003723 "encoder %s returned bytearray instead of bytes; "
3724 "use codecs.encode() to encode to arbitrary types",
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003725 encoding);
3726 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003727 Py_DECREF(v);
3728 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003729 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003730
Serhiy Storchakafff9a312017-03-21 08:53:25 +02003731 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v),
3732 PyByteArray_GET_SIZE(v));
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003733 Py_DECREF(v);
3734 return b;
3735 }
3736
3737 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003738 "'%.400s' encoder returned '%.400s' instead of 'bytes'; "
Nick Coghlan8b097b42013-11-13 23:49:21 +10003739 "use codecs.encode() to encode to arbitrary types",
Victor Stinner998b8062018-09-12 00:23:25 +02003740 encoding,
3741 Py_TYPE(v)->tp_name);
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003742 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003743 return NULL;
3744}
3745
Alexander Belopolsky40018472011-02-26 01:02:56 +00003746PyObject *
3747PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003748 const char *encoding,
3749 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003750{
3751 PyObject *v;
3752
3753 if (!PyUnicode_Check(unicode)) {
3754 PyErr_BadArgument();
3755 goto onError;
3756 }
3757
Serhiy Storchaka00939072016-10-27 21:05:49 +03003758 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3759 "PyUnicode_AsEncodedUnicode() is deprecated; "
3760 "use PyCodec_Encode() to encode from str to str", 1) < 0)
3761 return NULL;
3762
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003763 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003764 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003765
3766 /* Encode via the codec registry */
3767 v = PyCodec_Encode(unicode, encoding, errors);
3768 if (v == NULL)
3769 goto onError;
3770 if (!PyUnicode_Check(v)) {
3771 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003772 "'%.400s' encoder returned '%.400s' instead of 'str'; "
Nick Coghlan8b097b42013-11-13 23:49:21 +10003773 "use codecs.encode() to encode to arbitrary types",
Victor Stinner998b8062018-09-12 00:23:25 +02003774 encoding,
3775 Py_TYPE(v)->tp_name);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003776 Py_DECREF(v);
3777 goto onError;
3778 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003779 return v;
Tim Petersced69f82003-09-16 20:30:58 +00003780
Benjamin Peterson29060642009-01-31 22:14:21 +00003781 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003782 return NULL;
3783}
3784
Victor Stinner2cba6b82018-01-10 22:46:15 +01003785static PyObject*
Victor Stinner709d23d2019-05-02 14:56:30 -04003786unicode_decode_locale(const char *str, Py_ssize_t len,
3787 _Py_error_handler errors, int current_locale)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003788{
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003789 if (str[len] != '\0' || (size_t)len != strlen(str)) {
3790 PyErr_SetString(PyExc_ValueError, "embedded null byte");
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003791 return NULL;
3792 }
3793
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003794 wchar_t *wstr;
3795 size_t wlen;
3796 const char *reason;
3797 int res = _Py_DecodeLocaleEx(str, &wstr, &wlen, &reason,
Victor Stinner709d23d2019-05-02 14:56:30 -04003798 current_locale, errors);
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003799 if (res != 0) {
3800 if (res == -2) {
3801 PyObject *exc;
3802 exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nns",
3803 "locale", str, len,
3804 (Py_ssize_t)wlen,
3805 (Py_ssize_t)(wlen + 1),
3806 reason);
3807 if (exc != NULL) {
3808 PyCodec_StrictErrors(exc);
3809 Py_DECREF(exc);
3810 }
Victor Stinner2cba6b82018-01-10 22:46:15 +01003811 }
Victor Stinner3d4226a2018-08-29 22:21:32 +02003812 else if (res == -3) {
3813 PyErr_SetString(PyExc_ValueError, "unsupported error handler");
3814 }
Victor Stinner2cba6b82018-01-10 22:46:15 +01003815 else {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003816 PyErr_NoMemory();
Victor Stinner2cba6b82018-01-10 22:46:15 +01003817 }
Victor Stinner2f197072011-12-17 07:08:30 +01003818 return NULL;
Victor Stinner2f197072011-12-17 07:08:30 +01003819 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003820
3821 PyObject *unicode = PyUnicode_FromWideChar(wstr, wlen);
3822 PyMem_RawFree(wstr);
3823 return unicode;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003824}
3825
3826PyObject*
Victor Stinner2cba6b82018-01-10 22:46:15 +01003827PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
3828 const char *errors)
3829{
Victor Stinner709d23d2019-05-02 14:56:30 -04003830 _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
3831 return unicode_decode_locale(str, len, error_handler, 1);
Victor Stinner2cba6b82018-01-10 22:46:15 +01003832}
3833
3834PyObject*
Victor Stinner1b579672011-12-17 05:47:23 +01003835PyUnicode_DecodeLocale(const char *str, const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003836{
3837 Py_ssize_t size = (Py_ssize_t)strlen(str);
Victor Stinner709d23d2019-05-02 14:56:30 -04003838 _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
3839 return unicode_decode_locale(str, size, error_handler, 1);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003840}
3841
3842
3843PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00003844PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003845 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00003846 return PyUnicode_DecodeFSDefaultAndSize(s, size);
3847}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003848
Christian Heimes5894ba72007-11-04 11:43:14 +00003849PyObject*
3850PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3851{
Victor Stinnercaba55b2018-08-03 15:33:52 +02003852 PyInterpreterState *interp = _PyInterpreterState_GET_UNSAFE();
Victor Stinnerbf305cc2020-02-05 17:39:57 +01003853 if (interp->fs_codec.utf8) {
Victor Stinner709d23d2019-05-02 14:56:30 -04003854 return unicode_decode_utf8(s, size,
3855 interp->fs_codec.error_handler,
3856 interp->fs_codec.errors,
3857 NULL);
3858 }
Victor Stinnerbf305cc2020-02-05 17:39:57 +01003859#ifndef _Py_FORCE_UTF8_FS_ENCODING
3860 else if (interp->fs_codec.encoding) {
Steve Dower78057b42016-11-06 19:35:08 -08003861 return PyUnicode_Decode(s, size,
Victor Stinner709d23d2019-05-02 14:56:30 -04003862 interp->fs_codec.encoding,
3863 interp->fs_codec.errors);
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003864 }
Victor Stinnerad158722010-10-27 00:25:46 +00003865#endif
Victor Stinnerbf305cc2020-02-05 17:39:57 +01003866 else {
3867 /* Before _PyUnicode_InitEncodings() is called, the Python codec
3868 machinery is not ready and so cannot be used:
3869 use mbstowcs() in this case. */
3870 const wchar_t *filesystem_errors = interp->config.filesystem_errors;
3871 assert(filesystem_errors != NULL);
3872 _Py_error_handler errors = get_error_handler_wide(filesystem_errors);
3873 assert(errors != _Py_ERROR_UNKNOWN);
3874#ifdef _Py_FORCE_UTF8_FS_ENCODING
3875 return unicode_decode_utf8(s, size, errors, NULL, NULL);
3876#else
3877 return unicode_decode_locale(s, size, errors, 0);
3878#endif
3879 }
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003880}
3881
Martin v. Löwis011e8422009-05-05 04:43:17 +00003882
3883int
3884PyUnicode_FSConverter(PyObject* arg, void* addr)
3885{
Brett Cannonec6ce872016-09-06 15:50:29 -07003886 PyObject *path = NULL;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003887 PyObject *output = NULL;
3888 Py_ssize_t size;
3889 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003890 if (arg == NULL) {
3891 Py_DECREF(*(PyObject**)addr);
Benjamin Petersona4d33b32015-11-15 21:57:39 -08003892 *(PyObject**)addr = NULL;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003893 return 1;
3894 }
Brett Cannonec6ce872016-09-06 15:50:29 -07003895 path = PyOS_FSPath(arg);
3896 if (path == NULL) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03003897 return 0;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003898 }
Brett Cannonec6ce872016-09-06 15:50:29 -07003899 if (PyBytes_Check(path)) {
3900 output = path;
3901 }
3902 else { // PyOS_FSPath() guarantees its returned value is bytes or str.
3903 output = PyUnicode_EncodeFSDefault(path);
3904 Py_DECREF(path);
3905 if (!output) {
3906 return 0;
3907 }
3908 assert(PyBytes_Check(output));
3909 }
3910
Victor Stinner0ea2a462010-04-30 00:22:08 +00003911 size = PyBytes_GET_SIZE(output);
3912 data = PyBytes_AS_STRING(output);
Victor Stinner12174a52014-08-15 23:17:38 +02003913 if ((size_t)size != strlen(data)) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003914 PyErr_SetString(PyExc_ValueError, "embedded null byte");
Martin v. Löwis011e8422009-05-05 04:43:17 +00003915 Py_DECREF(output);
3916 return 0;
3917 }
3918 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003919 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003920}
3921
3922
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003923int
3924PyUnicode_FSDecoder(PyObject* arg, void* addr)
3925{
Brett Cannona5711202016-09-06 19:36:01 -07003926 int is_buffer = 0;
3927 PyObject *path = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003928 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003929 if (arg == NULL) {
3930 Py_DECREF(*(PyObject**)addr);
Serhiy Storchaka40db90c2017-04-20 21:19:31 +03003931 *(PyObject**)addr = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003932 return 1;
3933 }
Brett Cannona5711202016-09-06 19:36:01 -07003934
3935 is_buffer = PyObject_CheckBuffer(arg);
3936 if (!is_buffer) {
3937 path = PyOS_FSPath(arg);
3938 if (path == NULL) {
Serhiy Storchakafebc3322016-08-06 23:29:29 +03003939 return 0;
3940 }
Brett Cannona5711202016-09-06 19:36:01 -07003941 }
3942 else {
3943 path = arg;
3944 Py_INCREF(arg);
3945 }
3946
3947 if (PyUnicode_Check(path)) {
Brett Cannona5711202016-09-06 19:36:01 -07003948 output = path;
3949 }
3950 else if (PyBytes_Check(path) || is_buffer) {
3951 PyObject *path_bytes = NULL;
3952
3953 if (!PyBytes_Check(path) &&
3954 PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
Victor Stinner998b8062018-09-12 00:23:25 +02003955 "path should be string, bytes, or os.PathLike, not %.200s",
3956 Py_TYPE(arg)->tp_name)) {
3957 Py_DECREF(path);
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003958 return 0;
Brett Cannona5711202016-09-06 19:36:01 -07003959 }
3960 path_bytes = PyBytes_FromObject(path);
3961 Py_DECREF(path);
3962 if (!path_bytes) {
3963 return 0;
3964 }
3965 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(path_bytes),
3966 PyBytes_GET_SIZE(path_bytes));
3967 Py_DECREF(path_bytes);
3968 if (!output) {
3969 return 0;
3970 }
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003971 }
Serhiy Storchaka9305d832016-06-18 13:53:36 +03003972 else {
3973 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003974 "path should be string, bytes, or os.PathLike, not %.200s",
3975 Py_TYPE(arg)->tp_name);
Brett Cannona5711202016-09-06 19:36:01 -07003976 Py_DECREF(path);
Serhiy Storchaka9305d832016-06-18 13:53:36 +03003977 return 0;
3978 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05003979 if (PyUnicode_READY(output) == -1) {
Victor Stinner065836e2011-10-27 01:56:33 +02003980 Py_DECREF(output);
3981 return 0;
3982 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003983 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02003984 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003985 PyErr_SetString(PyExc_ValueError, "embedded null character");
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003986 Py_DECREF(output);
3987 return 0;
3988 }
3989 *(PyObject**)addr = output;
3990 return Py_CLEANUP_SUPPORTED;
3991}
3992
3993
Serhiy Storchaka2a404b62017-01-22 23:07:07 +02003994const char *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003995PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003996{
Christian Heimesf3863112007-11-22 07:46:41 +00003997 PyObject *bytes;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003998
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00003999 if (!PyUnicode_Check(unicode)) {
4000 PyErr_BadArgument();
4001 return NULL;
4002 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004003 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00004004 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004005
Victor Stinnere90fe6a2011-10-01 16:48:13 +02004006 if (PyUnicode_UTF8(unicode) == NULL) {
4007 assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03004008 bytes = _PyUnicode_AsUTF8String(unicode, NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004009 if (bytes == NULL)
4010 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004011 _PyUnicode_UTF8(unicode) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
4012 if (_PyUnicode_UTF8(unicode) == NULL) {
Victor Stinnera5afb582013-10-29 01:28:23 +01004013 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004014 Py_DECREF(bytes);
4015 return NULL;
4016 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004017 _PyUnicode_UTF8_LENGTH(unicode) = PyBytes_GET_SIZE(bytes);
Christian Heimesf051e432016-09-13 20:22:02 +02004018 memcpy(_PyUnicode_UTF8(unicode),
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004019 PyBytes_AS_STRING(bytes),
4020 _PyUnicode_UTF8_LENGTH(unicode) + 1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004021 Py_DECREF(bytes);
4022 }
4023
4024 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02004025 *psize = PyUnicode_UTF8_LENGTH(unicode);
4026 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00004027}
4028
Serhiy Storchaka2a404b62017-01-22 23:07:07 +02004029const char *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004030PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00004031{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004032 return PyUnicode_AsUTF8AndSize(unicode, NULL);
4033}
4034
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004035Py_UNICODE *
4036PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
4037{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004038 if (!PyUnicode_Check(unicode)) {
4039 PyErr_BadArgument();
4040 return NULL;
4041 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03004042 Py_UNICODE *w = _PyUnicode_WSTR(unicode);
4043 if (w == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004044 /* Non-ASCII compact unicode object */
Serhiy Storchakac46db922018-10-23 22:58:24 +03004045 assert(_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND);
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004046 assert(PyUnicode_IS_READY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004047
Serhiy Storchakac46db922018-10-23 22:58:24 +03004048 Py_ssize_t wlen = unicode_get_widechar_size(unicode);
4049 if ((size_t)wlen > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
4050 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004051 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004052 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03004053 w = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) * (wlen + 1));
4054 if (w == NULL) {
4055 PyErr_NoMemory();
4056 return NULL;
4057 }
4058 unicode_copy_as_widechar(unicode, w, wlen + 1);
4059 _PyUnicode_WSTR(unicode) = w;
4060 if (!PyUnicode_IS_COMPACT_ASCII(unicode)) {
4061 _PyUnicode_WSTR_LENGTH(unicode) = wlen;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004062 }
4063 }
4064 if (size != NULL)
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004065 *size = PyUnicode_WSTR_LENGTH(unicode);
Serhiy Storchakac46db922018-10-23 22:58:24 +03004066 return w;
Martin v. Löwis5b222132007-06-10 09:51:05 +00004067}
4068
Alexander Belopolsky40018472011-02-26 01:02:56 +00004069Py_UNICODE *
4070PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004071{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004072 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004073}
4074
Serhiy Storchakaf7eae0a2017-06-28 08:30:06 +03004075const Py_UNICODE *
4076_PyUnicode_AsUnicode(PyObject *unicode)
4077{
4078 Py_ssize_t size;
4079 const Py_UNICODE *wstr;
4080
4081 wstr = PyUnicode_AsUnicodeAndSize(unicode, &size);
4082 if (wstr && wcslen(wstr) != (size_t)size) {
4083 PyErr_SetString(PyExc_ValueError, "embedded null character");
4084 return NULL;
4085 }
4086 return wstr;
4087}
4088
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004089
Alexander Belopolsky40018472011-02-26 01:02:56 +00004090Py_ssize_t
4091PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004092{
4093 if (!PyUnicode_Check(unicode)) {
4094 PyErr_BadArgument();
4095 goto onError;
4096 }
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02004097 if (_PyUnicode_WSTR(unicode) == NULL) {
4098 if (PyUnicode_AsUnicode(unicode) == NULL)
4099 goto onError;
4100 }
4101 return PyUnicode_WSTR_LENGTH(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004102
Benjamin Peterson29060642009-01-31 22:14:21 +00004103 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004104 return -1;
4105}
4106
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004107Py_ssize_t
4108PyUnicode_GetLength(PyObject *unicode)
4109{
Victor Stinner07621332012-06-16 04:53:46 +02004110 if (!PyUnicode_Check(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004111 PyErr_BadArgument();
4112 return -1;
4113 }
Victor Stinner07621332012-06-16 04:53:46 +02004114 if (PyUnicode_READY(unicode) == -1)
4115 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004116 return PyUnicode_GET_LENGTH(unicode);
4117}
4118
4119Py_UCS4
4120PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
4121{
Victor Stinner69ed0f42013-04-09 21:48:24 +02004122 void *data;
4123 int kind;
4124
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +03004125 if (!PyUnicode_Check(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02004126 PyErr_BadArgument();
4127 return (Py_UCS4)-1;
4128 }
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +03004129 if (PyUnicode_READY(unicode) == -1) {
4130 return (Py_UCS4)-1;
4131 }
Victor Stinnerc4b49542011-12-11 22:44:26 +01004132 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02004133 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004134 return (Py_UCS4)-1;
4135 }
Victor Stinner69ed0f42013-04-09 21:48:24 +02004136 data = PyUnicode_DATA(unicode);
4137 kind = PyUnicode_KIND(unicode);
4138 return PyUnicode_READ(kind, data, index);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004139}
4140
4141int
4142PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
4143{
4144 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02004145 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004146 return -1;
4147 }
Victor Stinner488fa492011-12-12 00:01:39 +01004148 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerc4b49542011-12-11 22:44:26 +01004149 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02004150 PyErr_SetString(PyExc_IndexError, "string index out of range");
4151 return -1;
4152 }
Victor Stinner488fa492011-12-12 00:01:39 +01004153 if (unicode_check_modifiable(unicode))
Victor Stinnercd9950f2011-10-02 00:34:53 +02004154 return -1;
Victor Stinnerc9590ad2012-03-04 01:34:37 +01004155 if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
4156 PyErr_SetString(PyExc_ValueError, "character out of range");
4157 return -1;
4158 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004159 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
4160 index, ch);
4161 return 0;
4162}
4163
/* Return the name of the default encoding, which is always "utf-8".
   The returned string has static storage and must not be freed. */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";
    return default_encoding;
}
4169
Victor Stinner554f3f02010-06-16 23:33:54 +00004170/* create or adjust a UnicodeDecodeError */
4171static void
4172make_decode_exception(PyObject **exceptionObject,
4173 const char *encoding,
4174 const char *input, Py_ssize_t length,
4175 Py_ssize_t startpos, Py_ssize_t endpos,
4176 const char *reason)
4177{
4178 if (*exceptionObject == NULL) {
4179 *exceptionObject = PyUnicodeDecodeError_Create(
4180 encoding, input, length, startpos, endpos, reason);
4181 }
4182 else {
4183 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
4184 goto onError;
4185 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
4186 goto onError;
4187 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
4188 goto onError;
4189 }
4190 return;
4191
4192onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02004193 Py_CLEAR(*exceptionObject);
Victor Stinner554f3f02010-06-16 23:33:54 +00004194}
4195
Steve Dowercc16be82016-09-08 10:35:16 -07004196#ifdef MS_WINDOWS
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004197static int
4198widechar_resize(wchar_t **buf, Py_ssize_t *size, Py_ssize_t newsize)
4199{
4200 if (newsize > *size) {
4201 wchar_t *newbuf = *buf;
4202 if (PyMem_Resize(newbuf, wchar_t, newsize) == NULL) {
4203 PyErr_NoMemory();
4204 return -1;
4205 }
4206 *buf = newbuf;
4207 }
4208 *size = newsize;
4209 return 0;
4210}
4211
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004212/* error handling callback helper:
4213 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00004214 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004215 and adjust various state variables.
4216 return 0 on success, -1 on error
4217*/
4218
Alexander Belopolsky40018472011-02-26 01:02:56 +00004219static int
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004220unicode_decode_call_errorhandler_wchar(
4221 const char *errors, PyObject **errorHandler,
4222 const char *encoding, const char *reason,
4223 const char **input, const char **inend, Py_ssize_t *startinpos,
4224 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004225 wchar_t **buf, Py_ssize_t *bufsize, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004226{
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004227 static const char *argparse = "Un;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004228
4229 PyObject *restuple = NULL;
4230 PyObject *repunicode = NULL;
Victor Stinner596a6c42011-11-09 00:02:18 +01004231 Py_ssize_t outsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004232 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004233 Py_ssize_t requiredsize;
4234 Py_ssize_t newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004235 PyObject *inputobj = NULL;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004236 wchar_t *repwstr;
4237 Py_ssize_t repwlen;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004238
4239 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004240 *errorHandler = PyCodec_LookupError(errors);
4241 if (*errorHandler == NULL)
4242 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004243 }
4244
Victor Stinner554f3f02010-06-16 23:33:54 +00004245 make_decode_exception(exceptionObject,
4246 encoding,
4247 *input, *inend - *input,
4248 *startinpos, *endinpos,
4249 reason);
4250 if (*exceptionObject == NULL)
4251 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004252
Jeroen Demeyer196a5302019-07-04 12:31:34 +02004253 restuple = _PyObject_CallOneArg(*errorHandler, *exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004254 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004255 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004256 if (!PyTuple_Check(restuple)) {
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004257 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00004258 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004259 }
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004260 if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00004261 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004262
4263 /* Copy back the bytes variables, which might have been modified by the
4264 callback */
4265 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4266 if (!inputobj)
4267 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004268 *input = PyBytes_AS_STRING(inputobj);
4269 insize = PyBytes_GET_SIZE(inputobj);
4270 *inend = *input + insize;
4271 /* we can DECREF safely, as the exception has another reference,
4272 so the object won't go away. */
4273 Py_DECREF(inputobj);
4274
4275 if (newpos<0)
4276 newpos = insize+newpos;
4277 if (newpos<0 || newpos>insize) {
Victor Stinnera33bce02014-07-04 22:47:46 +02004278 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004279 goto onError;
4280 }
4281
4282 repwstr = PyUnicode_AsUnicodeAndSize(repunicode, &repwlen);
4283 if (repwstr == NULL)
4284 goto onError;
4285 /* need more space? (at least enough for what we
4286 have+the replacement+the rest of the string (starting
4287 at the new input position), so we won't have to check space
4288 when there are no errors in the rest of the string) */
Benjamin Peterson2b76ce62014-09-29 18:50:06 -04004289 requiredsize = *outpos;
4290 if (requiredsize > PY_SSIZE_T_MAX - repwlen)
4291 goto overflow;
4292 requiredsize += repwlen;
4293 if (requiredsize > PY_SSIZE_T_MAX - (insize - newpos))
4294 goto overflow;
4295 requiredsize += insize - newpos;
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004296 outsize = *bufsize;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004297 if (requiredsize > outsize) {
Benjamin Peterson2b76ce62014-09-29 18:50:06 -04004298 if (outsize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*outsize)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004299 requiredsize = 2*outsize;
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004300 if (widechar_resize(buf, bufsize, requiredsize) < 0) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004301 goto onError;
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004302 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004303 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004304 wcsncpy(*buf + *outpos, repwstr, repwlen);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004305 *outpos += repwlen;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004306 *endinpos = newpos;
4307 *inptr = *input + newpos;
4308
4309 /* we made it! */
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004310 Py_DECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004311 return 0;
4312
Benjamin Peterson2b76ce62014-09-29 18:50:06 -04004313 overflow:
4314 PyErr_SetString(PyExc_OverflowError,
4315 "decoded result is too long for a Python string");
4316
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004317 onError:
4318 Py_XDECREF(restuple);
4319 return -1;
4320}
Steve Dowercc16be82016-09-08 10:35:16 -07004321#endif /* MS_WINDOWS */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004322
4323static int
4324unicode_decode_call_errorhandler_writer(
4325 const char *errors, PyObject **errorHandler,
4326 const char *encoding, const char *reason,
4327 const char **input, const char **inend, Py_ssize_t *startinpos,
4328 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
4329 _PyUnicodeWriter *writer /* PyObject **output, Py_ssize_t *outpos */)
4330{
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004331 static const char *argparse = "Un;decoding error handler must return (str, int) tuple";
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004332
4333 PyObject *restuple = NULL;
4334 PyObject *repunicode = NULL;
4335 Py_ssize_t insize;
4336 Py_ssize_t newpos;
Victor Stinner170ca6f2013-04-18 00:25:28 +02004337 Py_ssize_t replen;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004338 Py_ssize_t remain;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004339 PyObject *inputobj = NULL;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004340 int need_to_grow = 0;
4341 const char *new_inptr;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004342
4343 if (*errorHandler == NULL) {
4344 *errorHandler = PyCodec_LookupError(errors);
4345 if (*errorHandler == NULL)
4346 goto onError;
4347 }
4348
4349 make_decode_exception(exceptionObject,
4350 encoding,
4351 *input, *inend - *input,
4352 *startinpos, *endinpos,
4353 reason);
4354 if (*exceptionObject == NULL)
4355 goto onError;
4356
Jeroen Demeyer196a5302019-07-04 12:31:34 +02004357 restuple = _PyObject_CallOneArg(*errorHandler, *exceptionObject);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004358 if (restuple == NULL)
4359 goto onError;
4360 if (!PyTuple_Check(restuple)) {
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004361 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004362 goto onError;
4363 }
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004364 if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004365 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004366
4367 /* Copy back the bytes variables, which might have been modified by the
4368 callback */
4369 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4370 if (!inputobj)
4371 goto onError;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004372 remain = *inend - *input - *endinpos;
Christian Heimes72b710a2008-05-26 13:28:38 +00004373 *input = PyBytes_AS_STRING(inputobj);
4374 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004375 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00004376 /* we can DECREF safely, as the exception has another reference,
4377 so the object won't go away. */
4378 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004379
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004380 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004381 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004382 if (newpos<0 || newpos>insize) {
Victor Stinnera33bce02014-07-04 22:47:46 +02004383 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00004384 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004385 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004386
Victor Stinner170ca6f2013-04-18 00:25:28 +02004387 replen = PyUnicode_GET_LENGTH(repunicode);
Serhiy Storchaka7e4b9052015-01-26 01:22:54 +02004388 if (replen > 1) {
4389 writer->min_length += replen - 1;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004390 need_to_grow = 1;
4391 }
4392 new_inptr = *input + newpos;
4393 if (*inend - new_inptr > remain) {
4394 /* We don't know the decoding algorithm here so we make the worst
4395 assumption that one byte decodes to one unicode character.
4396 If unfortunately one byte could decode to more unicode characters,
4397 the decoder may write out-of-bound then. Is it possible for the
4398 algorithms using this function? */
4399 writer->min_length += *inend - new_inptr - remain;
4400 need_to_grow = 1;
4401 }
4402 if (need_to_grow) {
Victor Stinner8f674cc2013-04-17 23:02:17 +02004403 writer->overallocate = 1;
Serhiy Storchakab7e2d672018-02-13 08:27:33 +02004404 if (_PyUnicodeWriter_Prepare(writer, writer->min_length - writer->pos,
Serhiy Storchaka7e4b9052015-01-26 01:22:54 +02004405 PyUnicode_MAX_CHAR_VALUE(repunicode)) == -1)
4406 goto onError;
4407 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004408 if (_PyUnicodeWriter_WriteStr(writer, repunicode) == -1)
Victor Stinner376cfa12013-04-17 23:58:16 +02004409 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004410
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004411 *endinpos = newpos;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004412 *inptr = new_inptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004413
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004414 /* we made it! */
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004415 Py_DECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004416 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004417
Benjamin Peterson29060642009-01-31 22:14:21 +00004418 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004419 Py_XDECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004420 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004421}
4422
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004423/* --- UTF-7 Codec -------------------------------------------------------- */
4424
/* See RFC2152 for details.  We encode conservatively and decode liberally. */

/* Three simple macros defining base-64.
   All of the macros below may evaluate their argument more than once, so
   they must only be given side-effect-free expressions. */

/* Is c a base-64 character? */

#define IS_BASE64(c) \
    (((c) >= 'A' && (c) <= 'Z') ||     \
     ((c) >= 'a' && (c) <= 'z') ||     \
     ((c) >= '0' && (c) <= '9') ||     \
     (c) == '+' || (c) == '/')

/* given that c is a base-64 character, what is its base-64 value? */

#define FROM_BASE64(c)                                                  \
    (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :                           \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :                      \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :                      \
     (c) == '+' ? 62 : 63)

/* What is the base-64 character of the bottom 6 bits of n? */

#define TO_BASE64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
 * decoded as itself.  We are permissive on decoding; the only ASCII
 * byte not decoding to itself is the + which begins a base64
 * string. */

#define DECODE_DIRECT(c)                                \
    ((c) <= 127 && (c) != '+')

/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is indexed by ASCII code (0..127) and is read-only.
 */

static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * The range test comes first so utf7_category is never indexed with a
 * value outside 1..127. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      (directWS && (utf7_category[(c)] == 2)) ||        \
      (directO && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004503
Alexander Belopolsky40018472011-02-26 01:02:56 +00004504PyObject *
4505PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004506 Py_ssize_t size,
4507 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004508{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004509 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
4510}
4511
Antoine Pitrou244651a2009-05-04 18:56:13 +00004512/* The decoder. The only state we preserve is our read position,
4513 * i.e. how many characters we have consumed. So if we end in the
4514 * middle of a shift sequence we have to back off the read position
4515 * and the output to the beginning of the sequence, otherwise we lose
4516 * all the shift state (seen bits, number of bits seen, high
4517 * surrogate). */
4518
Alexander Belopolsky40018472011-02-26 01:02:56 +00004519PyObject *
4520PyUnicode_DecodeUTF7Stateful(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004521 Py_ssize_t size,
4522 const char *errors,
4523 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004524{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004525 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004526 Py_ssize_t startinpos;
4527 Py_ssize_t endinpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004528 const char *e;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004529 _PyUnicodeWriter writer;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004530 const char *errmsg = "";
4531 int inShift = 0;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004532 Py_ssize_t shiftOutStart;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004533 unsigned int base64bits = 0;
4534 unsigned long base64buffer = 0;
Victor Stinner24729f32011-11-10 20:31:37 +01004535 Py_UCS4 surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004536 PyObject *errorHandler = NULL;
4537 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004538
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004539 if (size == 0) {
4540 if (consumed)
4541 *consumed = 0;
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02004542 _Py_RETURN_UNICODE_EMPTY();
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004543 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004544
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004545 /* Start off assuming it's all ASCII. Widen later as necessary. */
Victor Stinner8f674cc2013-04-17 23:02:17 +02004546 _PyUnicodeWriter_Init(&writer);
4547 writer.min_length = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004548
4549 shiftOutStart = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004550 e = s + size;
4551
4552 while (s < e) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004553 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00004554 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00004555 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004556
Antoine Pitrou244651a2009-05-04 18:56:13 +00004557 if (inShift) { /* in a base-64 section */
4558 if (IS_BASE64(ch)) { /* consume a base-64 character */
4559 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
4560 base64bits += 6;
4561 s++;
4562 if (base64bits >= 16) {
4563 /* we have enough bits for a UTF-16 value */
Victor Stinner24729f32011-11-10 20:31:37 +01004564 Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
Antoine Pitrou244651a2009-05-04 18:56:13 +00004565 base64bits -= 16;
4566 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
Serhiy Storchaka35804e42013-10-19 20:38:19 +03004567 assert(outCh <= 0xffff);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004568 if (surrogate) {
4569 /* expecting a second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004570 if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
4571 Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004572 if (_PyUnicodeWriter_WriteCharInline(&writer, ch2) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004573 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004574 surrogate = 0;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01004575 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004576 }
4577 else {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004578 if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
Antoine Pitrou78edf752011-11-15 01:44:16 +01004579 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004580 surrogate = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004581 }
4582 }
Victor Stinner551ac952011-11-29 22:58:13 +01004583 if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004584 /* first surrogate */
4585 surrogate = outCh;
4586 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004587 else {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004588 if (_PyUnicodeWriter_WriteCharInline(&writer, outCh) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004589 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004590 }
4591 }
4592 }
4593 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004594 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004595 if (base64bits > 0) { /* left-over bits */
4596 if (base64bits >= 6) {
4597 /* We've seen at least one base-64 character */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004598 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004599 errmsg = "partial character in shift sequence";
4600 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004601 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004602 else {
4603 /* Some bits remain; they should be zero */
4604 if (base64buffer != 0) {
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004605 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004606 errmsg = "non-zero padding bits in shift sequence";
4607 goto utf7Error;
4608 }
4609 }
4610 }
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004611 if (surrogate && DECODE_DIRECT(ch)) {
4612 if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
4613 goto onError;
4614 }
4615 surrogate = 0;
4616 if (ch == '-') {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004617 /* '-' is absorbed; other terminating
4618 characters are preserved */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004619 s++;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004620 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004621 }
4622 }
4623 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004624 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004625 s++; /* consume '+' */
4626 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004627 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004628 if (_PyUnicodeWriter_WriteCharInline(&writer, '+') < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004629 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004630 }
Zackery Spytze349bf22018-08-18 22:43:38 -06004631 else if (s < e && !IS_BASE64(*s)) {
4632 s++;
4633 errmsg = "ill-formed sequence";
4634 goto utf7Error;
4635 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004636 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004637 inShift = 1;
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004638 surrogate = 0;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004639 shiftOutStart = writer.pos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004640 base64bits = 0;
Serhiy Storchaka35804e42013-10-19 20:38:19 +03004641 base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004642 }
4643 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004644 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004645 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004646 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004647 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004648 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004649 else {
4650 startinpos = s-starts;
4651 s++;
4652 errmsg = "unexpected special character";
4653 goto utf7Error;
4654 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004655 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004656utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004657 endinpos = s-starts;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004658 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00004659 errors, &errorHandler,
4660 "utf7", errmsg,
4661 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004662 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00004663 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004664 }
4665
Antoine Pitrou244651a2009-05-04 18:56:13 +00004666 /* end of string */
4667
4668 if (inShift && !consumed) { /* in shift sequence, no more to follow */
4669 /* if we're in an inconsistent state, that's an error */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004670 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004671 if (surrogate ||
4672 (base64bits >= 6) ||
4673 (base64bits > 0 && base64buffer != 0)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004674 endinpos = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004675 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrou244651a2009-05-04 18:56:13 +00004676 errors, &errorHandler,
4677 "utf7", "unterminated shift sequence",
4678 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004679 &writer))
Antoine Pitrou244651a2009-05-04 18:56:13 +00004680 goto onError;
4681 if (s < e)
4682 goto restart;
4683 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004684 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004685
4686 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004687 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004688 if (inShift) {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004689 *consumed = startinpos;
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004690 if (writer.pos != shiftOutStart && writer.maxchar > 127) {
Serhiy Storchaka016a3f32014-02-08 14:01:29 +02004691 PyObject *result = PyUnicode_FromKindAndData(
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004692 writer.kind, writer.data, shiftOutStart);
4693 Py_XDECREF(errorHandler);
4694 Py_XDECREF(exc);
4695 _PyUnicodeWriter_Dealloc(&writer);
4696 return result;
Serhiy Storchaka016a3f32014-02-08 14:01:29 +02004697 }
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004698 writer.pos = shiftOutStart; /* back off output */
Antoine Pitrou244651a2009-05-04 18:56:13 +00004699 }
4700 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004701 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004702 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004703 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004704
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004705 Py_XDECREF(errorHandler);
4706 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004707 return _PyUnicodeWriter_Finish(&writer);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004708
Benjamin Peterson29060642009-01-31 22:14:21 +00004709 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004710 Py_XDECREF(errorHandler);
4711 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004712 _PyUnicodeWriter_Dealloc(&writer);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004713 return NULL;
4714}
4715
4716
Alexander Belopolsky40018472011-02-26 01:02:56 +00004717PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004718_PyUnicode_EncodeUTF7(PyObject *str,
4719 int base64SetO,
4720 int base64WhiteSpace,
4721 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004722{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004723 int kind;
4724 void *data;
4725 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004726 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004727 int inShift = 0;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004728 Py_ssize_t i;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004729 unsigned int base64bits = 0;
4730 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004731 char * out;
4732 char * start;
4733
Benjamin Petersonbac79492012-01-14 13:34:47 -05004734 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004735 return NULL;
4736 kind = PyUnicode_KIND(str);
4737 data = PyUnicode_DATA(str);
4738 len = PyUnicode_GET_LENGTH(str);
4739
4740 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004741 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004742
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004743 /* It might be possible to tighten this worst case */
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004744 if (len > PY_SSIZE_T_MAX / 8)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004745 return PyErr_NoMemory();
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004746 v = PyBytes_FromStringAndSize(NULL, len * 8);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004747 if (v == NULL)
4748 return NULL;
4749
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004750 start = out = PyBytes_AS_STRING(v);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004751 for (i = 0; i < len; ++i) {
Victor Stinner0e368262011-11-10 20:12:49 +01004752 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004753
Antoine Pitrou244651a2009-05-04 18:56:13 +00004754 if (inShift) {
4755 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4756 /* shifting out */
4757 if (base64bits) { /* output remaining bits */
4758 *out++ = TO_BASE64(base64buffer << (6-base64bits));
4759 base64buffer = 0;
4760 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004761 }
4762 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004763 /* Characters not in the BASE64 set implicitly unshift the sequence
4764 so no '-' is required, except if the character is itself a '-' */
4765 if (IS_BASE64(ch) || ch == '-') {
4766 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004767 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004768 *out++ = (char) ch;
4769 }
4770 else {
4771 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00004772 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004773 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004774 else { /* not in a shift sequence */
4775 if (ch == '+') {
4776 *out++ = '+';
4777 *out++ = '-';
4778 }
4779 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4780 *out++ = (char) ch;
4781 }
4782 else {
4783 *out++ = '+';
4784 inShift = 1;
4785 goto encode_char;
4786 }
4787 }
4788 continue;
4789encode_char:
Antoine Pitrou244651a2009-05-04 18:56:13 +00004790 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01004791 assert(ch <= MAX_UNICODE);
Victor Stinner0d3721d2011-11-22 03:27:53 +01004792
Antoine Pitrou244651a2009-05-04 18:56:13 +00004793 /* code first surrogate */
4794 base64bits += 16;
Victor Stinner76df43d2012-10-30 01:42:39 +01004795 base64buffer = (base64buffer << 16) | Py_UNICODE_HIGH_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004796 while (base64bits >= 6) {
4797 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4798 base64bits -= 6;
4799 }
4800 /* prepare second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004801 ch = Py_UNICODE_LOW_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004802 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004803 base64bits += 16;
4804 base64buffer = (base64buffer << 16) | ch;
4805 while (base64bits >= 6) {
4806 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4807 base64bits -= 6;
4808 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00004809 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004810 if (base64bits)
4811 *out++= TO_BASE64(base64buffer << (6-base64bits) );
4812 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004813 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004814 if (_PyBytes_Resize(&v, out - start) < 0)
4815 return NULL;
4816 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004817}
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004818PyObject *
4819PyUnicode_EncodeUTF7(const Py_UNICODE *s,
4820 Py_ssize_t size,
4821 int base64SetO,
4822 int base64WhiteSpace,
4823 const char *errors)
4824{
4825 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02004826 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004827 if (tmp == NULL)
4828 return NULL;
Victor Stinner0e368262011-11-10 20:12:49 +01004829 result = _PyUnicode_EncodeUTF7(tmp, base64SetO,
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004830 base64WhiteSpace, errors);
4831 Py_DECREF(tmp);
4832 return result;
4833}
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004834
/* Remove the base64/direct-character helper macros now that the UTF-7
   section is finished (TO_BASE64 is used by the UTF-7 encoder just above;
   the others are presumably defined alongside it -- their definitions are
   earlier in the file), so the short names do not leak into the codecs
   that follow. */
#undef IS_BASE64
#undef FROM_BASE64
#undef TO_BASE64
#undef DECODE_DIRECT
#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004840
Guido van Rossumd57fd912000-03-10 22:53:23 +00004841/* --- UTF-8 Codec -------------------------------------------------------- */
4842
Alexander Belopolsky40018472011-02-26 01:02:56 +00004843PyObject *
4844PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004845 Py_ssize_t size,
4846 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004847{
Walter Dörwald69652032004-09-07 20:24:22 +00004848 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4849}
4850
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004851#include "stringlib/asciilib.h"
4852#include "stringlib/codecs.h"
4853#include "stringlib/undef.h"
4854
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004855#include "stringlib/ucs1lib.h"
4856#include "stringlib/codecs.h"
4857#include "stringlib/undef.h"
4858
4859#include "stringlib/ucs2lib.h"
4860#include "stringlib/codecs.h"
4861#include "stringlib/undef.h"
4862
4863#include "stringlib/ucs4lib.h"
4864#include "stringlib/codecs.h"
4865#include "stringlib/undef.h"
4866
Antoine Pitrouab868312009-01-10 15:40:25 +00004867/* Mask to quickly check whether a C 'long' contains a
4868 non-ASCII, UTF8-encoded char. */
4869#if (SIZEOF_LONG == 8)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004870# define ASCII_CHAR_MASK 0x8080808080808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004871#elif (SIZEOF_LONG == 4)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004872# define ASCII_CHAR_MASK 0x80808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004873#else
4874# error C 'long' size should be either 4 or 8!
4875#endif
4876
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004877static Py_ssize_t
4878ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004879{
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004880 const char *p = start;
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004881 const char *aligned_end = (const char *) _Py_ALIGN_DOWN(end, SIZEOF_LONG);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004882
Antoine Pitrou8b0e9842013-05-11 15:58:34 +02004883 /*
4884 * Issue #17237: m68k is a bit different from most architectures in
4885 * that objects do not use "natural alignment" - for example, int and
4886 * long are only aligned at 2-byte boundaries. Therefore the assert()
4887 * won't work; also, tests have shown that skipping the "optimised
4888 * version" will even speed up m68k.
4889 */
4890#if !defined(__m68k__)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004891#if SIZEOF_LONG <= SIZEOF_VOID_P
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004892 assert(_Py_IS_ALIGNED(dest, SIZEOF_LONG));
4893 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004894 /* Fast path, see in STRINGLIB(utf8_decode) for
4895 an explanation. */
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02004896 /* Help allocation */
4897 const char *_p = p;
4898 Py_UCS1 * q = dest;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004899 while (_p < aligned_end) {
4900 unsigned long value = *(const unsigned long *) _p;
4901 if (value & ASCII_CHAR_MASK)
Benjamin Peterson29060642009-01-31 22:14:21 +00004902 break;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004903 *((unsigned long *)q) = value;
4904 _p += SIZEOF_LONG;
4905 q += SIZEOF_LONG;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004906 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004907 p = _p;
4908 while (p < end) {
4909 if ((unsigned char)*p & 0x80)
4910 break;
4911 *q++ = *p++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004912 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004913 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004914 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004915#endif
Antoine Pitrou8b0e9842013-05-11 15:58:34 +02004916#endif
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004917 while (p < end) {
4918 /* Fast path, see in STRINGLIB(utf8_decode) in stringlib/codecs.h
4919 for an explanation. */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004920 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02004921 /* Help allocation */
4922 const char *_p = p;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004923 while (_p < aligned_end) {
4924 unsigned long value = *(unsigned long *) _p;
4925 if (value & ASCII_CHAR_MASK)
4926 break;
4927 _p += SIZEOF_LONG;
4928 }
4929 p = _p;
4930 if (_p == end)
4931 break;
4932 }
4933 if ((unsigned char)*p & 0x80)
4934 break;
4935 ++p;
4936 }
4937 memcpy(dest, start, p - start);
4938 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004939}
Antoine Pitrouab868312009-01-10 15:40:25 +00004940
Victor Stinner709d23d2019-05-02 14:56:30 -04004941static PyObject *
4942unicode_decode_utf8(const char *s, Py_ssize_t size,
4943 _Py_error_handler error_handler, const char *errors,
4944 Py_ssize_t *consumed)
Victor Stinner785938e2011-12-11 20:09:03 +01004945{
Victor Stinner785938e2011-12-11 20:09:03 +01004946 if (size == 0) {
4947 if (consumed)
4948 *consumed = 0;
Serhiy Storchaka678db842013-01-26 12:16:36 +02004949 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner785938e2011-12-11 20:09:03 +01004950 }
4951
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004952 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
4953 if (size == 1 && (unsigned char)s[0] < 128) {
Victor Stinner785938e2011-12-11 20:09:03 +01004954 if (consumed)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004955 *consumed = 1;
4956 return get_latin1_char((unsigned char)s[0]);
Victor Stinner785938e2011-12-11 20:09:03 +01004957 }
4958
Inada Naoki770847a2019-06-24 12:30:24 +09004959 const char *starts = s;
4960 const char *end = s + size;
Victor Stinner785938e2011-12-11 20:09:03 +01004961
Inada Naoki770847a2019-06-24 12:30:24 +09004962 // fast path: try ASCII string.
4963 PyObject *u = PyUnicode_New(size, 127);
4964 if (u == NULL) {
4965 return NULL;
4966 }
4967 s += ascii_decode(s, end, PyUnicode_DATA(u));
4968 if (s == end) {
4969 return u;
4970 }
4971
4972 // Use _PyUnicodeWriter after fast path is failed.
4973 _PyUnicodeWriter writer;
4974 _PyUnicodeWriter_InitWithBuffer(&writer, u);
4975 writer.pos = s - starts;
4976
4977 Py_ssize_t startinpos, endinpos;
4978 const char *errmsg = "";
4979 PyObject *error_handler_obj = NULL;
4980 PyObject *exc = NULL;
4981
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004982 while (s < end) {
4983 Py_UCS4 ch;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004984 int kind = writer.kind;
Victor Stinner1d65d912015-10-05 13:43:50 +02004985
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004986 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004987 if (PyUnicode_IS_ASCII(writer.buffer))
4988 ch = asciilib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004989 else
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004990 ch = ucs1lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004991 } else if (kind == PyUnicode_2BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004992 ch = ucs2lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004993 } else {
4994 assert(kind == PyUnicode_4BYTE_KIND);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004995 ch = ucs4lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004996 }
4997
4998 switch (ch) {
4999 case 0:
5000 if (s == end || consumed)
5001 goto End;
5002 errmsg = "unexpected end of data";
5003 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02005004 endinpos = end - starts;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005005 break;
5006 case 1:
5007 errmsg = "invalid start byte";
5008 startinpos = s - starts;
5009 endinpos = startinpos + 1;
5010 break;
5011 case 2:
Serhiy Storchaka894263b2019-06-25 11:54:18 +03005012 if (consumed && (unsigned char)s[0] == 0xED && end - s == 2
5013 && (unsigned char)s[1] >= 0xA0 && (unsigned char)s[1] <= 0xBF)
5014 {
5015 /* Truncated surrogate code in range D800-DFFF */
Serhiy Storchaka7a465cb2019-03-30 08:23:38 +02005016 goto End;
5017 }
Serhiy Storchaka894263b2019-06-25 11:54:18 +03005018 /* fall through */
5019 case 3:
5020 case 4:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005021 errmsg = "invalid continuation byte";
5022 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02005023 endinpos = startinpos + ch - 1;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005024 break;
5025 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005026 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005027 goto onError;
5028 continue;
5029 }
5030
Victor Stinner1d65d912015-10-05 13:43:50 +02005031 if (error_handler == _Py_ERROR_UNKNOWN)
Victor Stinner3d4226a2018-08-29 22:21:32 +02005032 error_handler = _Py_GetErrorHandler(errors);
Victor Stinner1d65d912015-10-05 13:43:50 +02005033
5034 switch (error_handler) {
5035 case _Py_ERROR_IGNORE:
5036 s += (endinpos - startinpos);
5037 break;
5038
5039 case _Py_ERROR_REPLACE:
5040 if (_PyUnicodeWriter_WriteCharInline(&writer, 0xfffd) < 0)
5041 goto onError;
5042 s += (endinpos - startinpos);
5043 break;
5044
5045 case _Py_ERROR_SURROGATEESCAPE:
Victor Stinner74e8fac2015-10-05 13:49:26 +02005046 {
5047 Py_ssize_t i;
5048
Victor Stinner1d65d912015-10-05 13:43:50 +02005049 if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
5050 goto onError;
Victor Stinner74e8fac2015-10-05 13:49:26 +02005051 for (i=startinpos; i<endinpos; i++) {
Victor Stinner1d65d912015-10-05 13:43:50 +02005052 ch = (Py_UCS4)(unsigned char)(starts[i]);
5053 PyUnicode_WRITE(writer.kind, writer.data, writer.pos,
5054 ch + 0xdc00);
5055 writer.pos++;
5056 }
5057 s += (endinpos - startinpos);
5058 break;
Victor Stinner74e8fac2015-10-05 13:49:26 +02005059 }
Victor Stinner1d65d912015-10-05 13:43:50 +02005060
5061 default:
5062 if (unicode_decode_call_errorhandler_writer(
5063 errors, &error_handler_obj,
5064 "utf-8", errmsg,
5065 &starts, &end, &startinpos, &endinpos, &exc, &s,
5066 &writer))
5067 goto onError;
5068 }
Victor Stinner785938e2011-12-11 20:09:03 +01005069 }
5070
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005071End:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005072 if (consumed)
5073 *consumed = s - starts;
5074
Victor Stinner1d65d912015-10-05 13:43:50 +02005075 Py_XDECREF(error_handler_obj);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005076 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005077 return _PyUnicodeWriter_Finish(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005078
5079onError:
Victor Stinner1d65d912015-10-05 13:43:50 +02005080 Py_XDECREF(error_handler_obj);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005081 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005082 _PyUnicodeWriter_Dealloc(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005083 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01005084}
5085
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005086
Victor Stinner709d23d2019-05-02 14:56:30 -04005087PyObject *
5088PyUnicode_DecodeUTF8Stateful(const char *s,
5089 Py_ssize_t size,
5090 const char *errors,
5091 Py_ssize_t *consumed)
5092{
5093 return unicode_decode_utf8(s, size, _Py_ERROR_UNKNOWN, errors, consumed);
5094}
5095
5096
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005097/* UTF-8 decoder: use surrogateescape error handler if 'surrogateescape' is
5098 non-zero, use strict error handler otherwise.
Victor Stinner0d92c4f2012-11-12 23:32:21 +01005099
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005100 On success, write a pointer to a newly allocated wide character string into
5101 *wstr (use PyMem_RawFree() to free the memory) and write the output length
5102 (in number of wchar_t units) into *wlen (if wlen is set).
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005103
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005104 On memory allocation failure, return -1.
5105
5106 On decoding error (if surrogateescape is zero), return -2. If wlen is
5107 non-NULL, write the start of the illegal byte sequence into *wlen. If reason
5108 is not NULL, write the decoding error message into *reason. */
5109int
5110_Py_DecodeUTF8Ex(const char *s, Py_ssize_t size, wchar_t **wstr, size_t *wlen,
Victor Stinner3d4226a2018-08-29 22:21:32 +02005111 const char **reason, _Py_error_handler errors)
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005112{
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005113 const char *orig_s = s;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005114 const char *e;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005115 wchar_t *unicode;
5116 Py_ssize_t outpos;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005117
Victor Stinner3d4226a2018-08-29 22:21:32 +02005118 int surrogateescape = 0;
5119 int surrogatepass = 0;
5120 switch (errors)
5121 {
5122 case _Py_ERROR_STRICT:
5123 break;
5124 case _Py_ERROR_SURROGATEESCAPE:
5125 surrogateescape = 1;
5126 break;
5127 case _Py_ERROR_SURROGATEPASS:
5128 surrogatepass = 1;
5129 break;
5130 default:
5131 return -3;
5132 }
5133
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005134 /* Note: size will always be longer than the resulting Unicode
5135 character count */
Victor Stinner91106cd2017-12-13 12:29:09 +01005136 if (PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) < (size + 1)) {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005137 return -1;
Victor Stinner91106cd2017-12-13 12:29:09 +01005138 }
5139
Victor Stinner6f8eeee2013-07-07 22:57:45 +02005140 unicode = PyMem_RawMalloc((size + 1) * sizeof(wchar_t));
Victor Stinner91106cd2017-12-13 12:29:09 +01005141 if (!unicode) {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005142 return -1;
Victor Stinner91106cd2017-12-13 12:29:09 +01005143 }
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005144
5145 /* Unpack UTF-8 encoded data */
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005146 e = s + size;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005147 outpos = 0;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005148 while (s < e) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005149 Py_UCS4 ch;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005150#if SIZEOF_WCHAR_T == 4
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005151 ch = ucs4lib_utf8_decode(&s, e, (Py_UCS4 *)unicode, &outpos);
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005152#else
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005153 ch = ucs2lib_utf8_decode(&s, e, (Py_UCS2 *)unicode, &outpos);
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005154#endif
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005155 if (ch > 0xFF) {
5156#if SIZEOF_WCHAR_T == 4
Barry Warsawb2e57942017-09-14 18:13:16 -07005157 Py_UNREACHABLE();
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005158#else
Serhiy Storchakab6266432016-11-12 14:28:06 +02005159 assert(ch > 0xFFFF && ch <= MAX_UNICODE);
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005160 /* write a surrogate pair */
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005161 unicode[outpos++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
5162 unicode[outpos++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
5163#endif
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005164 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005165 else {
Victor Stinner3d4226a2018-08-29 22:21:32 +02005166 if (!ch && s == e) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005167 break;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005168 }
Victor Stinner3d4226a2018-08-29 22:21:32 +02005169
5170 if (surrogateescape) {
5171 unicode[outpos++] = 0xDC00 + (unsigned char)*s++;
5172 }
5173 else {
5174 /* Is it a valid three-byte code? */
5175 if (surrogatepass
5176 && (e - s) >= 3
5177 && (s[0] & 0xf0) == 0xe0
5178 && (s[1] & 0xc0) == 0x80
5179 && (s[2] & 0xc0) == 0x80)
5180 {
5181 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
5182 s += 3;
5183 unicode[outpos++] = ch;
5184 }
5185 else {
5186 PyMem_RawFree(unicode );
5187 if (reason != NULL) {
5188 switch (ch) {
5189 case 0:
5190 *reason = "unexpected end of data";
5191 break;
5192 case 1:
5193 *reason = "invalid start byte";
5194 break;
5195 /* 2, 3, 4 */
5196 default:
5197 *reason = "invalid continuation byte";
5198 break;
5199 }
5200 }
5201 if (wlen != NULL) {
5202 *wlen = s - orig_s;
5203 }
5204 return -2;
5205 }
5206 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005207 }
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005208 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005209 unicode[outpos] = L'\0';
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005210 if (wlen) {
5211 *wlen = outpos;
Victor Stinner91106cd2017-12-13 12:29:09 +01005212 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005213 *wstr = unicode;
5214 return 0;
5215}
5216
Victor Stinner5f9cf232019-03-19 01:46:25 +01005217
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005218wchar_t*
Victor Stinner5f9cf232019-03-19 01:46:25 +01005219_Py_DecodeUTF8_surrogateescape(const char *arg, Py_ssize_t arglen,
5220 size_t *wlen)
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005221{
5222 wchar_t *wstr;
Victor Stinner5f9cf232019-03-19 01:46:25 +01005223 int res = _Py_DecodeUTF8Ex(arg, arglen,
5224 &wstr, wlen,
5225 NULL, _Py_ERROR_SURROGATEESCAPE);
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005226 if (res != 0) {
Victor Stinner5f9cf232019-03-19 01:46:25 +01005227 /* _Py_DecodeUTF8Ex() must support _Py_ERROR_SURROGATEESCAPE */
5228 assert(res != -3);
5229 if (wlen) {
5230 *wlen = (size_t)res;
5231 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005232 return NULL;
5233 }
5234 return wstr;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005235}
5236
Antoine Pitrouab868312009-01-10 15:40:25 +00005237
Victor Stinnere47e6982017-12-21 15:45:16 +01005238/* UTF-8 encoder using the surrogateescape error handler .
5239
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005240 On success, return 0 and write the newly allocated character string (use
5241 PyMem_Free() to free the memory) into *str.
Victor Stinnere47e6982017-12-21 15:45:16 +01005242
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005243 On encoding failure, return -2 and write the position of the invalid
5244 surrogate character into *error_pos (if error_pos is set) and the decoding
5245 error message into *reason (if reason is set).
Victor Stinnere47e6982017-12-21 15:45:16 +01005246
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005247 On memory allocation failure, return -1. */
5248int
5249_Py_EncodeUTF8Ex(const wchar_t *text, char **str, size_t *error_pos,
Victor Stinner3d4226a2018-08-29 22:21:32 +02005250 const char **reason, int raw_malloc, _Py_error_handler errors)
Victor Stinnere47e6982017-12-21 15:45:16 +01005251{
5252 const Py_ssize_t max_char_size = 4;
5253 Py_ssize_t len = wcslen(text);
5254
5255 assert(len >= 0);
5256
Victor Stinner3d4226a2018-08-29 22:21:32 +02005257 int surrogateescape = 0;
5258 int surrogatepass = 0;
5259 switch (errors)
5260 {
5261 case _Py_ERROR_STRICT:
5262 break;
5263 case _Py_ERROR_SURROGATEESCAPE:
5264 surrogateescape = 1;
5265 break;
5266 case _Py_ERROR_SURROGATEPASS:
5267 surrogatepass = 1;
5268 break;
5269 default:
5270 return -3;
5271 }
5272
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005273 if (len > PY_SSIZE_T_MAX / max_char_size - 1) {
5274 return -1;
5275 }
Victor Stinnere47e6982017-12-21 15:45:16 +01005276 char *bytes;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005277 if (raw_malloc) {
5278 bytes = PyMem_RawMalloc((len + 1) * max_char_size);
Victor Stinnere47e6982017-12-21 15:45:16 +01005279 }
5280 else {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005281 bytes = PyMem_Malloc((len + 1) * max_char_size);
Victor Stinnere47e6982017-12-21 15:45:16 +01005282 }
5283 if (bytes == NULL) {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005284 return -1;
Victor Stinnere47e6982017-12-21 15:45:16 +01005285 }
5286
5287 char *p = bytes;
5288 Py_ssize_t i;
Victor Stinner3d4226a2018-08-29 22:21:32 +02005289 for (i = 0; i < len; ) {
5290 Py_ssize_t ch_pos = i;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005291 Py_UCS4 ch = text[i];
Victor Stinner3d4226a2018-08-29 22:21:32 +02005292 i++;
5293#if Py_UNICODE_SIZE == 2
5294 if (Py_UNICODE_IS_HIGH_SURROGATE(ch)
5295 && i < len
5296 && Py_UNICODE_IS_LOW_SURROGATE(text[i]))
5297 {
5298 ch = Py_UNICODE_JOIN_SURROGATES(ch, text[i]);
5299 i++;
5300 }
5301#endif
Victor Stinnere47e6982017-12-21 15:45:16 +01005302
5303 if (ch < 0x80) {
5304 /* Encode ASCII */
5305 *p++ = (char) ch;
5306
5307 }
5308 else if (ch < 0x0800) {
5309 /* Encode Latin-1 */
5310 *p++ = (char)(0xc0 | (ch >> 6));
5311 *p++ = (char)(0x80 | (ch & 0x3f));
5312 }
Victor Stinner3d4226a2018-08-29 22:21:32 +02005313 else if (Py_UNICODE_IS_SURROGATE(ch) && !surrogatepass) {
Victor Stinnere47e6982017-12-21 15:45:16 +01005314 /* surrogateescape error handler */
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005315 if (!surrogateescape || !(0xDC80 <= ch && ch <= 0xDCFF)) {
Victor Stinnere47e6982017-12-21 15:45:16 +01005316 if (error_pos != NULL) {
Victor Stinner3d4226a2018-08-29 22:21:32 +02005317 *error_pos = (size_t)ch_pos;
Victor Stinnere47e6982017-12-21 15:45:16 +01005318 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005319 if (reason != NULL) {
5320 *reason = "encoding error";
5321 }
5322 if (raw_malloc) {
5323 PyMem_RawFree(bytes);
5324 }
5325 else {
5326 PyMem_Free(bytes);
5327 }
5328 return -2;
Victor Stinnere47e6982017-12-21 15:45:16 +01005329 }
5330 *p++ = (char)(ch & 0xff);
5331 }
5332 else if (ch < 0x10000) {
5333 *p++ = (char)(0xe0 | (ch >> 12));
5334 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
5335 *p++ = (char)(0x80 | (ch & 0x3f));
5336 }
5337 else { /* ch >= 0x10000 */
5338 assert(ch <= MAX_UNICODE);
5339 /* Encode UCS4 Unicode ordinals */
5340 *p++ = (char)(0xf0 | (ch >> 18));
5341 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
5342 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
5343 *p++ = (char)(0x80 | (ch & 0x3f));
5344 }
5345 }
5346 *p++ = '\0';
5347
5348 size_t final_size = (p - bytes);
Victor Stinner9dd76202017-12-21 16:20:32 +01005349 char *bytes2;
5350 if (raw_malloc) {
5351 bytes2 = PyMem_RawRealloc(bytes, final_size);
5352 }
5353 else {
5354 bytes2 = PyMem_Realloc(bytes, final_size);
5355 }
Victor Stinnere47e6982017-12-21 15:45:16 +01005356 if (bytes2 == NULL) {
5357 if (error_pos != NULL) {
5358 *error_pos = (size_t)-1;
5359 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005360 if (raw_malloc) {
5361 PyMem_RawFree(bytes);
5362 }
5363 else {
5364 PyMem_Free(bytes);
5365 }
5366 return -1;
Victor Stinnere47e6982017-12-21 15:45:16 +01005367 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005368 *str = bytes2;
5369 return 0;
Victor Stinnere47e6982017-12-21 15:45:16 +01005370}
5371
5372
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005373/* Primary internal function which creates utf8 encoded bytes objects.
5374
5375 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00005376 and allocate exactly as much space needed at the end. Else allocate the
5377 maximum possible needed (4 result bytes per Unicode character), and return
5378 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00005379*/
Victor Stinner709d23d2019-05-02 14:56:30 -04005380static PyObject *
5381unicode_encode_utf8(PyObject *unicode, _Py_error_handler error_handler,
5382 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005383{
Victor Stinner6099a032011-12-18 14:22:26 +01005384 enum PyUnicode_Kind kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005385 void *data;
5386 Py_ssize_t size;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00005387
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005388 if (!PyUnicode_Check(unicode)) {
5389 PyErr_BadArgument();
5390 return NULL;
5391 }
5392
5393 if (PyUnicode_READY(unicode) == -1)
5394 return NULL;
5395
Victor Stinnere90fe6a2011-10-01 16:48:13 +02005396 if (PyUnicode_UTF8(unicode))
5397 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
5398 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005399
5400 kind = PyUnicode_KIND(unicode);
5401 data = PyUnicode_DATA(unicode);
5402 size = PyUnicode_GET_LENGTH(unicode);
5403
Benjamin Petersonead6b532011-12-20 17:23:42 -06005404 switch (kind) {
Victor Stinner6099a032011-12-18 14:22:26 +01005405 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07005406 Py_UNREACHABLE();
Victor Stinner6099a032011-12-18 14:22:26 +01005407 case PyUnicode_1BYTE_KIND:
5408 /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
5409 assert(!PyUnicode_IS_ASCII(unicode));
Victor Stinner709d23d2019-05-02 14:56:30 -04005410 return ucs1lib_utf8_encoder(unicode, data, size, error_handler, errors);
Victor Stinner6099a032011-12-18 14:22:26 +01005411 case PyUnicode_2BYTE_KIND:
Victor Stinner709d23d2019-05-02 14:56:30 -04005412 return ucs2lib_utf8_encoder(unicode, data, size, error_handler, errors);
Victor Stinner6099a032011-12-18 14:22:26 +01005413 case PyUnicode_4BYTE_KIND:
Victor Stinner709d23d2019-05-02 14:56:30 -04005414 return ucs4lib_utf8_encoder(unicode, data, size, error_handler, errors);
Tim Peters602f7402002-04-27 18:03:26 +00005415 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005416}
5417
Alexander Belopolsky40018472011-02-26 01:02:56 +00005418PyObject *
Victor Stinner709d23d2019-05-02 14:56:30 -04005419_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
5420{
5421 return unicode_encode_utf8(unicode, _Py_ERROR_UNKNOWN, errors);
5422}
5423
5424
5425PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005426PyUnicode_EncodeUTF8(const Py_UNICODE *s,
5427 Py_ssize_t size,
5428 const char *errors)
5429{
5430 PyObject *v, *unicode;
5431
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02005432 unicode = PyUnicode_FromWideChar(s, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005433 if (unicode == NULL)
5434 return NULL;
5435 v = _PyUnicode_AsUTF8String(unicode, errors);
5436 Py_DECREF(unicode);
5437 return v;
5438}
5439
5440PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005441PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005442{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005443 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005444}
5445
Walter Dörwald41980ca2007-08-16 21:55:45 +00005446/* --- UTF-32 Codec ------------------------------------------------------- */
5447
5448PyObject *
5449PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005450 Py_ssize_t size,
5451 const char *errors,
5452 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005453{
5454 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
5455}
5456
5457PyObject *
5458PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005459 Py_ssize_t size,
5460 const char *errors,
5461 int *byteorder,
5462 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005463{
5464 const char *starts = s;
5465 Py_ssize_t startinpos;
5466 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005467 _PyUnicodeWriter writer;
Mark Dickinson7db923c2010-06-12 09:10:14 +00005468 const unsigned char *q, *e;
Victor Stinnere64322e2012-10-30 23:12:47 +01005469 int le, bo = 0; /* assume native ordering by default */
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005470 const char *encoding;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005471 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00005472 PyObject *errorHandler = NULL;
5473 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00005474
Walter Dörwald41980ca2007-08-16 21:55:45 +00005475 q = (unsigned char *)s;
5476 e = q + size;
5477
5478 if (byteorder)
5479 bo = *byteorder;
5480
5481 /* Check for BOM marks (U+FEFF) in the input and adjust current
5482 byte order setting accordingly. In native mode, the leading BOM
5483 mark is skipped, in all other modes, it is copied to the output
5484 stream as-is (giving a ZWNBSP character). */
Victor Stinnere64322e2012-10-30 23:12:47 +01005485 if (bo == 0 && size >= 4) {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005486 Py_UCS4 bom = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
Victor Stinnere64322e2012-10-30 23:12:47 +01005487 if (bom == 0x0000FEFF) {
5488 bo = -1;
5489 q += 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00005490 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005491 else if (bom == 0xFFFE0000) {
5492 bo = 1;
5493 q += 4;
5494 }
5495 if (byteorder)
5496 *byteorder = bo;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005497 }
5498
Victor Stinnere64322e2012-10-30 23:12:47 +01005499 if (q == e) {
5500 if (consumed)
5501 *consumed = size;
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02005502 _Py_RETURN_UNICODE_EMPTY();
Walter Dörwald41980ca2007-08-16 21:55:45 +00005503 }
5504
Victor Stinnere64322e2012-10-30 23:12:47 +01005505#ifdef WORDS_BIGENDIAN
5506 le = bo < 0;
5507#else
5508 le = bo <= 0;
5509#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005510 encoding = le ? "utf-32-le" : "utf-32-be";
Victor Stinnere64322e2012-10-30 23:12:47 +01005511
Victor Stinner8f674cc2013-04-17 23:02:17 +02005512 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02005513 writer.min_length = (e - q + 3) / 4;
5514 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005515 goto onError;
Victor Stinnere64322e2012-10-30 23:12:47 +01005516
Victor Stinnere64322e2012-10-30 23:12:47 +01005517 while (1) {
5518 Py_UCS4 ch = 0;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005519 Py_UCS4 maxch = PyUnicode_MAX_CHAR_VALUE(writer.buffer);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005520
Victor Stinnere64322e2012-10-30 23:12:47 +01005521 if (e - q >= 4) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005522 enum PyUnicode_Kind kind = writer.kind;
5523 void *data = writer.data;
Victor Stinnere64322e2012-10-30 23:12:47 +01005524 const unsigned char *last = e - 4;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005525 Py_ssize_t pos = writer.pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01005526 if (le) {
5527 do {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005528 ch = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
Victor Stinnere64322e2012-10-30 23:12:47 +01005529 if (ch > maxch)
5530 break;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005531 if (kind != PyUnicode_1BYTE_KIND &&
5532 Py_UNICODE_IS_SURROGATE(ch))
5533 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005534 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01005535 q += 4;
5536 } while (q <= last);
5537 }
5538 else {
5539 do {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005540 ch = ((unsigned int)q[0] << 24) | (q[1] << 16) | (q[2] << 8) | q[3];
Victor Stinnere64322e2012-10-30 23:12:47 +01005541 if (ch > maxch)
5542 break;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005543 if (kind != PyUnicode_1BYTE_KIND &&
5544 Py_UNICODE_IS_SURROGATE(ch))
5545 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005546 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01005547 q += 4;
5548 } while (q <= last);
5549 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005550 writer.pos = pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01005551 }
5552
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005553 if (Py_UNICODE_IS_SURROGATE(ch)) {
Serhiy Storchakad3faf432015-01-18 11:28:37 +02005554 errmsg = "code point in surrogate code point range(0xd800, 0xe000)";
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005555 startinpos = ((const char *)q) - starts;
5556 endinpos = startinpos + 4;
5557 }
5558 else if (ch <= maxch) {
Victor Stinnere64322e2012-10-30 23:12:47 +01005559 if (q == e || consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005560 break;
Victor Stinnere64322e2012-10-30 23:12:47 +01005561 /* remaining bytes at the end? (size should be divisible by 4) */
Benjamin Peterson29060642009-01-31 22:14:21 +00005562 errmsg = "truncated data";
Victor Stinnere64322e2012-10-30 23:12:47 +01005563 startinpos = ((const char *)q) - starts;
5564 endinpos = ((const char *)e) - starts;
Benjamin Peterson29060642009-01-31 22:14:21 +00005565 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005566 else {
5567 if (ch < 0x110000) {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005568 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinnere64322e2012-10-30 23:12:47 +01005569 goto onError;
5570 q += 4;
5571 continue;
5572 }
Serhiy Storchakad3faf432015-01-18 11:28:37 +02005573 errmsg = "code point not in range(0x110000)";
Victor Stinnere64322e2012-10-30 23:12:47 +01005574 startinpos = ((const char *)q) - starts;
5575 endinpos = startinpos + 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00005576 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005577
5578 /* The remaining input chars are ignored if the callback
5579 chooses to skip the input */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005580 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00005581 errors, &errorHandler,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005582 encoding, errmsg,
Benjamin Peterson29060642009-01-31 22:14:21 +00005583 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005584 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005585 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005586 }
5587
Walter Dörwald41980ca2007-08-16 21:55:45 +00005588 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005589 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005590
Walter Dörwald41980ca2007-08-16 21:55:45 +00005591 Py_XDECREF(errorHandler);
5592 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005593 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005594
Benjamin Peterson29060642009-01-31 22:14:21 +00005595 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005596 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005597 Py_XDECREF(errorHandler);
5598 Py_XDECREF(exc);
5599 return NULL;
5600}
5601
5602PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005603_PyUnicode_EncodeUTF32(PyObject *str,
5604 const char *errors,
5605 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005606{
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005607 enum PyUnicode_Kind kind;
5608 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005609 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005610 PyObject *v;
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005611 uint32_t *out;
Christian Heimes743e0cd2012-10-17 23:52:17 +02005612#if PY_LITTLE_ENDIAN
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005613 int native_ordering = byteorder <= 0;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005614#else
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005615 int native_ordering = byteorder >= 0;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005616#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005617 const char *encoding;
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005618 Py_ssize_t nsize, pos;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005619 PyObject *errorHandler = NULL;
5620 PyObject *exc = NULL;
5621 PyObject *rep = NULL;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005622
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005623 if (!PyUnicode_Check(str)) {
5624 PyErr_BadArgument();
5625 return NULL;
5626 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005627 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005628 return NULL;
5629 kind = PyUnicode_KIND(str);
5630 data = PyUnicode_DATA(str);
5631 len = PyUnicode_GET_LENGTH(str);
5632
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005633 if (len > PY_SSIZE_T_MAX / 4 - (byteorder == 0))
Serhiy Storchaka30793282014-01-04 22:44:01 +02005634 return PyErr_NoMemory();
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005635 nsize = len + (byteorder == 0);
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01005636 v = PyBytes_FromStringAndSize(NULL, nsize * 4);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005637 if (v == NULL)
5638 return NULL;
5639
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005640 /* output buffer is 4-bytes aligned */
5641 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 4));
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005642 out = (uint32_t *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005643 if (byteorder == 0)
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005644 *out++ = 0xFEFF;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005645 if (len == 0)
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005646 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005647
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005648 if (byteorder == -1)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005649 encoding = "utf-32-le";
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005650 else if (byteorder == 1)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005651 encoding = "utf-32-be";
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005652 else
5653 encoding = "utf-32";
5654
5655 if (kind == PyUnicode_1BYTE_KIND) {
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005656 ucs1lib_utf32_encode((const Py_UCS1 *)data, len, &out, native_ordering);
5657 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005658 }
5659
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005660 pos = 0;
5661 while (pos < len) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005662 Py_ssize_t repsize, moreunits;
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005663
5664 if (kind == PyUnicode_2BYTE_KIND) {
5665 pos += ucs2lib_utf32_encode((const Py_UCS2 *)data + pos, len - pos,
5666 &out, native_ordering);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005667 }
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005668 else {
5669 assert(kind == PyUnicode_4BYTE_KIND);
5670 pos += ucs4lib_utf32_encode((const Py_UCS4 *)data + pos, len - pos,
5671 &out, native_ordering);
5672 }
5673 if (pos == len)
5674 break;
Guido van Rossum98297ee2007-11-06 21:34:58 +00005675
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005676 rep = unicode_encode_call_errorhandler(
5677 errors, &errorHandler,
5678 encoding, "surrogates not allowed",
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005679 str, &exc, pos, pos + 1, &pos);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005680 if (!rep)
5681 goto error;
5682
5683 if (PyBytes_Check(rep)) {
5684 repsize = PyBytes_GET_SIZE(rep);
5685 if (repsize & 3) {
5686 raise_encode_exception(&exc, encoding,
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005687 str, pos - 1, pos,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005688 "surrogates not allowed");
5689 goto error;
5690 }
5691 moreunits = repsize / 4;
5692 }
5693 else {
5694 assert(PyUnicode_Check(rep));
5695 if (PyUnicode_READY(rep) < 0)
5696 goto error;
5697 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5698 if (!PyUnicode_IS_ASCII(rep)) {
5699 raise_encode_exception(&exc, encoding,
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005700 str, pos - 1, pos,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005701 "surrogates not allowed");
5702 goto error;
5703 }
5704 }
5705
5706 /* four bytes are reserved for each surrogate */
5707 if (moreunits > 1) {
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005708 Py_ssize_t outpos = out - (uint32_t*) PyBytes_AS_STRING(v);
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03005709 if (moreunits >= (PY_SSIZE_T_MAX - PyBytes_GET_SIZE(v)) / 4) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005710 /* integer overflow */
5711 PyErr_NoMemory();
5712 goto error;
5713 }
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03005714 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + 4 * (moreunits - 1)) < 0)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005715 goto error;
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005716 out = (uint32_t*) PyBytes_AS_STRING(v) + outpos;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005717 }
5718
5719 if (PyBytes_Check(rep)) {
Christian Heimesf051e432016-09-13 20:22:02 +02005720 memcpy(out, PyBytes_AS_STRING(rep), repsize);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005721 out += moreunits;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005722 } else /* rep is unicode */ {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005723 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005724 ucs1lib_utf32_encode(PyUnicode_1BYTE_DATA(rep), repsize,
5725 &out, native_ordering);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005726 }
5727
5728 Py_CLEAR(rep);
5729 }
5730
5731 /* Cut back to size actually needed. This is necessary for, for example,
5732 encoding of a string containing isolated surrogates and the 'ignore'
5733 handler is used. */
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005734 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005735 if (nsize != PyBytes_GET_SIZE(v))
5736 _PyBytes_Resize(&v, nsize);
5737 Py_XDECREF(errorHandler);
5738 Py_XDECREF(exc);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005739 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005740 return v;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005741 error:
5742 Py_XDECREF(rep);
5743 Py_XDECREF(errorHandler);
5744 Py_XDECREF(exc);
5745 Py_XDECREF(v);
5746 return NULL;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005747}
5748
Alexander Belopolsky40018472011-02-26 01:02:56 +00005749PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005750PyUnicode_EncodeUTF32(const Py_UNICODE *s,
5751 Py_ssize_t size,
5752 const char *errors,
5753 int byteorder)
5754{
5755 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02005756 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005757 if (tmp == NULL)
5758 return NULL;
5759 result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder);
5760 Py_DECREF(tmp);
5761 return result;
5762}
5763
5764PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005765PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005766{
Victor Stinnerb960b342011-11-20 19:12:52 +01005767 return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005768}
5769
Guido van Rossumd57fd912000-03-10 22:53:23 +00005770/* --- UTF-16 Codec ------------------------------------------------------- */
5771
Tim Peters772747b2001-08-09 22:21:55 +00005772PyObject *
5773PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005774 Py_ssize_t size,
5775 const char *errors,
5776 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005777{
Walter Dörwald69652032004-09-07 20:24:22 +00005778 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5779}
5780
5781PyObject *
5782PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005783 Py_ssize_t size,
5784 const char *errors,
5785 int *byteorder,
5786 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00005787{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005788 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005789 Py_ssize_t startinpos;
5790 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005791 _PyUnicodeWriter writer;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005792 const unsigned char *q, *e;
Tim Peters772747b2001-08-09 22:21:55 +00005793 int bo = 0; /* assume native ordering by default */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005794 int native_ordering;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00005795 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005796 PyObject *errorHandler = NULL;
5797 PyObject *exc = NULL;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005798 const char *encoding;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005799
Tim Peters772747b2001-08-09 22:21:55 +00005800 q = (unsigned char *)s;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005801 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005802
5803 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00005804 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005805
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005806 /* Check for BOM marks (U+FEFF) in the input and adjust current
5807 byte order setting accordingly. In native mode, the leading BOM
5808 mark is skipped, in all other modes, it is copied to the output
5809 stream as-is (giving a ZWNBSP character). */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005810 if (bo == 0 && size >= 2) {
5811 const Py_UCS4 bom = (q[1] << 8) | q[0];
5812 if (bom == 0xFEFF) {
5813 q += 2;
5814 bo = -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00005815 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005816 else if (bom == 0xFFFE) {
5817 q += 2;
5818 bo = 1;
5819 }
5820 if (byteorder)
5821 *byteorder = bo;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005822 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005823
Antoine Pitrou63065d72012-05-15 23:48:04 +02005824 if (q == e) {
5825 if (consumed)
5826 *consumed = size;
Serhiy Storchaka678db842013-01-26 12:16:36 +02005827 _Py_RETURN_UNICODE_EMPTY();
Tim Peters772747b2001-08-09 22:21:55 +00005828 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005829
Christian Heimes743e0cd2012-10-17 23:52:17 +02005830#if PY_LITTLE_ENDIAN
Antoine Pitrou63065d72012-05-15 23:48:04 +02005831 native_ordering = bo <= 0;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005832 encoding = bo <= 0 ? "utf-16-le" : "utf-16-be";
Antoine Pitrouab868312009-01-10 15:40:25 +00005833#else
Antoine Pitrou63065d72012-05-15 23:48:04 +02005834 native_ordering = bo >= 0;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005835 encoding = bo >= 0 ? "utf-16-be" : "utf-16-le";
Antoine Pitrouab868312009-01-10 15:40:25 +00005836#endif
Tim Peters772747b2001-08-09 22:21:55 +00005837
Antoine Pitrou63065d72012-05-15 23:48:04 +02005838 /* Note: size will always be longer than the resulting Unicode
Xiang Zhang2c7fd462018-01-31 20:48:05 +08005839 character count normally. Error handler will take care of
5840 resizing when needed. */
Victor Stinner8f674cc2013-04-17 23:02:17 +02005841 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02005842 writer.min_length = (e - q + 1) / 2;
5843 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005844 goto onError;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005845
Antoine Pitrou63065d72012-05-15 23:48:04 +02005846 while (1) {
5847 Py_UCS4 ch = 0;
5848 if (e - q >= 2) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005849 int kind = writer.kind;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005850 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005851 if (PyUnicode_IS_ASCII(writer.buffer))
Antoine Pitrou63065d72012-05-15 23:48:04 +02005852 ch = asciilib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005853 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005854 native_ordering);
5855 else
5856 ch = ucs1lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005857 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005858 native_ordering);
5859 } else if (kind == PyUnicode_2BYTE_KIND) {
5860 ch = ucs2lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005861 (Py_UCS2*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005862 native_ordering);
5863 } else {
5864 assert(kind == PyUnicode_4BYTE_KIND);
5865 ch = ucs4lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005866 (Py_UCS4*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005867 native_ordering);
Antoine Pitrouab868312009-01-10 15:40:25 +00005868 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005869 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005870
Antoine Pitrou63065d72012-05-15 23:48:04 +02005871 switch (ch)
5872 {
5873 case 0:
5874 /* remaining byte at the end? (size should be even) */
5875 if (q == e || consumed)
5876 goto End;
5877 errmsg = "truncated data";
5878 startinpos = ((const char *)q) - starts;
5879 endinpos = ((const char *)e) - starts;
5880 break;
5881 /* The remaining input chars are ignored if the callback
5882 chooses to skip the input */
5883 case 1:
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02005884 q -= 2;
5885 if (consumed)
Serhiy Storchakaae3b32a2013-01-08 23:40:52 +02005886 goto End;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005887 errmsg = "unexpected end of data";
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02005888 startinpos = ((const char *)q) - starts;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005889 endinpos = ((const char *)e) - starts;
5890 break;
5891 case 2:
5892 errmsg = "illegal encoding";
5893 startinpos = ((const char *)q) - 2 - starts;
5894 endinpos = startinpos + 2;
5895 break;
5896 case 3:
5897 errmsg = "illegal UTF-16 surrogate";
5898 startinpos = ((const char *)q) - 4 - starts;
5899 endinpos = startinpos + 2;
5900 break;
5901 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005902 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005903 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005904 continue;
5905 }
5906
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005907 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrouab868312009-01-10 15:40:25 +00005908 errors,
5909 &errorHandler,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005910 encoding, errmsg,
Antoine Pitrouab868312009-01-10 15:40:25 +00005911 &starts,
5912 (const char **)&e,
5913 &startinpos,
5914 &endinpos,
5915 &exc,
5916 (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005917 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005918 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005919 }
5920
Antoine Pitrou63065d72012-05-15 23:48:04 +02005921End:
Walter Dörwald69652032004-09-07 20:24:22 +00005922 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005923 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00005924
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005925 Py_XDECREF(errorHandler);
5926 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005927 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005928
Benjamin Peterson29060642009-01-31 22:14:21 +00005929 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005930 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005931 Py_XDECREF(errorHandler);
5932 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005933 return NULL;
5934}
5935
Tim Peters772747b2001-08-09 22:21:55 +00005936PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005937_PyUnicode_EncodeUTF16(PyObject *str,
5938 const char *errors,
5939 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005940{
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005941 enum PyUnicode_Kind kind;
5942 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005943 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005944 PyObject *v;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005945 unsigned short *out;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005946 Py_ssize_t pairs;
Christian Heimes743e0cd2012-10-17 23:52:17 +02005947#if PY_BIG_ENDIAN
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005948 int native_ordering = byteorder >= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005949#else
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005950 int native_ordering = byteorder <= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005951#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005952 const char *encoding;
5953 Py_ssize_t nsize, pos;
5954 PyObject *errorHandler = NULL;
5955 PyObject *exc = NULL;
5956 PyObject *rep = NULL;
Tim Peters772747b2001-08-09 22:21:55 +00005957
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005958 if (!PyUnicode_Check(str)) {
5959 PyErr_BadArgument();
5960 return NULL;
5961 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005962 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005963 return NULL;
5964 kind = PyUnicode_KIND(str);
5965 data = PyUnicode_DATA(str);
5966 len = PyUnicode_GET_LENGTH(str);
Victor Stinner0e368262011-11-10 20:12:49 +01005967
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005968 pairs = 0;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005969 if (kind == PyUnicode_4BYTE_KIND) {
5970 const Py_UCS4 *in = (const Py_UCS4 *)data;
5971 const Py_UCS4 *end = in + len;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005972 while (in < end) {
5973 if (*in++ >= 0x10000) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005974 pairs++;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005975 }
5976 }
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005977 }
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005978 if (len > PY_SSIZE_T_MAX / 2 - pairs - (byteorder == 0)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005979 return PyErr_NoMemory();
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005980 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005981 nsize = len + pairs + (byteorder == 0);
5982 v = PyBytes_FromStringAndSize(NULL, nsize * 2);
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005983 if (v == NULL) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005984 return NULL;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005985 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005986
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005987 /* output buffer is 2-bytes aligned */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02005988 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 2));
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005989 out = (unsigned short *)PyBytes_AS_STRING(v);
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005990 if (byteorder == 0) {
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005991 *out++ = 0xFEFF;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005992 }
5993 if (len == 0) {
Guido van Rossum98297ee2007-11-06 21:34:58 +00005994 goto done;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005995 }
Tim Peters772747b2001-08-09 22:21:55 +00005996
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005997 if (kind == PyUnicode_1BYTE_KIND) {
5998 ucs1lib_utf16_encode((const Py_UCS1 *)data, len, &out, native_ordering);
5999 goto done;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00006000 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006001
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006002 if (byteorder < 0) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006003 encoding = "utf-16-le";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006004 }
6005 else if (byteorder > 0) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006006 encoding = "utf-16-be";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006007 }
6008 else {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006009 encoding = "utf-16";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006010 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006011
6012 pos = 0;
6013 while (pos < len) {
6014 Py_ssize_t repsize, moreunits;
6015
6016 if (kind == PyUnicode_2BYTE_KIND) {
6017 pos += ucs2lib_utf16_encode((const Py_UCS2 *)data + pos, len - pos,
6018 &out, native_ordering);
6019 }
6020 else {
6021 assert(kind == PyUnicode_4BYTE_KIND);
6022 pos += ucs4lib_utf16_encode((const Py_UCS4 *)data + pos, len - pos,
6023 &out, native_ordering);
6024 }
6025 if (pos == len)
6026 break;
6027
6028 rep = unicode_encode_call_errorhandler(
6029 errors, &errorHandler,
6030 encoding, "surrogates not allowed",
6031 str, &exc, pos, pos + 1, &pos);
6032 if (!rep)
6033 goto error;
6034
6035 if (PyBytes_Check(rep)) {
6036 repsize = PyBytes_GET_SIZE(rep);
6037 if (repsize & 1) {
6038 raise_encode_exception(&exc, encoding,
6039 str, pos - 1, pos,
6040 "surrogates not allowed");
6041 goto error;
6042 }
6043 moreunits = repsize / 2;
6044 }
6045 else {
6046 assert(PyUnicode_Check(rep));
6047 if (PyUnicode_READY(rep) < 0)
6048 goto error;
6049 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
6050 if (!PyUnicode_IS_ASCII(rep)) {
6051 raise_encode_exception(&exc, encoding,
6052 str, pos - 1, pos,
6053 "surrogates not allowed");
6054 goto error;
6055 }
6056 }
6057
6058 /* two bytes are reserved for each surrogate */
6059 if (moreunits > 1) {
6060 Py_ssize_t outpos = out - (unsigned short*) PyBytes_AS_STRING(v);
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03006061 if (moreunits >= (PY_SSIZE_T_MAX - PyBytes_GET_SIZE(v)) / 2) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006062 /* integer overflow */
6063 PyErr_NoMemory();
6064 goto error;
6065 }
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03006066 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + 2 * (moreunits - 1)) < 0)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006067 goto error;
6068 out = (unsigned short*) PyBytes_AS_STRING(v) + outpos;
6069 }
6070
6071 if (PyBytes_Check(rep)) {
Christian Heimesf051e432016-09-13 20:22:02 +02006072 memcpy(out, PyBytes_AS_STRING(rep), repsize);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006073 out += moreunits;
6074 } else /* rep is unicode */ {
6075 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
6076 ucs1lib_utf16_encode(PyUnicode_1BYTE_DATA(rep), repsize,
6077 &out, native_ordering);
6078 }
6079
6080 Py_CLEAR(rep);
6081 }
6082
6083 /* Cut back to size actually needed. This is necessary for, for example,
6084 encoding of a string containing isolated surrogates and the 'ignore' handler
6085 is used. */
6086 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
6087 if (nsize != PyBytes_GET_SIZE(v))
6088 _PyBytes_Resize(&v, nsize);
6089 Py_XDECREF(errorHandler);
6090 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00006091 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006092 return v;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006093 error:
6094 Py_XDECREF(rep);
6095 Py_XDECREF(errorHandler);
6096 Py_XDECREF(exc);
6097 Py_XDECREF(v);
6098 return NULL;
6099#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00006100}
6101
Alexander Belopolsky40018472011-02-26 01:02:56 +00006102PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006103PyUnicode_EncodeUTF16(const Py_UNICODE *s,
6104 Py_ssize_t size,
6105 const char *errors,
6106 int byteorder)
6107{
6108 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02006109 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006110 if (tmp == NULL)
6111 return NULL;
6112 result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder);
6113 Py_DECREF(tmp);
6114 return result;
6115}
6116
6117PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00006118PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006119{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006120 return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006121}
6122
6123/* --- Unicode Escape Codec ----------------------------------------------- */
6124
Fredrik Lundh06d12682001-01-24 07:59:11 +00006125static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00006126
Alexander Belopolsky40018472011-02-26 01:02:56 +00006127PyObject *
Eric V. Smith42454af2016-10-31 09:22:08 -04006128_PyUnicode_DecodeUnicodeEscape(const char *s,
6129 Py_ssize_t size,
6130 const char *errors,
6131 const char **first_invalid_escape)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006132{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006133 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006134 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006135 const char *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006136 PyObject *errorHandler = NULL;
6137 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006138
Eric V. Smith42454af2016-10-31 09:22:08 -04006139 // so we can remember if we've seen an invalid escape char or not
6140 *first_invalid_escape = NULL;
6141
Victor Stinner62ec3312016-09-06 17:04:34 -07006142 if (size == 0) {
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02006143 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner62ec3312016-09-06 17:04:34 -07006144 }
6145 /* Escaped strings will always be longer than the resulting
6146 Unicode string, so we start with size here and then reduce the
6147 length after conversion to the true value.
6148 (but if the error callback returns a long replacement string
6149 we'll have to allocate more space) */
Victor Stinner8f674cc2013-04-17 23:02:17 +02006150 _PyUnicodeWriter_Init(&writer);
Victor Stinner62ec3312016-09-06 17:04:34 -07006151 writer.min_length = size;
6152 if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
6153 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006154 }
6155
Guido van Rossumd57fd912000-03-10 22:53:23 +00006156 end = s + size;
6157 while (s < end) {
Victor Stinner62ec3312016-09-06 17:04:34 -07006158 unsigned char c = (unsigned char) *s++;
6159 Py_UCS4 ch;
6160 int count;
6161 Py_ssize_t startinpos;
6162 Py_ssize_t endinpos;
6163 const char *message;
6164
6165#define WRITE_ASCII_CHAR(ch) \
6166 do { \
6167 assert(ch <= 127); \
6168 assert(writer.pos < writer.size); \
6169 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6170 } while(0)
6171
6172#define WRITE_CHAR(ch) \
6173 do { \
6174 if (ch <= writer.maxchar) { \
6175 assert(writer.pos < writer.size); \
6176 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6177 } \
6178 else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
6179 goto onError; \
6180 } \
6181 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006182
6183 /* Non-escape characters are interpreted as Unicode ordinals */
Victor Stinner62ec3312016-09-06 17:04:34 -07006184 if (c != '\\') {
6185 WRITE_CHAR(c);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006186 continue;
6187 }
6188
Victor Stinner62ec3312016-09-06 17:04:34 -07006189 startinpos = s - starts - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006190 /* \ - Escapes */
Victor Stinner62ec3312016-09-06 17:04:34 -07006191 if (s >= end) {
6192 message = "\\ at end of string";
6193 goto error;
6194 }
6195 c = (unsigned char) *s++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006196
Victor Stinner62ec3312016-09-06 17:04:34 -07006197 assert(writer.pos < writer.size);
Guido van Rossum8ce8a782007-11-01 19:42:39 +00006198 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006199
Benjamin Peterson29060642009-01-31 22:14:21 +00006200 /* \x escapes */
Victor Stinner62ec3312016-09-06 17:04:34 -07006201 case '\n': continue;
6202 case '\\': WRITE_ASCII_CHAR('\\'); continue;
6203 case '\'': WRITE_ASCII_CHAR('\''); continue;
6204 case '\"': WRITE_ASCII_CHAR('\"'); continue;
6205 case 'b': WRITE_ASCII_CHAR('\b'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006206 /* FF */
Victor Stinner62ec3312016-09-06 17:04:34 -07006207 case 'f': WRITE_ASCII_CHAR('\014'); continue;
6208 case 't': WRITE_ASCII_CHAR('\t'); continue;
6209 case 'n': WRITE_ASCII_CHAR('\n'); continue;
6210 case 'r': WRITE_ASCII_CHAR('\r'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006211 /* VT */
Victor Stinner62ec3312016-09-06 17:04:34 -07006212 case 'v': WRITE_ASCII_CHAR('\013'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006213 /* BEL, not classic C */
Victor Stinner62ec3312016-09-06 17:04:34 -07006214 case 'a': WRITE_ASCII_CHAR('\007'); continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006215
Benjamin Peterson29060642009-01-31 22:14:21 +00006216 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006217 case '0': case '1': case '2': case '3':
6218 case '4': case '5': case '6': case '7':
Victor Stinner62ec3312016-09-06 17:04:34 -07006219 ch = c - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00006220 if (s < end && '0' <= *s && *s <= '7') {
Victor Stinner62ec3312016-09-06 17:04:34 -07006221 ch = (ch<<3) + *s++ - '0';
6222 if (s < end && '0' <= *s && *s <= '7') {
6223 ch = (ch<<3) + *s++ - '0';
6224 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006225 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006226 WRITE_CHAR(ch);
6227 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006228
Benjamin Peterson29060642009-01-31 22:14:21 +00006229 /* hex escapes */
6230 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006231 case 'x':
Victor Stinner62ec3312016-09-06 17:04:34 -07006232 count = 2;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006233 message = "truncated \\xXX escape";
6234 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006235
Benjamin Peterson29060642009-01-31 22:14:21 +00006236 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006237 case 'u':
Victor Stinner62ec3312016-09-06 17:04:34 -07006238 count = 4;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006239 message = "truncated \\uXXXX escape";
6240 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006241
Benjamin Peterson29060642009-01-31 22:14:21 +00006242 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00006243 case 'U':
Victor Stinner62ec3312016-09-06 17:04:34 -07006244 count = 8;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006245 message = "truncated \\UXXXXXXXX escape";
6246 hexescape:
Victor Stinner62ec3312016-09-06 17:04:34 -07006247 for (ch = 0; count && s < end; ++s, --count) {
Serhiy Storchakad6793772013-01-29 10:20:44 +02006248 c = (unsigned char)*s;
Victor Stinner62ec3312016-09-06 17:04:34 -07006249 ch <<= 4;
6250 if (c >= '0' && c <= '9') {
6251 ch += c - '0';
6252 }
6253 else if (c >= 'a' && c <= 'f') {
6254 ch += c - ('a' - 10);
6255 }
6256 else if (c >= 'A' && c <= 'F') {
6257 ch += c - ('A' - 10);
6258 }
6259 else {
6260 break;
6261 }
Fredrik Lundhdf846752000-09-03 11:29:49 +00006262 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006263 if (count) {
Serhiy Storchakad6793772013-01-29 10:20:44 +02006264 goto error;
Victor Stinner62ec3312016-09-06 17:04:34 -07006265 }
6266
6267 /* when we get here, ch is a 32-bit unicode character */
6268 if (ch > MAX_UNICODE) {
6269 message = "illegal Unicode character";
6270 goto error;
6271 }
6272
6273 WRITE_CHAR(ch);
6274 continue;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006275
Benjamin Peterson29060642009-01-31 22:14:21 +00006276 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00006277 case 'N':
Fredrik Lundhccc74732001-02-18 22:13:49 +00006278 if (ucnhash_CAPI == NULL) {
6279 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006280 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
6281 PyUnicodeData_CAPSULE_NAME, 1);
Victor Stinner62ec3312016-09-06 17:04:34 -07006282 if (ucnhash_CAPI == NULL) {
6283 PyErr_SetString(
6284 PyExc_UnicodeError,
6285 "\\N escapes not supported (can't load unicodedata module)"
6286 );
6287 goto onError;
6288 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00006289 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006290
6291 message = "malformed \\N character escape";
Gregory P. Smith746b2d32018-11-13 13:16:54 -08006292 if (s < end && *s == '{') {
Victor Stinner62ec3312016-09-06 17:04:34 -07006293 const char *start = ++s;
6294 size_t namelen;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006295 /* look for the closing brace */
Victor Stinner62ec3312016-09-06 17:04:34 -07006296 while (s < end && *s != '}')
Fredrik Lundhccc74732001-02-18 22:13:49 +00006297 s++;
Victor Stinner62ec3312016-09-06 17:04:34 -07006298 namelen = s - start;
6299 if (namelen && s < end) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00006300 /* found a name. look it up in the unicode database */
Fredrik Lundhccc74732001-02-18 22:13:49 +00006301 s++;
Victor Stinner62ec3312016-09-06 17:04:34 -07006302 ch = 0xffffffff; /* in case 'getcode' messes up */
6303 if (namelen <= INT_MAX &&
6304 ucnhash_CAPI->getcode(NULL, start, (int)namelen,
6305 &ch, 0)) {
6306 assert(ch <= MAX_UNICODE);
6307 WRITE_CHAR(ch);
6308 continue;
6309 }
6310 message = "unknown Unicode character name";
Fredrik Lundhccc74732001-02-18 22:13:49 +00006311 }
6312 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02006313 goto error;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006314
6315 default:
Eric V. Smith42454af2016-10-31 09:22:08 -04006316 if (*first_invalid_escape == NULL) {
6317 *first_invalid_escape = s-1; /* Back up one char, since we've
6318 already incremented s. */
6319 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006320 WRITE_ASCII_CHAR('\\');
6321 WRITE_CHAR(c);
6322 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006323 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02006324
6325 error:
6326 endinpos = s-starts;
Victor Stinner62ec3312016-09-06 17:04:34 -07006327 writer.min_length = end - s + writer.pos;
Serhiy Storchaka8fe5a9f2013-01-29 10:37:39 +02006328 if (unicode_decode_call_errorhandler_writer(
Serhiy Storchakad6793772013-01-29 10:20:44 +02006329 errors, &errorHandler,
6330 "unicodeescape", message,
6331 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinner62ec3312016-09-06 17:04:34 -07006332 &writer)) {
Serhiy Storchakad6793772013-01-29 10:20:44 +02006333 goto onError;
Victor Stinner62ec3312016-09-06 17:04:34 -07006334 }
Serhiy Storchakab7e2d672018-02-13 08:27:33 +02006335 assert(end - s <= writer.size - writer.pos);
Victor Stinner62ec3312016-09-06 17:04:34 -07006336
6337#undef WRITE_ASCII_CHAR
6338#undef WRITE_CHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00006339 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006340
Walter Dörwaldd4ade082003-08-15 15:00:26 +00006341 Py_XDECREF(errorHandler);
6342 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006343 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald8c077222002-03-25 11:16:18 +00006344
Benjamin Peterson29060642009-01-31 22:14:21 +00006345 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006346 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006347 Py_XDECREF(errorHandler);
6348 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006349 return NULL;
6350}
6351
Eric V. Smith42454af2016-10-31 09:22:08 -04006352PyObject *
6353PyUnicode_DecodeUnicodeEscape(const char *s,
6354 Py_ssize_t size,
6355 const char *errors)
6356{
6357 const char *first_invalid_escape;
6358 PyObject *result = _PyUnicode_DecodeUnicodeEscape(s, size, errors,
6359 &first_invalid_escape);
6360 if (result == NULL)
6361 return NULL;
6362 if (first_invalid_escape != NULL) {
6363 if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
6364 "invalid escape sequence '\\%c'",
Serhiy Storchaka56cb4652017-10-20 17:08:15 +03006365 (unsigned char)*first_invalid_escape) < 0) {
Eric V. Smith42454af2016-10-31 09:22:08 -04006366 Py_DECREF(result);
6367 return NULL;
6368 }
6369 }
6370 return result;
6371}
6372
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006373/* Return a Unicode-Escape string version of the Unicode object. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006374
Alexander Belopolsky40018472011-02-26 01:02:56 +00006375PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006376PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006377{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006378 Py_ssize_t i, len;
Victor Stinner62ec3312016-09-06 17:04:34 -07006379 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006380 char *p;
Victor Stinner62ec3312016-09-06 17:04:34 -07006381 enum PyUnicode_Kind kind;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006382 void *data;
Victor Stinner62ec3312016-09-06 17:04:34 -07006383 Py_ssize_t expandsize;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006384
Ezio Melottie7f90372012-10-05 03:33:31 +03006385 /* Initial allocation is based on the longest-possible character
Thomas Wouters89f507f2006-12-13 04:49:30 +00006386 escape.
6387
Ezio Melottie7f90372012-10-05 03:33:31 +03006388 For UCS1 strings it's '\xxx', 4 bytes per source character.
6389 For UCS2 strings it's '\uxxxx', 6 bytes per source character.
6390 For UCS4 strings it's '\U00xxxxxx', 10 bytes per source character.
Thomas Wouters89f507f2006-12-13 04:49:30 +00006391 */
6392
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006393 if (!PyUnicode_Check(unicode)) {
6394 PyErr_BadArgument();
6395 return NULL;
6396 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006397 if (PyUnicode_READY(unicode) == -1) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006398 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006399 }
Victor Stinner358af132015-10-12 22:36:57 +02006400
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006401 len = PyUnicode_GET_LENGTH(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006402 if (len == 0) {
6403 return PyBytes_FromStringAndSize(NULL, 0);
6404 }
6405
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006406 kind = PyUnicode_KIND(unicode);
6407 data = PyUnicode_DATA(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006408 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6409 bytes, and 1 byte characters 4. */
6410 expandsize = kind * 2 + 2;
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006411 if (len > PY_SSIZE_T_MAX / expandsize) {
Victor Stinner62ec3312016-09-06 17:04:34 -07006412 return PyErr_NoMemory();
6413 }
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006414 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
Victor Stinner62ec3312016-09-06 17:04:34 -07006415 if (repr == NULL) {
6416 return NULL;
6417 }
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006418
Victor Stinner62ec3312016-09-06 17:04:34 -07006419 p = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006420 for (i = 0; i < len; i++) {
Victor Stinner3326cb62011-11-10 20:15:25 +01006421 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006422
Victor Stinner62ec3312016-09-06 17:04:34 -07006423 /* U+0000-U+00ff range */
6424 if (ch < 0x100) {
6425 if (ch >= ' ' && ch < 127) {
6426 if (ch != '\\') {
6427 /* Copy printable US ASCII as-is */
6428 *p++ = (char) ch;
6429 }
6430 /* Escape backslashes */
6431 else {
6432 *p++ = '\\';
6433 *p++ = '\\';
6434 }
6435 }
Victor Stinner358af132015-10-12 22:36:57 +02006436
Victor Stinner62ec3312016-09-06 17:04:34 -07006437 /* Map special whitespace to '\t', \n', '\r' */
6438 else if (ch == '\t') {
6439 *p++ = '\\';
6440 *p++ = 't';
6441 }
6442 else if (ch == '\n') {
6443 *p++ = '\\';
6444 *p++ = 'n';
6445 }
6446 else if (ch == '\r') {
6447 *p++ = '\\';
6448 *p++ = 'r';
6449 }
6450
6451 /* Map non-printable US ASCII and 8-bit characters to '\xHH' */
6452 else {
6453 *p++ = '\\';
6454 *p++ = 'x';
6455 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6456 *p++ = Py_hexdigits[ch & 0x000F];
6457 }
Tim Petersced69f82003-09-16 20:30:58 +00006458 }
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006459 /* U+0100-U+ffff range: Map 16-bit characters to '\uHHHH' */
Victor Stinner62ec3312016-09-06 17:04:34 -07006460 else if (ch < 0x10000) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006461 *p++ = '\\';
6462 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006463 *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
6464 *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
6465 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6466 *p++ = Py_hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006467 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006468 /* U+010000-U+10ffff range: Map 21-bit characters to '\U00HHHHHH' */
6469 else {
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006470
Victor Stinner62ec3312016-09-06 17:04:34 -07006471 /* Make sure that the first two digits are zero */
6472 assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006473 *p++ = '\\';
Victor Stinner62ec3312016-09-06 17:04:34 -07006474 *p++ = 'U';
6475 *p++ = '0';
6476 *p++ = '0';
6477 *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
6478 *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
6479 *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
6480 *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
6481 *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
6482 *p++ = Py_hexdigits[ch & 0x0000000F];
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006483 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006484 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006485
Victor Stinner62ec3312016-09-06 17:04:34 -07006486 assert(p - PyBytes_AS_STRING(repr) > 0);
6487 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) {
6488 return NULL;
6489 }
6490 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006491}
6492
Alexander Belopolsky40018472011-02-26 01:02:56 +00006493PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006494PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
6495 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006496{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006497 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02006498 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Victor Stinner62ec3312016-09-06 17:04:34 -07006499 if (tmp == NULL) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006500 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006501 }
6502
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006503 result = PyUnicode_AsUnicodeEscapeString(tmp);
6504 Py_DECREF(tmp);
6505 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006506}
6507
6508/* --- Raw Unicode Escape Codec ------------------------------------------- */
6509
Alexander Belopolsky40018472011-02-26 01:02:56 +00006510PyObject *
6511PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006512 Py_ssize_t size,
6513 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006514{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006515 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006516 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006517 const char *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006518 PyObject *errorHandler = NULL;
6519 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006520
Victor Stinner62ec3312016-09-06 17:04:34 -07006521 if (size == 0) {
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02006522 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner62ec3312016-09-06 17:04:34 -07006523 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006524
Guido van Rossumd57fd912000-03-10 22:53:23 +00006525 /* Escaped strings will always be longer than the resulting
6526 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006527 length after conversion to the true value. (But decoding error
6528 handler might have to resize the string) */
Victor Stinner8f674cc2013-04-17 23:02:17 +02006529 _PyUnicodeWriter_Init(&writer);
Inada Naoki770847a2019-06-24 12:30:24 +09006530 writer.min_length = size;
Victor Stinner62ec3312016-09-06 17:04:34 -07006531 if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
6532 goto onError;
6533 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006534
Guido van Rossumd57fd912000-03-10 22:53:23 +00006535 end = s + size;
6536 while (s < end) {
Victor Stinner62ec3312016-09-06 17:04:34 -07006537 unsigned char c = (unsigned char) *s++;
6538 Py_UCS4 ch;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006539 int count;
Victor Stinner62ec3312016-09-06 17:04:34 -07006540 Py_ssize_t startinpos;
6541 Py_ssize_t endinpos;
6542 const char *message;
6543
6544#define WRITE_CHAR(ch) \
6545 do { \
6546 if (ch <= writer.maxchar) { \
6547 assert(writer.pos < writer.size); \
6548 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6549 } \
6550 else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
6551 goto onError; \
6552 } \
6553 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006554
Benjamin Peterson29060642009-01-31 22:14:21 +00006555 /* Non-escape characters are interpreted as Unicode ordinals */
Victor Stinner62ec3312016-09-06 17:04:34 -07006556 if (c != '\\' || s >= end) {
6557 WRITE_CHAR(c);
Benjamin Peterson29060642009-01-31 22:14:21 +00006558 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006559 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006560
Victor Stinner62ec3312016-09-06 17:04:34 -07006561 c = (unsigned char) *s++;
6562 if (c == 'u') {
6563 count = 4;
6564 message = "truncated \\uXXXX escape";
Benjamin Peterson29060642009-01-31 22:14:21 +00006565 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006566 else if (c == 'U') {
6567 count = 8;
6568 message = "truncated \\UXXXXXXXX escape";
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006569 }
6570 else {
Victor Stinner62ec3312016-09-06 17:04:34 -07006571 assert(writer.pos < writer.size);
6572 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, '\\');
6573 WRITE_CHAR(c);
6574 continue;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006575 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006576 startinpos = s - starts - 2;
6577
6578 /* \uHHHH with 4 hex digits, \U00HHHHHH with 8 */
6579 for (ch = 0; count && s < end; ++s, --count) {
6580 c = (unsigned char)*s;
6581 ch <<= 4;
6582 if (c >= '0' && c <= '9') {
6583 ch += c - '0';
6584 }
6585 else if (c >= 'a' && c <= 'f') {
6586 ch += c - ('a' - 10);
6587 }
6588 else if (c >= 'A' && c <= 'F') {
6589 ch += c - ('A' - 10);
6590 }
6591 else {
6592 break;
6593 }
6594 }
6595 if (!count) {
6596 if (ch <= MAX_UNICODE) {
6597 WRITE_CHAR(ch);
6598 continue;
6599 }
6600 message = "\\Uxxxxxxxx out of range";
6601 }
6602
6603 endinpos = s-starts;
6604 writer.min_length = end - s + writer.pos;
6605 if (unicode_decode_call_errorhandler_writer(
6606 errors, &errorHandler,
6607 "rawunicodeescape", message,
6608 &starts, &end, &startinpos, &endinpos, &exc, &s,
6609 &writer)) {
6610 goto onError;
6611 }
Serhiy Storchakab7e2d672018-02-13 08:27:33 +02006612 assert(end - s <= writer.size - writer.pos);
Victor Stinner62ec3312016-09-06 17:04:34 -07006613
6614#undef WRITE_CHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00006615 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006616 Py_XDECREF(errorHandler);
6617 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006618 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00006619
Benjamin Peterson29060642009-01-31 22:14:21 +00006620 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006621 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006622 Py_XDECREF(errorHandler);
6623 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006624 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006625
Guido van Rossumd57fd912000-03-10 22:53:23 +00006626}
6627
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006628
Alexander Belopolsky40018472011-02-26 01:02:56 +00006629PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006630PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006631{
Victor Stinner62ec3312016-09-06 17:04:34 -07006632 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006633 char *p;
Victor Stinner62ec3312016-09-06 17:04:34 -07006634 Py_ssize_t expandsize, pos;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006635 int kind;
6636 void *data;
6637 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006638
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006639 if (!PyUnicode_Check(unicode)) {
6640 PyErr_BadArgument();
6641 return NULL;
6642 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006643 if (PyUnicode_READY(unicode) == -1) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006644 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006645 }
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006646 kind = PyUnicode_KIND(unicode);
6647 data = PyUnicode_DATA(unicode);
6648 len = PyUnicode_GET_LENGTH(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006649 if (kind == PyUnicode_1BYTE_KIND) {
6650 return PyBytes_FromStringAndSize(data, len);
6651 }
Victor Stinner0e368262011-11-10 20:12:49 +01006652
Victor Stinner62ec3312016-09-06 17:04:34 -07006653 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6654 bytes, and 1 byte characters 4. */
6655 expandsize = kind * 2 + 2;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006656
Victor Stinner62ec3312016-09-06 17:04:34 -07006657 if (len > PY_SSIZE_T_MAX / expandsize) {
6658 return PyErr_NoMemory();
6659 }
6660 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
6661 if (repr == NULL) {
6662 return NULL;
6663 }
6664 if (len == 0) {
6665 return repr;
6666 }
6667
6668 p = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006669 for (pos = 0; pos < len; pos++) {
6670 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Victor Stinner358af132015-10-12 22:36:57 +02006671
Victor Stinner62ec3312016-09-06 17:04:34 -07006672 /* U+0000-U+00ff range: Copy 8-bit characters as-is */
6673 if (ch < 0x100) {
6674 *p++ = (char) ch;
Tim Petersced69f82003-09-16 20:30:58 +00006675 }
Xiang Zhang2b77a922018-02-13 18:33:32 +08006676 /* U+0100-U+ffff range: Map 16-bit characters to '\uHHHH' */
Victor Stinner62ec3312016-09-06 17:04:34 -07006677 else if (ch < 0x10000) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006678 *p++ = '\\';
6679 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006680 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6681 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6682 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6683 *p++ = Py_hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006684 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006685 /* U+010000-U+10ffff range: Map 32-bit characters to '\U00HHHHHH' */
6686 else {
6687 assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
6688 *p++ = '\\';
6689 *p++ = 'U';
6690 *p++ = '0';
6691 *p++ = '0';
6692 *p++ = Py_hexdigits[(ch >> 20) & 0xf];
6693 *p++ = Py_hexdigits[(ch >> 16) & 0xf];
6694 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6695 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6696 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6697 *p++ = Py_hexdigits[ch & 15];
6698 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006699 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006700
Victor Stinner62ec3312016-09-06 17:04:34 -07006701 assert(p > PyBytes_AS_STRING(repr));
6702 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) {
6703 return NULL;
6704 }
6705 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006706}
6707
Alexander Belopolsky40018472011-02-26 01:02:56 +00006708PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006709PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
6710 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006711{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006712 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02006713 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006714 if (tmp == NULL)
Walter Dörwald711005d2007-05-12 12:03:26 +00006715 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006716 result = PyUnicode_AsRawUnicodeEscapeString(tmp);
6717 Py_DECREF(tmp);
6718 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006719}
6720
6721/* --- Latin-1 Codec ------------------------------------------------------ */
6722
Alexander Belopolsky40018472011-02-26 01:02:56 +00006723PyObject *
6724PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006725 Py_ssize_t size,
6726 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006727{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006728 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Victor Stinnere57b1c02011-09-28 22:20:48 +02006729 return _PyUnicode_FromUCS1((unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006730}
6731
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006732/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006733static void
6734make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006735 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006736 PyObject *unicode,
6737 Py_ssize_t startpos, Py_ssize_t endpos,
6738 const char *reason)
6739{
6740 if (*exceptionObject == NULL) {
6741 *exceptionObject = PyObject_CallFunction(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006742 PyExc_UnicodeEncodeError, "sOnns",
Martin v. Löwis9e816682011-11-02 12:45:42 +01006743 encoding, unicode, startpos, endpos, reason);
6744 }
6745 else {
6746 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6747 goto onError;
6748 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6749 goto onError;
6750 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6751 goto onError;
6752 return;
6753 onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02006754 Py_CLEAR(*exceptionObject);
Martin v. Löwis9e816682011-11-02 12:45:42 +01006755 }
6756}
6757
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006758/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006759static void
6760raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006761 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006762 PyObject *unicode,
6763 Py_ssize_t startpos, Py_ssize_t endpos,
6764 const char *reason)
6765{
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006766 make_encode_exception(exceptionObject,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006767 encoding, unicode, startpos, endpos, reason);
6768 if (*exceptionObject != NULL)
6769 PyCodec_StrictErrors(*exceptionObject);
6770}
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006771
6772/* error handling callback helper:
6773 build arguments, call the callback and check the arguments,
6774 put the result into newpos and return the replacement string, which
6775 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006776static PyObject *
6777unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006778 PyObject **errorHandler,
6779 const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006780 PyObject *unicode, PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006781 Py_ssize_t startpos, Py_ssize_t endpos,
6782 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006783{
Serhiy Storchaka2d06e842015-12-25 19:53:18 +02006784 static const char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006785 Py_ssize_t len;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006786 PyObject *restuple;
6787 PyObject *resunicode;
6788
6789 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006790 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006791 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006792 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006793 }
6794
Benjamin Petersonbac79492012-01-14 13:34:47 -05006795 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006796 return NULL;
6797 len = PyUnicode_GET_LENGTH(unicode);
6798
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006799 make_encode_exception(exceptionObject,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006800 encoding, unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006801 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006802 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006803
Jeroen Demeyer196a5302019-07-04 12:31:34 +02006804 restuple = _PyObject_CallOneArg(*errorHandler, *exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006805 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006806 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006807 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006808 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00006809 Py_DECREF(restuple);
6810 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006811 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006812 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00006813 &resunicode, newpos)) {
6814 Py_DECREF(restuple);
6815 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006816 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006817 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6818 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6819 Py_DECREF(restuple);
6820 return NULL;
6821 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006822 if (*newpos<0)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006823 *newpos = len + *newpos;
6824 if (*newpos<0 || *newpos>len) {
Victor Stinnera33bce02014-07-04 22:47:46 +02006825 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00006826 Py_DECREF(restuple);
6827 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006828 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006829 Py_INCREF(resunicode);
6830 Py_DECREF(restuple);
6831 return resunicode;
6832}
6833
Alexander Belopolsky40018472011-02-26 01:02:56 +00006834static PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006835unicode_encode_ucs1(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006836 const char *errors,
Victor Stinner0030cd52015-09-24 14:45:00 +02006837 const Py_UCS4 limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006838{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006839 /* input state */
6840 Py_ssize_t pos=0, size;
6841 int kind;
6842 void *data;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006843 /* pointer into the output */
6844 char *str;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006845 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6846 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Victor Stinner50149202015-09-22 00:26:54 +02006847 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006848 PyObject *exc = NULL;
Victor Stinner50149202015-09-22 00:26:54 +02006849 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Victor Stinner6bd525b2015-10-09 13:10:05 +02006850 PyObject *rep = NULL;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006851 /* output object */
6852 _PyBytesWriter writer;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006853
Benjamin Petersonbac79492012-01-14 13:34:47 -05006854 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006855 return NULL;
6856 size = PyUnicode_GET_LENGTH(unicode);
6857 kind = PyUnicode_KIND(unicode);
6858 data = PyUnicode_DATA(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006859 /* allocate enough for a simple encoding without
6860 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00006861 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00006862 return PyBytes_FromStringAndSize(NULL, 0);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006863
6864 _PyBytesWriter_Init(&writer);
6865 str = _PyBytesWriter_Alloc(&writer, size);
6866 if (str == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00006867 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006868
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006869 while (pos < size) {
Victor Stinner0030cd52015-09-24 14:45:00 +02006870 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006871
Benjamin Peterson29060642009-01-31 22:14:21 +00006872 /* can we encode this? */
Victor Stinner0030cd52015-09-24 14:45:00 +02006873 if (ch < limit) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006874 /* no overflow check, because we know that the space is enough */
Victor Stinner0030cd52015-09-24 14:45:00 +02006875 *str++ = (char)ch;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006876 ++pos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006877 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006878 else {
Victor Stinner6bd525b2015-10-09 13:10:05 +02006879 Py_ssize_t newpos, i;
Benjamin Peterson29060642009-01-31 22:14:21 +00006880 /* startpos for collecting unencodable chars */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006881 Py_ssize_t collstart = pos;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006882 Py_ssize_t collend = collstart + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006883 /* find all unecodable characters */
Victor Stinner50149202015-09-22 00:26:54 +02006884
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006885 while ((collend < size) && (PyUnicode_READ(kind, data, collend) >= limit))
Benjamin Peterson29060642009-01-31 22:14:21 +00006886 ++collend;
Victor Stinner50149202015-09-22 00:26:54 +02006887
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006888 /* Only overallocate the buffer if it's not the last write */
6889 writer.overallocate = (collend < size);
6890
Benjamin Peterson29060642009-01-31 22:14:21 +00006891 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
Victor Stinner50149202015-09-22 00:26:54 +02006892 if (error_handler == _Py_ERROR_UNKNOWN)
Victor Stinner3d4226a2018-08-29 22:21:32 +02006893 error_handler = _Py_GetErrorHandler(errors);
Victor Stinner50149202015-09-22 00:26:54 +02006894
6895 switch (error_handler) {
6896 case _Py_ERROR_STRICT:
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006897 raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006898 goto onError;
Victor Stinner50149202015-09-22 00:26:54 +02006899
6900 case _Py_ERROR_REPLACE:
Victor Stinner01ada392015-10-01 21:54:51 +02006901 memset(str, '?', collend - collstart);
6902 str += (collend - collstart);
Stefan Krahf432a322017-08-21 13:09:59 +02006903 /* fall through */
Victor Stinner50149202015-09-22 00:26:54 +02006904 case _Py_ERROR_IGNORE:
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006905 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006906 break;
Victor Stinner50149202015-09-22 00:26:54 +02006907
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006908 case _Py_ERROR_BACKSLASHREPLACE:
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07006909 /* subtract preallocated bytes */
Victor Stinnerad771582015-10-09 12:38:53 +02006910 writer.min_size -= (collend - collstart);
6911 str = backslashreplace(&writer, str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006912 unicode, collstart, collend);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006913 if (str == NULL)
6914 goto onError;
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006915 pos = collend;
6916 break;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006917
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006918 case _Py_ERROR_XMLCHARREFREPLACE:
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07006919 /* subtract preallocated bytes */
Victor Stinnerad771582015-10-09 12:38:53 +02006920 writer.min_size -= (collend - collstart);
6921 str = xmlcharrefreplace(&writer, str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006922 unicode, collstart, collend);
6923 if (str == NULL)
6924 goto onError;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006925 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006926 break;
Victor Stinner50149202015-09-22 00:26:54 +02006927
Victor Stinnerc3713e92015-09-29 12:32:13 +02006928 case _Py_ERROR_SURROGATEESCAPE:
6929 for (i = collstart; i < collend; ++i) {
6930 ch = PyUnicode_READ(kind, data, i);
6931 if (ch < 0xdc80 || 0xdcff < ch) {
6932 /* Not a UTF-8b surrogate */
6933 break;
6934 }
6935 *str++ = (char)(ch - 0xdc00);
6936 ++pos;
6937 }
6938 if (i >= collend)
6939 break;
6940 collstart = pos;
6941 assert(collstart != collend);
Stefan Krahf432a322017-08-21 13:09:59 +02006942 /* fall through */
Victor Stinnerc3713e92015-09-29 12:32:13 +02006943
Benjamin Peterson29060642009-01-31 22:14:21 +00006944 default:
Victor Stinner6bd525b2015-10-09 13:10:05 +02006945 rep = unicode_encode_call_errorhandler(errors, &error_handler_obj,
6946 encoding, reason, unicode, &exc,
6947 collstart, collend, &newpos);
6948 if (rep == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006949 goto onError;
Victor Stinner0030cd52015-09-24 14:45:00 +02006950
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07006951 /* subtract preallocated bytes */
Xiang Zhangd04d8472016-11-23 19:34:01 +08006952 writer.min_size -= newpos - collstart;
Victor Stinnerad771582015-10-09 12:38:53 +02006953
Victor Stinner6bd525b2015-10-09 13:10:05 +02006954 if (PyBytes_Check(rep)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00006955 /* Directly copy bytes result to output. */
Victor Stinnerce179bf2015-10-09 12:57:22 +02006956 str = _PyBytesWriter_WriteBytes(&writer, str,
Victor Stinner6bd525b2015-10-09 13:10:05 +02006957 PyBytes_AS_STRING(rep),
6958 PyBytes_GET_SIZE(rep));
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006959 }
Victor Stinner6bd525b2015-10-09 13:10:05 +02006960 else {
6961 assert(PyUnicode_Check(rep));
Victor Stinner0030cd52015-09-24 14:45:00 +02006962
Victor Stinner6bd525b2015-10-09 13:10:05 +02006963 if (PyUnicode_READY(rep) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006964 goto onError;
Victor Stinner6bd525b2015-10-09 13:10:05 +02006965
Serhiy Storchaka99250d52016-11-23 15:13:00 +02006966 if (limit == 256 ?
6967 PyUnicode_KIND(rep) != PyUnicode_1BYTE_KIND :
6968 !PyUnicode_IS_ASCII(rep))
6969 {
6970 /* Not all characters are smaller than limit */
6971 raise_encode_exception(&exc, encoding, unicode,
6972 collstart, collend, reason);
6973 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00006974 }
Serhiy Storchaka99250d52016-11-23 15:13:00 +02006975 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
6976 str = _PyBytesWriter_WriteBytes(&writer, str,
6977 PyUnicode_DATA(rep),
6978 PyUnicode_GET_LENGTH(rep));
Benjamin Peterson29060642009-01-31 22:14:21 +00006979 }
Alexey Izbyshev74a307d2018-08-19 21:52:04 +03006980 if (str == NULL)
6981 goto onError;
6982
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006983 pos = newpos;
Victor Stinner6bd525b2015-10-09 13:10:05 +02006984 Py_CLEAR(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006985 }
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006986
6987 /* If overallocation was disabled, ensure that it was the last
6988 write. Otherwise, we missed an optimization */
6989 assert(writer.overallocate || pos == size);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006990 }
6991 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006992
Victor Stinner50149202015-09-22 00:26:54 +02006993 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006994 Py_XDECREF(exc);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006995 return _PyBytesWriter_Finish(&writer, str);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006996
6997 onError:
Victor Stinner6bd525b2015-10-09 13:10:05 +02006998 Py_XDECREF(rep);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006999 _PyBytesWriter_Dealloc(&writer);
Victor Stinner50149202015-09-22 00:26:54 +02007000 Py_XDECREF(error_handler_obj);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00007001 Py_XDECREF(exc);
7002 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007003}
7004
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007005/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007006PyObject *
7007PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03007008 Py_ssize_t size,
7009 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007010{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007011 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02007012 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007013 if (unicode == NULL)
7014 return NULL;
7015 result = unicode_encode_ucs1(unicode, errors, 256);
7016 Py_DECREF(unicode);
7017 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007018}
7019
Alexander Belopolsky40018472011-02-26 01:02:56 +00007020PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007021_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007022{
7023 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007024 PyErr_BadArgument();
7025 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007026 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007027 if (PyUnicode_READY(unicode) == -1)
7028 return NULL;
7029 /* Fast path: if it is a one-byte string, construct
7030 bytes object directly. */
7031 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
7032 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
7033 PyUnicode_GET_LENGTH(unicode));
7034 /* Non-Latin-1 characters present. Defer to above function to
7035 raise the exception. */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007036 return unicode_encode_ucs1(unicode, errors, 256);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007037}
7038
7039PyObject*
7040PyUnicode_AsLatin1String(PyObject *unicode)
7041{
7042 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007043}
7044
7045/* --- 7-bit ASCII Codec -------------------------------------------------- */
7046
Alexander Belopolsky40018472011-02-26 01:02:56 +00007047PyObject *
7048PyUnicode_DecodeASCII(const char *s,
7049 Py_ssize_t size,
7050 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007051{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007052 const char *starts = s;
Inada Naoki770847a2019-06-24 12:30:24 +09007053 const char *e = s + size;
Victor Stinnerf96418d2015-09-21 23:06:27 +02007054 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007055 PyObject *exc = NULL;
Victor Stinnerf96418d2015-09-21 23:06:27 +02007056 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Tim Petersced69f82003-09-16 20:30:58 +00007057
Guido van Rossumd57fd912000-03-10 22:53:23 +00007058 if (size == 0)
Serhiy Storchaka678db842013-01-26 12:16:36 +02007059 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01007060
Guido van Rossumd57fd912000-03-10 22:53:23 +00007061 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Victor Stinner702c7342011-10-05 13:50:52 +02007062 if (size == 1 && (unsigned char)s[0] < 128)
7063 return get_latin1_char((unsigned char)s[0]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007064
Inada Naoki770847a2019-06-24 12:30:24 +09007065 // Shortcut for simple case
7066 PyObject *u = PyUnicode_New(size, 127);
7067 if (u == NULL) {
Victor Stinner8f674cc2013-04-17 23:02:17 +02007068 return NULL;
Inada Naoki770847a2019-06-24 12:30:24 +09007069 }
7070 Py_ssize_t outpos = ascii_decode(s, e, PyUnicode_DATA(u));
7071 if (outpos == size) {
7072 return u;
7073 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02007074
Inada Naoki770847a2019-06-24 12:30:24 +09007075 _PyUnicodeWriter writer;
7076 _PyUnicodeWriter_InitWithBuffer(&writer, u);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007077 writer.pos = outpos;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02007078
Inada Naoki770847a2019-06-24 12:30:24 +09007079 s += outpos;
7080 int kind = writer.kind;
7081 void *data = writer.data;
7082 Py_ssize_t startinpos, endinpos;
7083
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007084 while (s < e) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02007085 unsigned char c = (unsigned char)*s;
Benjamin Peterson29060642009-01-31 22:14:21 +00007086 if (c < 128) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007087 PyUnicode_WRITE(kind, data, writer.pos, c);
7088 writer.pos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00007089 ++s;
Victor Stinnerf96418d2015-09-21 23:06:27 +02007090 continue;
Benjamin Peterson29060642009-01-31 22:14:21 +00007091 }
Victor Stinnerf96418d2015-09-21 23:06:27 +02007092
7093 /* byte outsize range 0x00..0x7f: call the error handler */
7094
7095 if (error_handler == _Py_ERROR_UNKNOWN)
Victor Stinner3d4226a2018-08-29 22:21:32 +02007096 error_handler = _Py_GetErrorHandler(errors);
Victor Stinnerf96418d2015-09-21 23:06:27 +02007097
7098 switch (error_handler)
7099 {
7100 case _Py_ERROR_REPLACE:
7101 case _Py_ERROR_SURROGATEESCAPE:
7102 /* Fast-path: the error handler only writes one character,
Victor Stinnerca9381e2015-09-22 00:58:32 +02007103 but we may switch to UCS2 at the first write */
7104 if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
7105 goto onError;
7106 kind = writer.kind;
7107 data = writer.data;
Victor Stinnerf96418d2015-09-21 23:06:27 +02007108
7109 if (error_handler == _Py_ERROR_REPLACE)
7110 PyUnicode_WRITE(kind, data, writer.pos, 0xfffd);
7111 else
7112 PyUnicode_WRITE(kind, data, writer.pos, c + 0xdc00);
7113 writer.pos++;
7114 ++s;
7115 break;
7116
7117 case _Py_ERROR_IGNORE:
7118 ++s;
7119 break;
7120
7121 default:
Benjamin Peterson29060642009-01-31 22:14:21 +00007122 startinpos = s-starts;
7123 endinpos = startinpos + 1;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007124 if (unicode_decode_call_errorhandler_writer(
Victor Stinnerf96418d2015-09-21 23:06:27 +02007125 errors, &error_handler_obj,
Benjamin Peterson29060642009-01-31 22:14:21 +00007126 "ascii", "ordinal not in range(128)",
7127 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007128 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00007129 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007130 kind = writer.kind;
7131 data = writer.data;
Benjamin Peterson29060642009-01-31 22:14:21 +00007132 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007133 }
Victor Stinnerf96418d2015-09-21 23:06:27 +02007134 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007135 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007136 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00007137
Benjamin Peterson29060642009-01-31 22:14:21 +00007138 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007139 _PyUnicodeWriter_Dealloc(&writer);
Victor Stinnerf96418d2015-09-21 23:06:27 +02007140 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007141 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007142 return NULL;
7143}
7144
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007145/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007146PyObject *
7147PyUnicode_EncodeASCII(const Py_UNICODE *p,
7148 Py_ssize_t size,
7149 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007150{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007151 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02007152 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007153 if (unicode == NULL)
7154 return NULL;
7155 result = unicode_encode_ucs1(unicode, errors, 128);
7156 Py_DECREF(unicode);
7157 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007158}
7159
Alexander Belopolsky40018472011-02-26 01:02:56 +00007160PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007161_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007162{
7163 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007164 PyErr_BadArgument();
7165 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007166 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007167 if (PyUnicode_READY(unicode) == -1)
7168 return NULL;
7169 /* Fast path: if it is an ASCII-only string, construct bytes object
7170 directly. Else defer to above function to raise the exception. */
Victor Stinneraf037572013-04-14 18:44:10 +02007171 if (PyUnicode_IS_ASCII(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007172 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
7173 PyUnicode_GET_LENGTH(unicode));
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007174 return unicode_encode_ucs1(unicode, errors, 128);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007175}
7176
7177PyObject *
7178PyUnicode_AsASCIIString(PyObject *unicode)
7179{
7180 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007181}
7182
Steve Dowercc16be82016-09-08 10:35:16 -07007183#ifdef MS_WINDOWS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007184
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007185/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007186
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00007187#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007188#define NEED_RETRY
7189#endif
7190
/* INT_MAX is the theoretical largest chunk (or INT_MAX / 2 when
   transcoding from UTF-16), but INT_MAX / 4 performs better in
   both cases and also avoids partial characters overrunning the
   length limit in MultiByteToWideChar on Windows */
7195#define DECODING_CHUNK_SIZE (INT_MAX/4)
7196
Victor Stinner3a50e702011-10-18 21:21:00 +02007197#ifndef WC_ERR_INVALID_CHARS
7198# define WC_ERR_INVALID_CHARS 0x0080
7199#endif
7200
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007201static const char*
Victor Stinner3a50e702011-10-18 21:21:00 +02007202code_page_name(UINT code_page, PyObject **obj)
7203{
7204 *obj = NULL;
7205 if (code_page == CP_ACP)
7206 return "mbcs";
7207 if (code_page == CP_UTF7)
7208 return "CP_UTF7";
7209 if (code_page == CP_UTF8)
7210 return "CP_UTF8";
7211
7212 *obj = PyBytes_FromFormat("cp%u", code_page);
7213 if (*obj == NULL)
7214 return NULL;
7215 return PyBytes_AS_STRING(*obj);
7216}
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007217
Victor Stinner3a50e702011-10-18 21:21:00 +02007218static DWORD
7219decode_code_page_flags(UINT code_page)
7220{
7221 if (code_page == CP_UTF7) {
7222 /* The CP_UTF7 decoder only supports flags=0 */
7223 return 0;
7224 }
7225 else
7226 return MB_ERR_INVALID_CHARS;
7227}
7228
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007229/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007230 * Decode a byte string from a Windows code page into unicode object in strict
7231 * mode.
7232 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007233 * Returns consumed size if succeed, returns -2 on decode error, or raise an
7234 * OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007235 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007236static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007237decode_code_page_strict(UINT code_page,
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007238 wchar_t **buf,
7239 Py_ssize_t *bufsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02007240 const char *in,
7241 int insize)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007242{
Serhiy Storchakac1e2c282019-03-20 21:45:18 +02007243 DWORD flags = MB_ERR_INVALID_CHARS;
Victor Stinner24729f32011-11-10 20:31:37 +01007244 wchar_t *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007245 DWORD outsize;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007246
7247 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007248 assert(insize > 0);
Serhiy Storchakac1e2c282019-03-20 21:45:18 +02007249 while ((outsize = MultiByteToWideChar(code_page, flags,
7250 in, insize, NULL, 0)) <= 0)
7251 {
7252 if (!flags || GetLastError() != ERROR_INVALID_FLAGS) {
7253 goto error;
7254 }
7255 /* For some code pages (e.g. UTF-7) flags must be set to 0. */
7256 flags = 0;
7257 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007258
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007259 /* Extend a wchar_t* buffer */
7260 Py_ssize_t n = *bufsize; /* Get the current length */
7261 if (widechar_resize(buf, bufsize, n + outsize) < 0) {
7262 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007263 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007264 out = *buf + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007265
7266 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007267 outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
7268 if (outsize <= 0)
7269 goto error;
7270 return insize;
Victor Stinner554f3f02010-06-16 23:33:54 +00007271
Victor Stinner3a50e702011-10-18 21:21:00 +02007272error:
7273 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7274 return -2;
7275 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007276 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007277}
7278
Victor Stinner3a50e702011-10-18 21:21:00 +02007279/*
7280 * Decode a byte string from a code page into unicode object with an error
7281 * handler.
7282 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007283 * Returns consumed size if succeed, or raise an OSError or
Victor Stinner3a50e702011-10-18 21:21:00 +02007284 * UnicodeDecodeError exception and returns -1 on error.
7285 */
7286static int
7287decode_code_page_errors(UINT code_page,
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007288 wchar_t **buf,
7289 Py_ssize_t *bufsize,
Victor Stinner76a31a62011-11-04 00:05:13 +01007290 const char *in, const int size,
Victor Stinner7d00cc12014-03-17 23:08:06 +01007291 const char *errors, int final)
Victor Stinner3a50e702011-10-18 21:21:00 +02007292{
7293 const char *startin = in;
7294 const char *endin = in + size;
Serhiy Storchakac1e2c282019-03-20 21:45:18 +02007295 DWORD flags = MB_ERR_INVALID_CHARS;
Victor Stinner3a50e702011-10-18 21:21:00 +02007296 /* Ideally, we should get reason from FormatMessage. This is the Windows
7297 2000 English version of the message. */
7298 const char *reason = "No mapping for the Unicode character exists "
7299 "in the target code page.";
7300 /* each step cannot decode more than 1 character, but a character can be
7301 represented as a surrogate pair */
Serhiy Storchaka4013c172018-12-03 10:36:45 +02007302 wchar_t buffer[2], *out;
Victor Stinner9f067f42013-06-05 00:21:31 +02007303 int insize;
7304 Py_ssize_t outsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007305 PyObject *errorHandler = NULL;
7306 PyObject *exc = NULL;
7307 PyObject *encoding_obj = NULL;
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007308 const char *encoding;
Victor Stinner3a50e702011-10-18 21:21:00 +02007309 DWORD err;
7310 int ret = -1;
7311
7312 assert(size > 0);
7313
7314 encoding = code_page_name(code_page, &encoding_obj);
7315 if (encoding == NULL)
7316 return -1;
7317
Victor Stinner7d00cc12014-03-17 23:08:06 +01007318 if ((errors == NULL || strcmp(errors, "strict") == 0) && final) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007319 /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
7320 UnicodeDecodeError. */
7321 make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
7322 if (exc != NULL) {
7323 PyCodec_StrictErrors(exc);
7324 Py_CLEAR(exc);
7325 }
7326 goto error;
7327 }
7328
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007329 /* Extend a wchar_t* buffer */
7330 Py_ssize_t n = *bufsize; /* Get the current length */
7331 if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7332 PyErr_NoMemory();
7333 goto error;
Victor Stinner3a50e702011-10-18 21:21:00 +02007334 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007335 if (widechar_resize(buf, bufsize, n + size * Py_ARRAY_LENGTH(buffer)) < 0) {
7336 goto error;
Victor Stinner3a50e702011-10-18 21:21:00 +02007337 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007338 out = *buf + n;
Victor Stinner3a50e702011-10-18 21:21:00 +02007339
7340 /* Decode the byte string character per character */
Victor Stinner3a50e702011-10-18 21:21:00 +02007341 while (in < endin)
7342 {
7343 /* Decode a character */
7344 insize = 1;
7345 do
7346 {
7347 outsize = MultiByteToWideChar(code_page, flags,
7348 in, insize,
7349 buffer, Py_ARRAY_LENGTH(buffer));
7350 if (outsize > 0)
7351 break;
7352 err = GetLastError();
Serhiy Storchakac1e2c282019-03-20 21:45:18 +02007353 if (err == ERROR_INVALID_FLAGS && flags) {
7354 /* For some code pages (e.g. UTF-7) flags must be set to 0. */
7355 flags = 0;
7356 continue;
7357 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007358 if (err != ERROR_NO_UNICODE_TRANSLATION
7359 && err != ERROR_INSUFFICIENT_BUFFER)
7360 {
7361 PyErr_SetFromWindowsErr(0);
7362 goto error;
7363 }
7364 insize++;
7365 }
7366 /* 4=maximum length of a UTF-8 sequence */
7367 while (insize <= 4 && (in + insize) <= endin);
7368
7369 if (outsize <= 0) {
7370 Py_ssize_t startinpos, endinpos, outpos;
7371
Victor Stinner7d00cc12014-03-17 23:08:06 +01007372 /* last character in partial decode? */
7373 if (in + insize >= endin && !final)
7374 break;
7375
Victor Stinner3a50e702011-10-18 21:21:00 +02007376 startinpos = in - startin;
7377 endinpos = startinpos + 1;
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007378 outpos = out - *buf;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007379 if (unicode_decode_call_errorhandler_wchar(
Victor Stinner3a50e702011-10-18 21:21:00 +02007380 errors, &errorHandler,
7381 encoding, reason,
7382 &startin, &endin, &startinpos, &endinpos, &exc, &in,
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007383 buf, bufsize, &outpos))
Victor Stinner3a50e702011-10-18 21:21:00 +02007384 {
7385 goto error;
7386 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007387 out = *buf + outpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007388 }
7389 else {
7390 in += insize;
7391 memcpy(out, buffer, outsize * sizeof(wchar_t));
7392 out += outsize;
7393 }
7394 }
7395
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007396 /* Shrink the buffer */
7397 assert(out - *buf <= *bufsize);
7398 *bufsize = out - *buf;
Victor Stinnere1f17c62014-07-25 14:03:03 +02007399 /* (in - startin) <= size and size is an int */
7400 ret = Py_SAFE_DOWNCAST(in - startin, Py_ssize_t, int);
Victor Stinner3a50e702011-10-18 21:21:00 +02007401
7402error:
7403 Py_XDECREF(encoding_obj);
7404 Py_XDECREF(errorHandler);
7405 Py_XDECREF(exc);
7406 return ret;
7407}
7408
Victor Stinner3a50e702011-10-18 21:21:00 +02007409static PyObject *
7410decode_code_page_stateful(int code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007411 const char *s, Py_ssize_t size,
7412 const char *errors, Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007413{
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007414 wchar_t *buf = NULL;
7415 Py_ssize_t bufsize = 0;
Victor Stinner76a31a62011-11-04 00:05:13 +01007416 int chunk_size, final, converted, done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007417
Victor Stinner3a50e702011-10-18 21:21:00 +02007418 if (code_page < 0) {
7419 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7420 return NULL;
7421 }
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03007422 if (size < 0) {
7423 PyErr_BadInternalCall();
7424 return NULL;
7425 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007426
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007427 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00007428 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007429
Victor Stinner76a31a62011-11-04 00:05:13 +01007430 do
7431 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007432#ifdef NEED_RETRY
Steve Dower7ebdda02019-08-21 16:22:33 -07007433 if (size > DECODING_CHUNK_SIZE) {
7434 chunk_size = DECODING_CHUNK_SIZE;
Victor Stinner76a31a62011-11-04 00:05:13 +01007435 final = 0;
7436 done = 0;
7437 }
7438 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007439#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007440 {
7441 chunk_size = (int)size;
7442 final = (consumed == NULL);
7443 done = 1;
7444 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007445
Victor Stinner76a31a62011-11-04 00:05:13 +01007446 if (chunk_size == 0 && done) {
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007447 if (buf != NULL)
Victor Stinner76a31a62011-11-04 00:05:13 +01007448 break;
Serhiy Storchaka678db842013-01-26 12:16:36 +02007449 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner76a31a62011-11-04 00:05:13 +01007450 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007451
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007452 converted = decode_code_page_strict(code_page, &buf, &bufsize,
Victor Stinner76a31a62011-11-04 00:05:13 +01007453 s, chunk_size);
7454 if (converted == -2)
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007455 converted = decode_code_page_errors(code_page, &buf, &bufsize,
Victor Stinner76a31a62011-11-04 00:05:13 +01007456 s, chunk_size,
Victor Stinner7d00cc12014-03-17 23:08:06 +01007457 errors, final);
7458 assert(converted != 0 || done);
Victor Stinner76a31a62011-11-04 00:05:13 +01007459
7460 if (converted < 0) {
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007461 PyMem_Free(buf);
Victor Stinner76a31a62011-11-04 00:05:13 +01007462 return NULL;
7463 }
7464
7465 if (consumed)
7466 *consumed += converted;
7467
7468 s += converted;
7469 size -= converted;
7470 } while (!done);
Victor Stinner3a50e702011-10-18 21:21:00 +02007471
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007472 PyObject *v = PyUnicode_FromWideChar(buf, bufsize);
7473 PyMem_Free(buf);
7474 return v;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007475}
7476
Alexander Belopolsky40018472011-02-26 01:02:56 +00007477PyObject *
Victor Stinner3a50e702011-10-18 21:21:00 +02007478PyUnicode_DecodeCodePageStateful(int code_page,
7479 const char *s,
7480 Py_ssize_t size,
7481 const char *errors,
7482 Py_ssize_t *consumed)
7483{
7484 return decode_code_page_stateful(code_page, s, size, errors, consumed);
7485}
7486
7487PyObject *
7488PyUnicode_DecodeMBCSStateful(const char *s,
7489 Py_ssize_t size,
7490 const char *errors,
7491 Py_ssize_t *consumed)
7492{
7493 return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
7494}
7495
7496PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00007497PyUnicode_DecodeMBCS(const char *s,
7498 Py_ssize_t size,
7499 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007500{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007501 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
7502}
7503
Victor Stinner3a50e702011-10-18 21:21:00 +02007504static DWORD
7505encode_code_page_flags(UINT code_page, const char *errors)
7506{
7507 if (code_page == CP_UTF8) {
Steve Dower3e96f322015-03-02 08:01:10 -08007508 return WC_ERR_INVALID_CHARS;
Victor Stinner3a50e702011-10-18 21:21:00 +02007509 }
7510 else if (code_page == CP_UTF7) {
7511 /* CP_UTF7 only supports flags=0 */
7512 return 0;
7513 }
7514 else {
7515 if (errors != NULL && strcmp(errors, "replace") == 0)
7516 return 0;
7517 else
7518 return WC_NO_BEST_FIT_CHARS;
7519 }
7520}
7521
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007522/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007523 * Encode a Unicode string to a Windows code page into a byte string in strict
7524 * mode.
7525 *
7526 * Returns consumed characters if succeed, returns -2 on encode error, or raise
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007527 * an OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007528 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007529static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007530encode_code_page_strict(UINT code_page, PyObject **outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007531 PyObject *unicode, Py_ssize_t offset, int len,
Victor Stinner3a50e702011-10-18 21:21:00 +02007532 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007533{
Victor Stinner554f3f02010-06-16 23:33:54 +00007534 BOOL usedDefaultChar = FALSE;
Victor Stinner3a50e702011-10-18 21:21:00 +02007535 BOOL *pusedDefaultChar = &usedDefaultChar;
7536 int outsize;
Victor Stinner24729f32011-11-10 20:31:37 +01007537 wchar_t *p;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007538 Py_ssize_t size;
Victor Stinner3a50e702011-10-18 21:21:00 +02007539 const DWORD flags = encode_code_page_flags(code_page, NULL);
7540 char *out;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007541 /* Create a substring so that we can get the UTF-16 representation
7542 of just the slice under consideration. */
7543 PyObject *substring;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007544
Martin v. Löwis3d325192011-11-04 18:23:06 +01007545 assert(len > 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007546
Victor Stinner3a50e702011-10-18 21:21:00 +02007547 if (code_page != CP_UTF8 && code_page != CP_UTF7)
Victor Stinner554f3f02010-06-16 23:33:54 +00007548 pusedDefaultChar = &usedDefaultChar;
Victor Stinner3a50e702011-10-18 21:21:00 +02007549 else
Victor Stinner554f3f02010-06-16 23:33:54 +00007550 pusedDefaultChar = NULL;
Victor Stinner554f3f02010-06-16 23:33:54 +00007551
Victor Stinner2fc507f2011-11-04 20:06:39 +01007552 substring = PyUnicode_Substring(unicode, offset, offset+len);
7553 if (substring == NULL)
7554 return -1;
7555 p = PyUnicode_AsUnicodeAndSize(substring, &size);
7556 if (p == NULL) {
7557 Py_DECREF(substring);
7558 return -1;
7559 }
Victor Stinner9f067f42013-06-05 00:21:31 +02007560 assert(size <= INT_MAX);
Martin v. Löwis3d325192011-11-04 18:23:06 +01007561
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007562 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007563 outsize = WideCharToMultiByte(code_page, flags,
Victor Stinner9f067f42013-06-05 00:21:31 +02007564 p, (int)size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007565 NULL, 0,
7566 NULL, pusedDefaultChar);
7567 if (outsize <= 0)
7568 goto error;
7569 /* If we used a default char, then we failed! */
Victor Stinner2fc507f2011-11-04 20:06:39 +01007570 if (pusedDefaultChar && *pusedDefaultChar) {
7571 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007572 return -2;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007573 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007574
Victor Stinner3a50e702011-10-18 21:21:00 +02007575 if (*outbytes == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007576 /* Create string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007577 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007578 if (*outbytes == NULL) {
7579 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007580 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007581 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007582 out = PyBytes_AS_STRING(*outbytes);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007583 }
7584 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007585 /* Extend string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007586 const Py_ssize_t n = PyBytes_Size(*outbytes);
7587 if (outsize > PY_SSIZE_T_MAX - n) {
7588 PyErr_NoMemory();
Victor Stinner2fc507f2011-11-04 20:06:39 +01007589 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007590 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007591 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007592 if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
7593 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007594 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007595 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007596 out = PyBytes_AS_STRING(*outbytes) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007597 }
7598
7599 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007600 outsize = WideCharToMultiByte(code_page, flags,
Victor Stinner9f067f42013-06-05 00:21:31 +02007601 p, (int)size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007602 out, outsize,
7603 NULL, pusedDefaultChar);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007604 Py_CLEAR(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007605 if (outsize <= 0)
7606 goto error;
7607 if (pusedDefaultChar && *pusedDefaultChar)
7608 return -2;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007609 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00007610
Victor Stinner3a50e702011-10-18 21:21:00 +02007611error:
Victor Stinner2fc507f2011-11-04 20:06:39 +01007612 Py_XDECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007613 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7614 return -2;
7615 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007616 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007617}
7618
Victor Stinner3a50e702011-10-18 21:21:00 +02007619/*
Serhiy Storchakad65c9492015-11-02 14:10:23 +02007620 * Encode a Unicode string to a Windows code page into a byte string using an
Victor Stinner3a50e702011-10-18 21:21:00 +02007621 * error handler.
7622 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007623 * Returns consumed characters if succeed, or raise an OSError and returns
Victor Stinner3a50e702011-10-18 21:21:00 +02007624 * -1 on other error.
7625 */
7626static int
7627encode_code_page_errors(UINT code_page, PyObject **outbytes,
Victor Stinner7581cef2011-11-03 22:32:33 +01007628 PyObject *unicode, Py_ssize_t unicode_offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007629 Py_ssize_t insize, const char* errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007630{
Victor Stinner3a50e702011-10-18 21:21:00 +02007631 const DWORD flags = encode_code_page_flags(code_page, errors);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007632 Py_ssize_t pos = unicode_offset;
7633 Py_ssize_t endin = unicode_offset + insize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007634 /* Ideally, we should get reason from FormatMessage. This is the Windows
7635 2000 English version of the message. */
7636 const char *reason = "invalid character";
7637 /* 4=maximum length of a UTF-8 sequence */
7638 char buffer[4];
7639 BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
7640 Py_ssize_t outsize;
7641 char *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007642 PyObject *errorHandler = NULL;
7643 PyObject *exc = NULL;
7644 PyObject *encoding_obj = NULL;
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007645 const char *encoding;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007646 Py_ssize_t newpos, newoutsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007647 PyObject *rep;
7648 int ret = -1;
7649
7650 assert(insize > 0);
7651
7652 encoding = code_page_name(code_page, &encoding_obj);
7653 if (encoding == NULL)
7654 return -1;
7655
7656 if (errors == NULL || strcmp(errors, "strict") == 0) {
7657 /* The last error was ERROR_NO_UNICODE_TRANSLATION,
7658 then we raise a UnicodeEncodeError. */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007659 make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
Victor Stinner3a50e702011-10-18 21:21:00 +02007660 if (exc != NULL) {
7661 PyCodec_StrictErrors(exc);
7662 Py_DECREF(exc);
7663 }
7664 Py_XDECREF(encoding_obj);
7665 return -1;
7666 }
7667
7668 if (code_page != CP_UTF8 && code_page != CP_UTF7)
7669 pusedDefaultChar = &usedDefaultChar;
7670 else
7671 pusedDefaultChar = NULL;
7672
7673 if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
7674 PyErr_NoMemory();
7675 goto error;
7676 }
7677 outsize = insize * Py_ARRAY_LENGTH(buffer);
7678
7679 if (*outbytes == NULL) {
7680 /* Create string object */
7681 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7682 if (*outbytes == NULL)
7683 goto error;
7684 out = PyBytes_AS_STRING(*outbytes);
7685 }
7686 else {
7687 /* Extend string object */
7688 Py_ssize_t n = PyBytes_Size(*outbytes);
7689 if (n > PY_SSIZE_T_MAX - outsize) {
7690 PyErr_NoMemory();
7691 goto error;
7692 }
7693 if (_PyBytes_Resize(outbytes, n + outsize) < 0)
7694 goto error;
7695 out = PyBytes_AS_STRING(*outbytes) + n;
7696 }
7697
7698 /* Encode the string character per character */
Martin v. Löwis3d325192011-11-04 18:23:06 +01007699 while (pos < endin)
Victor Stinner3a50e702011-10-18 21:21:00 +02007700 {
Victor Stinner2fc507f2011-11-04 20:06:39 +01007701 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
7702 wchar_t chars[2];
7703 int charsize;
7704 if (ch < 0x10000) {
7705 chars[0] = (wchar_t)ch;
7706 charsize = 1;
7707 }
7708 else {
Victor Stinner76df43d2012-10-30 01:42:39 +01007709 chars[0] = Py_UNICODE_HIGH_SURROGATE(ch);
7710 chars[1] = Py_UNICODE_LOW_SURROGATE(ch);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007711 charsize = 2;
7712 }
7713
Victor Stinner3a50e702011-10-18 21:21:00 +02007714 outsize = WideCharToMultiByte(code_page, flags,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007715 chars, charsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02007716 buffer, Py_ARRAY_LENGTH(buffer),
7717 NULL, pusedDefaultChar);
7718 if (outsize > 0) {
7719 if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
7720 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007721 pos++;
Victor Stinner3a50e702011-10-18 21:21:00 +02007722 memcpy(out, buffer, outsize);
7723 out += outsize;
7724 continue;
7725 }
7726 }
7727 else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
7728 PyErr_SetFromWindowsErr(0);
7729 goto error;
7730 }
7731
Victor Stinner3a50e702011-10-18 21:21:00 +02007732 rep = unicode_encode_call_errorhandler(
7733 errors, &errorHandler, encoding, reason,
Victor Stinner7581cef2011-11-03 22:32:33 +01007734 unicode, &exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007735 pos, pos + 1, &newpos);
Victor Stinner3a50e702011-10-18 21:21:00 +02007736 if (rep == NULL)
7737 goto error;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007738 pos = newpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007739
7740 if (PyBytes_Check(rep)) {
7741 outsize = PyBytes_GET_SIZE(rep);
7742 if (outsize != 1) {
7743 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7744 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7745 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7746 Py_DECREF(rep);
7747 goto error;
7748 }
7749 out = PyBytes_AS_STRING(*outbytes) + offset;
7750 }
7751 memcpy(out, PyBytes_AS_STRING(rep), outsize);
7752 out += outsize;
7753 }
7754 else {
7755 Py_ssize_t i;
7756 enum PyUnicode_Kind kind;
7757 void *data;
7758
Benjamin Petersonbac79492012-01-14 13:34:47 -05007759 if (PyUnicode_READY(rep) == -1) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007760 Py_DECREF(rep);
7761 goto error;
7762 }
7763
7764 outsize = PyUnicode_GET_LENGTH(rep);
7765 if (outsize != 1) {
7766 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7767 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7768 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7769 Py_DECREF(rep);
7770 goto error;
7771 }
7772 out = PyBytes_AS_STRING(*outbytes) + offset;
7773 }
7774 kind = PyUnicode_KIND(rep);
7775 data = PyUnicode_DATA(rep);
7776 for (i=0; i < outsize; i++) {
7777 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7778 if (ch > 127) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007779 raise_encode_exception(&exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007780 encoding, unicode,
7781 pos, pos + 1,
Victor Stinner3a50e702011-10-18 21:21:00 +02007782 "unable to encode error handler result to ASCII");
7783 Py_DECREF(rep);
7784 goto error;
7785 }
7786 *out = (unsigned char)ch;
7787 out++;
7788 }
7789 }
7790 Py_DECREF(rep);
7791 }
7792 /* write a NUL byte */
7793 *out = 0;
7794 outsize = out - PyBytes_AS_STRING(*outbytes);
7795 assert(outsize <= PyBytes_GET_SIZE(*outbytes));
7796 if (_PyBytes_Resize(outbytes, outsize) < 0)
7797 goto error;
7798 ret = 0;
7799
7800error:
7801 Py_XDECREF(encoding_obj);
7802 Py_XDECREF(errorHandler);
7803 Py_XDECREF(exc);
7804 return ret;
7805}
7806
Victor Stinner3a50e702011-10-18 21:21:00 +02007807static PyObject *
7808encode_code_page(int code_page,
Victor Stinner7581cef2011-11-03 22:32:33 +01007809 PyObject *unicode,
Victor Stinner3a50e702011-10-18 21:21:00 +02007810 const char *errors)
7811{
Martin v. Löwis3d325192011-11-04 18:23:06 +01007812 Py_ssize_t len;
Victor Stinner3a50e702011-10-18 21:21:00 +02007813 PyObject *outbytes = NULL;
Victor Stinner7581cef2011-11-03 22:32:33 +01007814 Py_ssize_t offset;
Victor Stinner76a31a62011-11-04 00:05:13 +01007815 int chunk_len, ret, done;
Victor Stinner7581cef2011-11-03 22:32:33 +01007816
Victor Stinner29dacf22015-01-26 16:41:32 +01007817 if (!PyUnicode_Check(unicode)) {
7818 PyErr_BadArgument();
7819 return NULL;
7820 }
7821
Benjamin Petersonbac79492012-01-14 13:34:47 -05007822 if (PyUnicode_READY(unicode) == -1)
Victor Stinner2fc507f2011-11-04 20:06:39 +01007823 return NULL;
7824 len = PyUnicode_GET_LENGTH(unicode);
Guido van Rossum03e29f12000-05-04 15:52:20 +00007825
Victor Stinner3a50e702011-10-18 21:21:00 +02007826 if (code_page < 0) {
7827 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7828 return NULL;
7829 }
7830
Martin v. Löwis3d325192011-11-04 18:23:06 +01007831 if (len == 0)
Victor Stinner76a31a62011-11-04 00:05:13 +01007832 return PyBytes_FromStringAndSize(NULL, 0);
7833
Victor Stinner7581cef2011-11-03 22:32:33 +01007834 offset = 0;
7835 do
7836 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007837#ifdef NEED_RETRY
Steve Dower7ebdda02019-08-21 16:22:33 -07007838 if (len > DECODING_CHUNK_SIZE) {
7839 chunk_len = DECODING_CHUNK_SIZE;
Victor Stinner76a31a62011-11-04 00:05:13 +01007840 done = 0;
7841 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007842 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007843#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007844 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007845 chunk_len = (int)len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007846 done = 1;
7847 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007848
Victor Stinner76a31a62011-11-04 00:05:13 +01007849 ret = encode_code_page_strict(code_page, &outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007850 unicode, offset, chunk_len,
Victor Stinner76a31a62011-11-04 00:05:13 +01007851 errors);
7852 if (ret == -2)
7853 ret = encode_code_page_errors(code_page, &outbytes,
7854 unicode, offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007855 chunk_len, errors);
Victor Stinner7581cef2011-11-03 22:32:33 +01007856 if (ret < 0) {
7857 Py_XDECREF(outbytes);
7858 return NULL;
7859 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007860
Victor Stinner7581cef2011-11-03 22:32:33 +01007861 offset += chunk_len;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007862 len -= chunk_len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007863 } while (!done);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007864
Victor Stinner3a50e702011-10-18 21:21:00 +02007865 return outbytes;
7866}
7867
7868PyObject *
7869PyUnicode_EncodeMBCS(const Py_UNICODE *p,
7870 Py_ssize_t size,
7871 const char *errors)
7872{
Victor Stinner7581cef2011-11-03 22:32:33 +01007873 PyObject *unicode, *res;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02007874 unicode = PyUnicode_FromWideChar(p, size);
Victor Stinner7581cef2011-11-03 22:32:33 +01007875 if (unicode == NULL)
7876 return NULL;
7877 res = encode_code_page(CP_ACP, unicode, errors);
7878 Py_DECREF(unicode);
7879 return res;
Victor Stinner3a50e702011-10-18 21:21:00 +02007880}
7881
7882PyObject *
7883PyUnicode_EncodeCodePage(int code_page,
7884 PyObject *unicode,
7885 const char *errors)
7886{
Victor Stinner7581cef2011-11-03 22:32:33 +01007887 return encode_code_page(code_page, unicode, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007888}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007889
Alexander Belopolsky40018472011-02-26 01:02:56 +00007890PyObject *
7891PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007892{
Victor Stinner7581cef2011-11-03 22:32:33 +01007893 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007894}
7895
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007896#undef NEED_RETRY
7897
Steve Dowercc16be82016-09-08 10:35:16 -07007898#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007899
Guido van Rossumd57fd912000-03-10 22:53:23 +00007900/* --- Character Mapping Codec -------------------------------------------- */
7901
Victor Stinnerfb161b12013-04-18 01:44:27 +02007902static int
7903charmap_decode_string(const char *s,
7904 Py_ssize_t size,
7905 PyObject *mapping,
7906 const char *errors,
7907 _PyUnicodeWriter *writer)
7908{
7909 const char *starts = s;
7910 const char *e;
7911 Py_ssize_t startinpos, endinpos;
7912 PyObject *errorHandler = NULL, *exc = NULL;
7913 Py_ssize_t maplen;
7914 enum PyUnicode_Kind mapkind;
7915 void *mapdata;
7916 Py_UCS4 x;
7917 unsigned char ch;
7918
7919 if (PyUnicode_READY(mapping) == -1)
7920 return -1;
7921
7922 maplen = PyUnicode_GET_LENGTH(mapping);
7923 mapdata = PyUnicode_DATA(mapping);
7924 mapkind = PyUnicode_KIND(mapping);
7925
7926 e = s + size;
7927
7928 if (mapkind == PyUnicode_1BYTE_KIND && maplen >= 256) {
7929 /* fast-path for cp037, cp500 and iso8859_1 encodings. iso8859_1
7930 * is disabled in encoding aliases, latin1 is preferred because
7931 * its implementation is faster. */
7932 Py_UCS1 *mapdata_ucs1 = (Py_UCS1 *)mapdata;
7933 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
7934 Py_UCS4 maxchar = writer->maxchar;
7935
7936 assert (writer->kind == PyUnicode_1BYTE_KIND);
7937 while (s < e) {
7938 ch = *s;
7939 x = mapdata_ucs1[ch];
7940 if (x > maxchar) {
7941 if (_PyUnicodeWriter_Prepare(writer, 1, 0xff) == -1)
7942 goto onError;
7943 maxchar = writer->maxchar;
7944 outdata = (Py_UCS1 *)writer->data;
7945 }
7946 outdata[writer->pos] = x;
7947 writer->pos++;
7948 ++s;
7949 }
7950 return 0;
7951 }
7952
7953 while (s < e) {
7954 if (mapkind == PyUnicode_2BYTE_KIND && maplen >= 256) {
7955 enum PyUnicode_Kind outkind = writer->kind;
7956 Py_UCS2 *mapdata_ucs2 = (Py_UCS2 *)mapdata;
7957 if (outkind == PyUnicode_1BYTE_KIND) {
7958 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
7959 Py_UCS4 maxchar = writer->maxchar;
7960 while (s < e) {
7961 ch = *s;
7962 x = mapdata_ucs2[ch];
7963 if (x > maxchar)
7964 goto Error;
7965 outdata[writer->pos] = x;
7966 writer->pos++;
7967 ++s;
7968 }
7969 break;
7970 }
7971 else if (outkind == PyUnicode_2BYTE_KIND) {
7972 Py_UCS2 *outdata = (Py_UCS2 *)writer->data;
7973 while (s < e) {
7974 ch = *s;
7975 x = mapdata_ucs2[ch];
7976 if (x == 0xFFFE)
7977 goto Error;
7978 outdata[writer->pos] = x;
7979 writer->pos++;
7980 ++s;
7981 }
7982 break;
7983 }
7984 }
7985 ch = *s;
7986
7987 if (ch < maplen)
7988 x = PyUnicode_READ(mapkind, mapdata, ch);
7989 else
7990 x = 0xfffe; /* invalid value */
7991Error:
7992 if (x == 0xfffe)
7993 {
7994 /* undefined mapping */
7995 startinpos = s-starts;
7996 endinpos = startinpos+1;
7997 if (unicode_decode_call_errorhandler_writer(
7998 errors, &errorHandler,
7999 "charmap", "character maps to <undefined>",
8000 &starts, &e, &startinpos, &endinpos, &exc, &s,
8001 writer)) {
8002 goto onError;
8003 }
8004 continue;
8005 }
8006
8007 if (_PyUnicodeWriter_WriteCharInline(writer, x) < 0)
8008 goto onError;
8009 ++s;
8010 }
8011 Py_XDECREF(errorHandler);
8012 Py_XDECREF(exc);
8013 return 0;
8014
8015onError:
8016 Py_XDECREF(errorHandler);
8017 Py_XDECREF(exc);
8018 return -1;
8019}
8020
8021static int
8022charmap_decode_mapping(const char *s,
8023 Py_ssize_t size,
8024 PyObject *mapping,
8025 const char *errors,
8026 _PyUnicodeWriter *writer)
8027{
8028 const char *starts = s;
8029 const char *e;
8030 Py_ssize_t startinpos, endinpos;
8031 PyObject *errorHandler = NULL, *exc = NULL;
8032 unsigned char ch;
Victor Stinnerf4f24242013-05-07 01:01:31 +02008033 PyObject *key, *item = NULL;
Victor Stinnerfb161b12013-04-18 01:44:27 +02008034
8035 e = s + size;
8036
8037 while (s < e) {
8038 ch = *s;
8039
8040 /* Get mapping (char ordinal -> integer, Unicode char or None) */
8041 key = PyLong_FromLong((long)ch);
8042 if (key == NULL)
8043 goto onError;
8044
8045 item = PyObject_GetItem(mapping, key);
8046 Py_DECREF(key);
8047 if (item == NULL) {
8048 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8049 /* No mapping found means: mapping is undefined. */
8050 PyErr_Clear();
8051 goto Undefined;
8052 } else
8053 goto onError;
8054 }
8055
8056 /* Apply mapping */
8057 if (item == Py_None)
8058 goto Undefined;
8059 if (PyLong_Check(item)) {
8060 long value = PyLong_AS_LONG(item);
8061 if (value == 0xFFFE)
8062 goto Undefined;
8063 if (value < 0 || value > MAX_UNICODE) {
8064 PyErr_Format(PyExc_TypeError,
8065 "character mapping must be in range(0x%lx)",
8066 (unsigned long)MAX_UNICODE + 1);
8067 goto onError;
8068 }
8069
8070 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
8071 goto onError;
8072 }
8073 else if (PyUnicode_Check(item)) {
8074 if (PyUnicode_READY(item) == -1)
8075 goto onError;
8076 if (PyUnicode_GET_LENGTH(item) == 1) {
8077 Py_UCS4 value = PyUnicode_READ_CHAR(item, 0);
8078 if (value == 0xFFFE)
8079 goto Undefined;
8080 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
8081 goto onError;
8082 }
8083 else {
8084 writer->overallocate = 1;
8085 if (_PyUnicodeWriter_WriteStr(writer, item) == -1)
8086 goto onError;
8087 }
8088 }
8089 else {
8090 /* wrong return value */
8091 PyErr_SetString(PyExc_TypeError,
8092 "character mapping must return integer, None or str");
8093 goto onError;
8094 }
8095 Py_CLEAR(item);
8096 ++s;
8097 continue;
8098
8099Undefined:
8100 /* undefined mapping */
8101 Py_CLEAR(item);
8102 startinpos = s-starts;
8103 endinpos = startinpos+1;
8104 if (unicode_decode_call_errorhandler_writer(
8105 errors, &errorHandler,
8106 "charmap", "character maps to <undefined>",
8107 &starts, &e, &startinpos, &endinpos, &exc, &s,
8108 writer)) {
8109 goto onError;
8110 }
8111 }
8112 Py_XDECREF(errorHandler);
8113 Py_XDECREF(exc);
8114 return 0;
8115
8116onError:
8117 Py_XDECREF(item);
8118 Py_XDECREF(errorHandler);
8119 Py_XDECREF(exc);
8120 return -1;
8121}
8122
Alexander Belopolsky40018472011-02-26 01:02:56 +00008123PyObject *
8124PyUnicode_DecodeCharmap(const char *s,
8125 Py_ssize_t size,
8126 PyObject *mapping,
8127 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008128{
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008129 _PyUnicodeWriter writer;
Tim Petersced69f82003-09-16 20:30:58 +00008130
Guido van Rossumd57fd912000-03-10 22:53:23 +00008131 /* Default to Latin-1 */
8132 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008133 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008134
Guido van Rossumd57fd912000-03-10 22:53:23 +00008135 if (size == 0)
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02008136 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner8f674cc2013-04-17 23:02:17 +02008137 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02008138 writer.min_length = size;
8139 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008140 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008141
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00008142 if (PyUnicode_CheckExact(mapping)) {
Victor Stinnerfb161b12013-04-18 01:44:27 +02008143 if (charmap_decode_string(s, size, mapping, errors, &writer) < 0)
8144 goto onError;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00008145 }
8146 else {
Victor Stinnerfb161b12013-04-18 01:44:27 +02008147 if (charmap_decode_mapping(s, size, mapping, errors, &writer) < 0)
8148 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008149 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008150 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00008151
Benjamin Peterson29060642009-01-31 22:14:21 +00008152 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008153 _PyUnicodeWriter_Dealloc(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008154 return NULL;
8155}
8156
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008157/* Charmap encoding: the lookup table */
8158
Alexander Belopolsky40018472011-02-26 01:02:56 +00008159struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00008160 PyObject_HEAD
8161 unsigned char level1[32];
8162 int count2, count3;
8163 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008164};
8165
8166static PyObject*
8167encoding_map_size(PyObject *obj, PyObject* args)
8168{
8169 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008170 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00008171 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008172}
8173
8174static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008175 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00008176 PyDoc_STR("Return the size (in bytes) of this object") },
8177 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008178};
8179
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008180static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008181 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008182 "EncodingMap", /*tp_name*/
8183 sizeof(struct encoding_map), /*tp_basicsize*/
8184 0, /*tp_itemsize*/
8185 /* methods */
Inada Naoki7d408692019-05-29 17:23:27 +09008186 0, /*tp_dealloc*/
Jeroen Demeyer530f5062019-05-31 04:13:39 +02008187 0, /*tp_vectorcall_offset*/
Benjamin Peterson29060642009-01-31 22:14:21 +00008188 0, /*tp_getattr*/
8189 0, /*tp_setattr*/
Jeroen Demeyer530f5062019-05-31 04:13:39 +02008190 0, /*tp_as_async*/
Benjamin Peterson29060642009-01-31 22:14:21 +00008191 0, /*tp_repr*/
8192 0, /*tp_as_number*/
8193 0, /*tp_as_sequence*/
8194 0, /*tp_as_mapping*/
8195 0, /*tp_hash*/
8196 0, /*tp_call*/
8197 0, /*tp_str*/
8198 0, /*tp_getattro*/
8199 0, /*tp_setattro*/
8200 0, /*tp_as_buffer*/
8201 Py_TPFLAGS_DEFAULT, /*tp_flags*/
8202 0, /*tp_doc*/
8203 0, /*tp_traverse*/
8204 0, /*tp_clear*/
8205 0, /*tp_richcompare*/
8206 0, /*tp_weaklistoffset*/
8207 0, /*tp_iter*/
8208 0, /*tp_iternext*/
8209 encoding_map_methods, /*tp_methods*/
8210 0, /*tp_members*/
8211 0, /*tp_getset*/
8212 0, /*tp_base*/
8213 0, /*tp_dict*/
8214 0, /*tp_descr_get*/
8215 0, /*tp_descr_set*/
8216 0, /*tp_dictoffset*/
8217 0, /*tp_init*/
8218 0, /*tp_alloc*/
8219 0, /*tp_new*/
8220 0, /*tp_free*/
8221 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008222};
8223
8224PyObject*
8225PyUnicode_BuildEncodingMap(PyObject* string)
8226{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008227 PyObject *result;
8228 struct encoding_map *mresult;
8229 int i;
8230 int need_dict = 0;
8231 unsigned char level1[32];
8232 unsigned char level2[512];
8233 unsigned char *mlevel1, *mlevel2, *mlevel3;
8234 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008235 int kind;
8236 void *data;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008237 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008238 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008239
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008240 if (!PyUnicode_Check(string) || !PyUnicode_GET_LENGTH(string)) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008241 PyErr_BadArgument();
8242 return NULL;
8243 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008244 kind = PyUnicode_KIND(string);
8245 data = PyUnicode_DATA(string);
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008246 length = PyUnicode_GET_LENGTH(string);
8247 length = Py_MIN(length, 256);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008248 memset(level1, 0xFF, sizeof level1);
8249 memset(level2, 0xFF, sizeof level2);
8250
8251 /* If there isn't a one-to-one mapping of NULL to \0,
8252 or if there are non-BMP characters, we need to use
8253 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008254 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008255 need_dict = 1;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008256 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008257 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008258 ch = PyUnicode_READ(kind, data, i);
8259 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008260 need_dict = 1;
8261 break;
8262 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008263 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008264 /* unmapped character */
8265 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008266 l1 = ch >> 11;
8267 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008268 if (level1[l1] == 0xFF)
8269 level1[l1] = count2++;
8270 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00008271 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008272 }
8273
8274 if (count2 >= 0xFF || count3 >= 0xFF)
8275 need_dict = 1;
8276
8277 if (need_dict) {
8278 PyObject *result = PyDict_New();
8279 PyObject *key, *value;
8280 if (!result)
8281 return NULL;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008282 for (i = 0; i < length; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008283 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00008284 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008285 if (!key || !value)
8286 goto failed1;
8287 if (PyDict_SetItem(result, key, value) == -1)
8288 goto failed1;
8289 Py_DECREF(key);
8290 Py_DECREF(value);
8291 }
8292 return result;
8293 failed1:
8294 Py_XDECREF(key);
8295 Py_XDECREF(value);
8296 Py_DECREF(result);
8297 return NULL;
8298 }
8299
8300 /* Create a three-level trie */
8301 result = PyObject_MALLOC(sizeof(struct encoding_map) +
8302 16*count2 + 128*count3 - 1);
8303 if (!result)
8304 return PyErr_NoMemory();
8305 PyObject_Init(result, &EncodingMapType);
8306 mresult = (struct encoding_map*)result;
8307 mresult->count2 = count2;
8308 mresult->count3 = count3;
8309 mlevel1 = mresult->level1;
8310 mlevel2 = mresult->level23;
8311 mlevel3 = mresult->level23 + 16*count2;
8312 memcpy(mlevel1, level1, 32);
8313 memset(mlevel2, 0xFF, 16*count2);
8314 memset(mlevel3, 0, 128*count3);
8315 count3 = 0;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008316 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008317 int o1, o2, o3, i2, i3;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008318 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8319 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008320 /* unmapped character */
8321 continue;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008322 o1 = ch>>11;
8323 o2 = (ch>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008324 i2 = 16*mlevel1[o1] + o2;
8325 if (mlevel2[i2] == 0xFF)
8326 mlevel2[i2] = count3++;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008327 o3 = ch & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008328 i3 = 128*mlevel2[i2] + o3;
8329 mlevel3[i3] = i;
8330 }
8331 return result;
8332}
8333
8334static int
Victor Stinner22168992011-11-20 17:09:18 +01008335encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008336{
8337 struct encoding_map *map = (struct encoding_map*)mapping;
8338 int l1 = c>>11;
8339 int l2 = (c>>7) & 0xF;
8340 int l3 = c & 0x7F;
8341 int i;
8342
Victor Stinner22168992011-11-20 17:09:18 +01008343 if (c > 0xFFFF)
Benjamin Peterson29060642009-01-31 22:14:21 +00008344 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008345 if (c == 0)
8346 return 0;
8347 /* level 1*/
8348 i = map->level1[l1];
8349 if (i == 0xFF) {
8350 return -1;
8351 }
8352 /* level 2*/
8353 i = map->level23[16*i+l2];
8354 if (i == 0xFF) {
8355 return -1;
8356 }
8357 /* level 3 */
8358 i = map->level23[16*map->count2 + 128*i + l3];
8359 if (i == 0) {
8360 return -1;
8361 }
8362 return i;
8363}
8364
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008365/* Lookup the character ch in the mapping. If the character
8366 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00008367 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008368static PyObject *
Victor Stinner22168992011-11-20 17:09:18 +01008369charmapencode_lookup(Py_UCS4 c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008370{
Christian Heimes217cfd12007-12-02 14:31:20 +00008371 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008372 PyObject *x;
8373
8374 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008375 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008376 x = PyObject_GetItem(mapping, w);
8377 Py_DECREF(w);
8378 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008379 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8380 /* No mapping found means: mapping is undefined. */
8381 PyErr_Clear();
Serhiy Storchaka228b12e2017-01-23 09:47:21 +02008382 Py_RETURN_NONE;
Benjamin Peterson29060642009-01-31 22:14:21 +00008383 } else
8384 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008385 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00008386 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008387 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00008388 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008389 long value = PyLong_AS_LONG(x);
8390 if (value < 0 || value > 255) {
8391 PyErr_SetString(PyExc_TypeError,
8392 "character mapping must be in range(256)");
8393 Py_DECREF(x);
8394 return NULL;
8395 }
8396 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008397 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008398 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008399 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008400 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008401 /* wrong return value */
8402 PyErr_Format(PyExc_TypeError,
8403 "character mapping must return integer, bytes or None, not %.400s",
Victor Stinner58ac7002020-02-07 03:04:21 +01008404 Py_TYPE(x)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +00008405 Py_DECREF(x);
8406 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008407 }
8408}
8409
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008410static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00008411charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008412{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008413 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
8414 /* exponentially overallocate to minimize reallocations */
8415 if (requiredsize < 2*outsize)
8416 requiredsize = 2*outsize;
8417 if (_PyBytes_Resize(outobj, requiredsize))
8418 return -1;
8419 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008420}
8421
/* Outcome of trying to encode one character with charmapencode_output(). */
typedef enum charmapencode_result {
    enc_SUCCESS = 0,    /* the replacement was written to the output */
    enc_FAILED = 1,     /* the mapping is undefined for the character */
    enc_EXCEPTION = 2   /* the lookup or the output resize failed */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008425/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00008426 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008427 space is available. Return a new reference to the object that
8428 was put in the output buffer, or Py_None, if the mapping was undefined
8429 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00008430 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008431static charmapencode_result
Victor Stinner22168992011-11-20 17:09:18 +01008432charmapencode_output(Py_UCS4 c, PyObject *mapping,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008433 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008434{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008435 PyObject *rep;
8436 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00008437 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008438
Christian Heimes90aa7642007-12-19 02:45:37 +00008439 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008440 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008441 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008442 if (res == -1)
8443 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00008444 if (outsize<requiredsize)
8445 if (charmapencode_resize(outobj, outpos, requiredsize))
8446 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00008447 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008448 outstart[(*outpos)++] = (char)res;
8449 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008450 }
8451
8452 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008453 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008454 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008455 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008456 Py_DECREF(rep);
8457 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008458 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008459 if (PyLong_Check(rep)) {
8460 Py_ssize_t requiredsize = *outpos+1;
8461 if (outsize<requiredsize)
8462 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8463 Py_DECREF(rep);
8464 return enc_EXCEPTION;
8465 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008466 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008467 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008468 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008469 else {
8470 const char *repchars = PyBytes_AS_STRING(rep);
8471 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
8472 Py_ssize_t requiredsize = *outpos+repsize;
8473 if (outsize<requiredsize)
8474 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8475 Py_DECREF(rep);
8476 return enc_EXCEPTION;
8477 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008478 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008479 memcpy(outstart + *outpos, repchars, repsize);
8480 *outpos += repsize;
8481 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008482 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008483 Py_DECREF(rep);
8484 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008485}
8486
8487/* handle an error in PyUnicode_EncodeCharmap
8488 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008489static int
8490charmap_encoding_error(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008491 PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008492 PyObject **exceptionObject,
Victor Stinner50149202015-09-22 00:26:54 +02008493 _Py_error_handler *error_handler, PyObject **error_handler_obj, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00008494 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008495{
8496 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008497 Py_ssize_t size, repsize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008498 Py_ssize_t newpos;
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008499 enum PyUnicode_Kind kind;
8500 void *data;
8501 Py_ssize_t index;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008502 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008503 Py_ssize_t collstartpos = *inpos;
8504 Py_ssize_t collendpos = *inpos+1;
8505 Py_ssize_t collpos;
Serhiy Storchakae2f92de2017-11-11 13:06:26 +02008506 const char *encoding = "charmap";
8507 const char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008508 charmapencode_result x;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008509 Py_UCS4 ch;
Brian Curtin2787ea42011-11-02 15:09:37 -05008510 int val;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008511
Benjamin Petersonbac79492012-01-14 13:34:47 -05008512 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008513 return -1;
8514 size = PyUnicode_GET_LENGTH(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008515 /* find all unencodable characters */
8516 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008517 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00008518 if (Py_TYPE(mapping) == &EncodingMapType) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008519 ch = PyUnicode_READ_CHAR(unicode, collendpos);
Brian Curtin2787ea42011-11-02 15:09:37 -05008520 val = encoding_map_lookup(ch, mapping);
8521 if (val != -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008522 break;
8523 ++collendpos;
8524 continue;
8525 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008526
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008527 ch = PyUnicode_READ_CHAR(unicode, collendpos);
8528 rep = charmapencode_lookup(ch, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008529 if (rep==NULL)
8530 return -1;
8531 else if (rep!=Py_None) {
8532 Py_DECREF(rep);
8533 break;
8534 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008535 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00008536 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008537 }
8538 /* cache callback name lookup
8539 * (if not done yet, i.e. it's the first error) */
Victor Stinner50149202015-09-22 00:26:54 +02008540 if (*error_handler == _Py_ERROR_UNKNOWN)
Victor Stinner3d4226a2018-08-29 22:21:32 +02008541 *error_handler = _Py_GetErrorHandler(errors);
Victor Stinner50149202015-09-22 00:26:54 +02008542
8543 switch (*error_handler) {
8544 case _Py_ERROR_STRICT:
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008545 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008546 return -1;
Victor Stinner50149202015-09-22 00:26:54 +02008547
8548 case _Py_ERROR_REPLACE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008549 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008550 x = charmapencode_output('?', mapping, res, respos);
8551 if (x==enc_EXCEPTION) {
8552 return -1;
8553 }
8554 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008555 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008556 return -1;
8557 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008558 }
8559 /* fall through */
Victor Stinner50149202015-09-22 00:26:54 +02008560 case _Py_ERROR_IGNORE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008561 *inpos = collendpos;
8562 break;
Victor Stinner50149202015-09-22 00:26:54 +02008563
8564 case _Py_ERROR_XMLCHARREFREPLACE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008565 /* generate replacement (temporarily (mis)uses p) */
8566 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008567 char buffer[2+29+1+1];
8568 char *cp;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008569 sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
Benjamin Peterson29060642009-01-31 22:14:21 +00008570 for (cp = buffer; *cp; ++cp) {
8571 x = charmapencode_output(*cp, mapping, res, respos);
8572 if (x==enc_EXCEPTION)
8573 return -1;
8574 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008575 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008576 return -1;
8577 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008578 }
8579 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008580 *inpos = collendpos;
8581 break;
Victor Stinner50149202015-09-22 00:26:54 +02008582
Benjamin Peterson14339b62009-01-31 16:36:08 +00008583 default:
Victor Stinner50149202015-09-22 00:26:54 +02008584 repunicode = unicode_encode_call_errorhandler(errors, error_handler_obj,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008585 encoding, reason, unicode, exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00008586 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008587 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008588 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00008589 if (PyBytes_Check(repunicode)) {
8590 /* Directly copy bytes result to output. */
8591 Py_ssize_t outsize = PyBytes_Size(*res);
8592 Py_ssize_t requiredsize;
8593 repsize = PyBytes_Size(repunicode);
8594 requiredsize = *respos + repsize;
8595 if (requiredsize > outsize)
8596 /* Make room for all additional bytes. */
8597 if (charmapencode_resize(res, respos, requiredsize)) {
8598 Py_DECREF(repunicode);
8599 return -1;
8600 }
8601 memcpy(PyBytes_AsString(*res) + *respos,
8602 PyBytes_AsString(repunicode), repsize);
8603 *respos += repsize;
8604 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008605 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00008606 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008607 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008608 /* generate replacement */
Benjamin Petersonbac79492012-01-14 13:34:47 -05008609 if (PyUnicode_READY(repunicode) == -1) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008610 Py_DECREF(repunicode);
8611 return -1;
8612 }
Victor Stinner9e30aa52011-11-21 02:49:52 +01008613 repsize = PyUnicode_GET_LENGTH(repunicode);
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008614 data = PyUnicode_DATA(repunicode);
8615 kind = PyUnicode_KIND(repunicode);
8616 for (index = 0; index < repsize; index++) {
8617 Py_UCS4 repch = PyUnicode_READ(kind, data, index);
8618 x = charmapencode_output(repch, mapping, res, respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008619 if (x==enc_EXCEPTION) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008620 Py_DECREF(repunicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00008621 return -1;
8622 }
8623 else if (x==enc_FAILED) {
8624 Py_DECREF(repunicode);
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008625 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008626 return -1;
8627 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008628 }
8629 *inpos = newpos;
8630 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008631 }
8632 return 0;
8633}
8634
Alexander Belopolsky40018472011-02-26 01:02:56 +00008635PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008636_PyUnicode_EncodeCharmap(PyObject *unicode,
8637 PyObject *mapping,
8638 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008639{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008640 /* output object */
8641 PyObject *res = NULL;
8642 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008643 Py_ssize_t inpos = 0;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008644 Py_ssize_t size;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008645 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008646 Py_ssize_t respos = 0;
Victor Stinner50149202015-09-22 00:26:54 +02008647 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008648 PyObject *exc = NULL;
Victor Stinner50149202015-09-22 00:26:54 +02008649 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Victor Stinner69ed0f42013-04-09 21:48:24 +02008650 void *data;
8651 int kind;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008652
Benjamin Petersonbac79492012-01-14 13:34:47 -05008653 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008654 return NULL;
8655 size = PyUnicode_GET_LENGTH(unicode);
Victor Stinner69ed0f42013-04-09 21:48:24 +02008656 data = PyUnicode_DATA(unicode);
8657 kind = PyUnicode_KIND(unicode);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008658
Guido van Rossumd57fd912000-03-10 22:53:23 +00008659 /* Default to Latin-1 */
8660 if (mapping == NULL)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008661 return unicode_encode_ucs1(unicode, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008662
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008663 /* allocate enough for a simple encoding without
8664 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00008665 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008666 if (res == NULL)
8667 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00008668 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008669 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008670
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008671 while (inpos<size) {
Victor Stinner69ed0f42013-04-09 21:48:24 +02008672 Py_UCS4 ch = PyUnicode_READ(kind, data, inpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008673 /* try to encode it */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008674 charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008675 if (x==enc_EXCEPTION) /* error */
8676 goto onError;
8677 if (x==enc_FAILED) { /* unencodable character */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008678 if (charmap_encoding_error(unicode, &inpos, mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00008679 &exc,
Victor Stinner50149202015-09-22 00:26:54 +02008680 &error_handler, &error_handler_obj, errors,
Benjamin Peterson29060642009-01-31 22:14:21 +00008681 &res, &respos)) {
8682 goto onError;
8683 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008684 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008685 else
8686 /* done with this character => adjust input position */
8687 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008688 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008689
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008690 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00008691 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00008692 if (_PyBytes_Resize(&res, respos) < 0)
8693 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00008694
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008695 Py_XDECREF(exc);
Victor Stinner50149202015-09-22 00:26:54 +02008696 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008697 return res;
8698
Benjamin Peterson29060642009-01-31 22:14:21 +00008699 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008700 Py_XDECREF(res);
8701 Py_XDECREF(exc);
Victor Stinner50149202015-09-22 00:26:54 +02008702 Py_XDECREF(error_handler_obj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008703 return NULL;
8704}
8705
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008706/* Deprecated */
8707PyObject *
8708PyUnicode_EncodeCharmap(const Py_UNICODE *p,
8709 Py_ssize_t size,
8710 PyObject *mapping,
8711 const char *errors)
8712{
8713 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02008714 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008715 if (unicode == NULL)
8716 return NULL;
8717 result = _PyUnicode_EncodeCharmap(unicode, mapping, errors);
8718 Py_DECREF(unicode);
Victor Stinnerfc026c92011-11-04 00:24:51 +01008719 return result;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008720}
8721
Alexander Belopolsky40018472011-02-26 01:02:56 +00008722PyObject *
8723PyUnicode_AsCharmapString(PyObject *unicode,
8724 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008725{
8726 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008727 PyErr_BadArgument();
8728 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008729 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008730 return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008731}
8732
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008733/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008734static void
8735make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008736 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008737 Py_ssize_t startpos, Py_ssize_t endpos,
8738 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008739{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008740 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008741 *exceptionObject = _PyUnicodeTranslateError_Create(
8742 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008743 }
8744 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008745 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
8746 goto onError;
8747 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
8748 goto onError;
8749 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
8750 goto onError;
8751 return;
8752 onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02008753 Py_CLEAR(*exceptionObject);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008754 }
8755}
8756
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008757/* error handling callback helper:
8758 build arguments, call the callback and check the arguments,
8759 put the result into newpos and return the replacement string, which
8760 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008761static PyObject *
8762unicode_translate_call_errorhandler(const char *errors,
8763 PyObject **errorHandler,
8764 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008765 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008766 Py_ssize_t startpos, Py_ssize_t endpos,
8767 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008768{
Serhiy Storchakaf8d7d412016-10-23 15:12:25 +03008769 static const char *argparse = "Un;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008770
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008771 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008772 PyObject *restuple;
8773 PyObject *resunicode;
8774
8775 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008776 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008777 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008778 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008779 }
8780
8781 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008782 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008783 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008784 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008785
Jeroen Demeyer196a5302019-07-04 12:31:34 +02008786 restuple = _PyObject_CallOneArg(*errorHandler, *exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008787 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008788 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008789 if (!PyTuple_Check(restuple)) {
Serhiy Storchakaf8d7d412016-10-23 15:12:25 +03008790 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00008791 Py_DECREF(restuple);
8792 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008793 }
Serhiy Storchakaf8d7d412016-10-23 15:12:25 +03008794 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00008795 &resunicode, &i_newpos)) {
8796 Py_DECREF(restuple);
8797 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008798 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00008799 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008800 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008801 else
8802 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008803 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnera33bce02014-07-04 22:47:46 +02008804 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008805 Py_DECREF(restuple);
8806 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00008807 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008808 Py_INCREF(resunicode);
8809 Py_DECREF(restuple);
8810 return resunicode;
8811}
8812
8813/* Lookup the character ch in the mapping and put the result in result,
8814 which must be decrefed by the caller.
8815 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008816static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008817charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008818{
Christian Heimes217cfd12007-12-02 14:31:20 +00008819 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008820 PyObject *x;
8821
8822 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008823 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008824 x = PyObject_GetItem(mapping, w);
8825 Py_DECREF(w);
8826 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008827 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8828 /* No mapping found means: use 1:1 mapping. */
8829 PyErr_Clear();
8830 *result = NULL;
8831 return 0;
8832 } else
8833 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008834 }
8835 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008836 *result = x;
8837 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008838 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008839 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008840 long value = PyLong_AS_LONG(x);
Victor Stinner4ff33af2014-04-05 11:56:37 +02008841 if (value < 0 || value > MAX_UNICODE) {
8842 PyErr_Format(PyExc_ValueError,
8843 "character mapping must be in range(0x%x)",
8844 MAX_UNICODE+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00008845 Py_DECREF(x);
8846 return -1;
8847 }
8848 *result = x;
8849 return 0;
8850 }
8851 else if (PyUnicode_Check(x)) {
8852 *result = x;
8853 return 0;
8854 }
8855 else {
8856 /* wrong return value */
8857 PyErr_SetString(PyExc_TypeError,
8858 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00008859 Py_DECREF(x);
8860 return -1;
8861 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008862}
Victor Stinner1194ea02014-04-04 19:37:40 +02008863
8864/* lookup the character, write the result into the writer.
8865 Return 1 if the result was written into the writer, return 0 if the mapping
8866 was undefined, raise an exception return -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008867static int
Victor Stinner1194ea02014-04-04 19:37:40 +02008868charmaptranslate_output(Py_UCS4 ch, PyObject *mapping,
8869 _PyUnicodeWriter *writer)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008870{
Victor Stinner1194ea02014-04-04 19:37:40 +02008871 PyObject *item;
8872
8873 if (charmaptranslate_lookup(ch, mapping, &item))
Benjamin Peterson29060642009-01-31 22:14:21 +00008874 return -1;
Victor Stinner1194ea02014-04-04 19:37:40 +02008875
8876 if (item == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008877 /* not found => default to 1:1 mapping */
Victor Stinner1194ea02014-04-04 19:37:40 +02008878 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008879 return -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00008880 }
Victor Stinner1194ea02014-04-04 19:37:40 +02008881 return 1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008882 }
Victor Stinner1194ea02014-04-04 19:37:40 +02008883
8884 if (item == Py_None) {
8885 Py_DECREF(item);
8886 return 0;
8887 }
8888
8889 if (PyLong_Check(item)) {
Victor Stinner4ff33af2014-04-05 11:56:37 +02008890 long ch = (Py_UCS4)PyLong_AS_LONG(item);
8891 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
8892 used it */
Victor Stinner1194ea02014-04-04 19:37:40 +02008893 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
8894 Py_DECREF(item);
8895 return -1;
8896 }
8897 Py_DECREF(item);
8898 return 1;
8899 }
8900
8901 if (!PyUnicode_Check(item)) {
8902 Py_DECREF(item);
Benjamin Peterson29060642009-01-31 22:14:21 +00008903 return -1;
Victor Stinner1194ea02014-04-04 19:37:40 +02008904 }
8905
8906 if (_PyUnicodeWriter_WriteStr(writer, item) < 0) {
8907 Py_DECREF(item);
8908 return -1;
8909 }
8910
8911 Py_DECREF(item);
8912 return 1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008913}
8914
Victor Stinner89a76ab2014-04-05 11:44:04 +02008915static int
8916unicode_fast_translate_lookup(PyObject *mapping, Py_UCS1 ch,
8917 Py_UCS1 *translate)
8918{
Benjamin Peterson1365de72014-04-07 20:15:41 -04008919 PyObject *item = NULL;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008920 int ret = 0;
8921
Victor Stinner89a76ab2014-04-05 11:44:04 +02008922 if (charmaptranslate_lookup(ch, mapping, &item)) {
8923 return -1;
8924 }
8925
8926 if (item == Py_None) {
Benjamin Peterson1365de72014-04-07 20:15:41 -04008927 /* deletion */
Victor Stinner872b2912014-04-05 14:27:07 +02008928 translate[ch] = 0xfe;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008929 }
Benjamin Peterson1365de72014-04-07 20:15:41 -04008930 else if (item == NULL) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02008931 /* not found => default to 1:1 mapping */
8932 translate[ch] = ch;
8933 return 1;
8934 }
Benjamin Peterson1365de72014-04-07 20:15:41 -04008935 else if (PyLong_Check(item)) {
Victor Stinner4dd25252014-04-08 09:14:21 +02008936 long replace = PyLong_AS_LONG(item);
Victor Stinner4ff33af2014-04-05 11:56:37 +02008937 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
8938 used it */
8939 if (127 < replace) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02008940 /* invalid character or character outside ASCII:
8941 skip the fast translate */
8942 goto exit;
8943 }
8944 translate[ch] = (Py_UCS1)replace;
8945 }
8946 else if (PyUnicode_Check(item)) {
8947 Py_UCS4 replace;
8948
8949 if (PyUnicode_READY(item) == -1) {
8950 Py_DECREF(item);
8951 return -1;
8952 }
8953 if (PyUnicode_GET_LENGTH(item) != 1)
8954 goto exit;
8955
8956 replace = PyUnicode_READ_CHAR(item, 0);
8957 if (replace > 127)
8958 goto exit;
8959 translate[ch] = (Py_UCS1)replace;
8960 }
8961 else {
Benjamin Peterson1365de72014-04-07 20:15:41 -04008962 /* not None, NULL, long or unicode */
Victor Stinner89a76ab2014-04-05 11:44:04 +02008963 goto exit;
8964 }
Victor Stinner89a76ab2014-04-05 11:44:04 +02008965 ret = 1;
8966
Benjamin Peterson1365de72014-04-07 20:15:41 -04008967 exit:
8968 Py_DECREF(item);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008969 return ret;
8970}
8971
8972/* Fast path for ascii => ascii translation. Return 1 if the whole string
8973 was translated into writer, return 0 if the input string was partially
8974 translated into writer, raise an exception and return -1 on error. */
8975static int
8976unicode_fast_translate(PyObject *input, PyObject *mapping,
Victor Stinner6c9aa8f2016-03-01 21:30:30 +01008977 _PyUnicodeWriter *writer, int ignore,
8978 Py_ssize_t *input_pos)
Victor Stinner89a76ab2014-04-05 11:44:04 +02008979{
Victor Stinner872b2912014-04-05 14:27:07 +02008980 Py_UCS1 ascii_table[128], ch, ch2;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008981 Py_ssize_t len;
8982 Py_UCS1 *in, *end, *out;
Victor Stinner872b2912014-04-05 14:27:07 +02008983 int res = 0;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008984
Victor Stinner89a76ab2014-04-05 11:44:04 +02008985 len = PyUnicode_GET_LENGTH(input);
8986
Victor Stinner872b2912014-04-05 14:27:07 +02008987 memset(ascii_table, 0xff, 128);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008988
8989 in = PyUnicode_1BYTE_DATA(input);
8990 end = in + len;
8991
8992 assert(PyUnicode_IS_ASCII(writer->buffer));
8993 assert(PyUnicode_GET_LENGTH(writer->buffer) == len);
8994 out = PyUnicode_1BYTE_DATA(writer->buffer);
8995
Victor Stinner872b2912014-04-05 14:27:07 +02008996 for (; in < end; in++) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02008997 ch = *in;
Victor Stinner872b2912014-04-05 14:27:07 +02008998 ch2 = ascii_table[ch];
Victor Stinner89a76ab2014-04-05 11:44:04 +02008999 if (ch2 == 0xff) {
Victor Stinner872b2912014-04-05 14:27:07 +02009000 int translate = unicode_fast_translate_lookup(mapping, ch,
9001 ascii_table);
9002 if (translate < 0)
Victor Stinner89a76ab2014-04-05 11:44:04 +02009003 return -1;
Victor Stinner872b2912014-04-05 14:27:07 +02009004 if (translate == 0)
9005 goto exit;
9006 ch2 = ascii_table[ch];
Victor Stinner89a76ab2014-04-05 11:44:04 +02009007 }
Victor Stinner872b2912014-04-05 14:27:07 +02009008 if (ch2 == 0xfe) {
9009 if (ignore)
9010 continue;
9011 goto exit;
9012 }
9013 assert(ch2 < 128);
Victor Stinner89a76ab2014-04-05 11:44:04 +02009014 *out = ch2;
Victor Stinner872b2912014-04-05 14:27:07 +02009015 out++;
Victor Stinner89a76ab2014-04-05 11:44:04 +02009016 }
Victor Stinner872b2912014-04-05 14:27:07 +02009017 res = 1;
9018
9019exit:
9020 writer->pos = out - PyUnicode_1BYTE_DATA(writer->buffer);
Victor Stinner6c9aa8f2016-03-01 21:30:30 +01009021 *input_pos = in - PyUnicode_1BYTE_DATA(input);
Victor Stinner872b2912014-04-05 14:27:07 +02009022 return res;
Victor Stinner89a76ab2014-04-05 11:44:04 +02009023}
9024
Victor Stinner3222da22015-10-01 22:07:32 +02009025static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009026_PyUnicode_TranslateCharmap(PyObject *input,
9027 PyObject *mapping,
9028 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009029{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009030 /* input object */
Victor Stinner1194ea02014-04-04 19:37:40 +02009031 char *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009032 Py_ssize_t size, i;
9033 int kind;
9034 /* output buffer */
Victor Stinner1194ea02014-04-04 19:37:40 +02009035 _PyUnicodeWriter writer;
9036 /* error handler */
Serhiy Storchakae2f92de2017-11-11 13:06:26 +02009037 const char *reason = "character maps to <undefined>";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009038 PyObject *errorHandler = NULL;
9039 PyObject *exc = NULL;
Victor Stinner1194ea02014-04-04 19:37:40 +02009040 int ignore;
Victor Stinner89a76ab2014-04-05 11:44:04 +02009041 int res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009042
Guido van Rossumd57fd912000-03-10 22:53:23 +00009043 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009044 PyErr_BadArgument();
9045 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009046 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009047
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009048 if (PyUnicode_READY(input) == -1)
9049 return NULL;
Victor Stinner1194ea02014-04-04 19:37:40 +02009050 data = (char*)PyUnicode_DATA(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009051 kind = PyUnicode_KIND(input);
9052 size = PyUnicode_GET_LENGTH(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009053
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009054 if (size == 0)
9055 return PyUnicode_FromObject(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009056
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009057 /* allocate enough for a simple 1:1 translation without
9058 replacements, if we need more, we'll resize */
Victor Stinner1194ea02014-04-04 19:37:40 +02009059 _PyUnicodeWriter_Init(&writer);
9060 if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00009061 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009062
Victor Stinner872b2912014-04-05 14:27:07 +02009063 ignore = (errors != NULL && strcmp(errors, "ignore") == 0);
9064
Victor Stinner33798672016-03-01 21:59:58 +01009065 if (PyUnicode_READY(input) == -1)
Victor Stinner89a76ab2014-04-05 11:44:04 +02009066 return NULL;
Victor Stinner33798672016-03-01 21:59:58 +01009067 if (PyUnicode_IS_ASCII(input)) {
9068 res = unicode_fast_translate(input, mapping, &writer, ignore, &i);
9069 if (res < 0) {
9070 _PyUnicodeWriter_Dealloc(&writer);
9071 return NULL;
9072 }
9073 if (res == 1)
9074 return _PyUnicodeWriter_Finish(&writer);
Victor Stinner89a76ab2014-04-05 11:44:04 +02009075 }
Victor Stinner33798672016-03-01 21:59:58 +01009076 else {
9077 i = 0;
9078 }
Victor Stinner89a76ab2014-04-05 11:44:04 +02009079
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009080 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009081 /* try to encode it */
Victor Stinner1194ea02014-04-04 19:37:40 +02009082 int translate;
9083 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
9084 Py_ssize_t newpos;
9085 /* startpos for collecting untranslatable chars */
9086 Py_ssize_t collstart;
9087 Py_ssize_t collend;
Victor Stinner1194ea02014-04-04 19:37:40 +02009088 Py_UCS4 ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009089
Victor Stinner1194ea02014-04-04 19:37:40 +02009090 ch = PyUnicode_READ(kind, data, i);
9091 translate = charmaptranslate_output(ch, mapping, &writer);
9092 if (translate < 0)
9093 goto onError;
9094
9095 if (translate != 0) {
9096 /* it worked => adjust input pointer */
9097 ++i;
9098 continue;
9099 }
9100
9101 /* untranslatable character */
9102 collstart = i;
9103 collend = i+1;
9104
9105 /* find all untranslatable characters */
9106 while (collend < size) {
9107 PyObject *x;
9108 ch = PyUnicode_READ(kind, data, collend);
9109 if (charmaptranslate_lookup(ch, mapping, &x))
Benjamin Peterson14339b62009-01-31 16:36:08 +00009110 goto onError;
Victor Stinner1194ea02014-04-04 19:37:40 +02009111 Py_XDECREF(x);
9112 if (x != Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00009113 break;
Victor Stinner1194ea02014-04-04 19:37:40 +02009114 ++collend;
9115 }
9116
9117 if (ignore) {
9118 i = collend;
9119 }
9120 else {
9121 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
9122 reason, input, &exc,
9123 collstart, collend, &newpos);
9124 if (repunicode == NULL)
9125 goto onError;
9126 if (_PyUnicodeWriter_WriteStr(&writer, repunicode) < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009127 Py_DECREF(repunicode);
Victor Stinner1194ea02014-04-04 19:37:40 +02009128 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009129 }
Victor Stinner1194ea02014-04-04 19:37:40 +02009130 Py_DECREF(repunicode);
9131 i = newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009132 }
9133 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009134 Py_XDECREF(exc);
9135 Py_XDECREF(errorHandler);
Victor Stinner1194ea02014-04-04 19:37:40 +02009136 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009137
Benjamin Peterson29060642009-01-31 22:14:21 +00009138 onError:
Victor Stinner1194ea02014-04-04 19:37:40 +02009139 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009140 Py_XDECREF(exc);
9141 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009142 return NULL;
9143}
9144
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009145/* Deprecated. Use PyUnicode_Translate instead. */
9146PyObject *
9147PyUnicode_TranslateCharmap(const Py_UNICODE *p,
9148 Py_ssize_t size,
9149 PyObject *mapping,
9150 const char *errors)
9151{
Christian Heimes5f520f42012-09-11 14:03:25 +02009152 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02009153 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009154 if (!unicode)
9155 return NULL;
Christian Heimes5f520f42012-09-11 14:03:25 +02009156 result = _PyUnicode_TranslateCharmap(unicode, mapping, errors);
9157 Py_DECREF(unicode);
9158 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009159}
9160
Alexander Belopolsky40018472011-02-26 01:02:56 +00009161PyObject *
9162PyUnicode_Translate(PyObject *str,
9163 PyObject *mapping,
9164 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009165{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009166 if (ensure_unicode(str) < 0)
Christian Heimes5f520f42012-09-11 14:03:25 +02009167 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009168 return _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009169}
Tim Petersced69f82003-09-16 20:30:58 +00009170
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009171PyObject *
9172_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
9173{
9174 if (!PyUnicode_Check(unicode)) {
9175 PyErr_BadInternalCall();
9176 return NULL;
9177 }
9178 if (PyUnicode_READY(unicode) == -1)
9179 return NULL;
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009180 if (PyUnicode_IS_ASCII(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009181 /* If the string is already ASCII, just return the same string */
9182 Py_INCREF(unicode);
9183 return unicode;
9184 }
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009185
9186 Py_ssize_t len = PyUnicode_GET_LENGTH(unicode);
9187 PyObject *result = PyUnicode_New(len, 127);
9188 if (result == NULL) {
9189 return NULL;
9190 }
9191
9192 Py_UCS1 *out = PyUnicode_1BYTE_DATA(result);
9193 int kind = PyUnicode_KIND(unicode);
9194 const void *data = PyUnicode_DATA(unicode);
9195 Py_ssize_t i;
9196 for (i = 0; i < len; ++i) {
9197 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9198 if (ch < 127) {
9199 out[i] = ch;
9200 }
9201 else if (Py_UNICODE_ISSPACE(ch)) {
9202 out[i] = ' ';
9203 }
9204 else {
9205 int decimal = Py_UNICODE_TODECIMAL(ch);
9206 if (decimal < 0) {
9207 out[i] = '?';
INADA Naoki16dfca42018-07-14 12:06:43 +09009208 out[i+1] = '\0';
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009209 _PyUnicode_LENGTH(result) = i + 1;
9210 break;
9211 }
9212 out[i] = '0' + decimal;
9213 }
9214 }
9215
INADA Naoki16dfca42018-07-14 12:06:43 +09009216 assert(_PyUnicode_CheckConsistency(result, 1));
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009217 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009218}
9219
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009220PyObject *
9221PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
9222 Py_ssize_t length)
9223{
Victor Stinnerf0124502011-11-21 23:12:56 +01009224 PyObject *decimal;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009225 Py_ssize_t i;
Victor Stinnerf0124502011-11-21 23:12:56 +01009226 Py_UCS4 maxchar;
9227 enum PyUnicode_Kind kind;
9228 void *data;
9229
Victor Stinner99d7ad02012-02-22 13:37:39 +01009230 maxchar = 127;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009231 for (i = 0; i < length; i++) {
Victor Stinner12174a52014-08-15 23:17:38 +02009232 Py_UCS4 ch = s[i];
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009233 if (ch > 127) {
9234 int decimal = Py_UNICODE_TODECIMAL(ch);
9235 if (decimal >= 0)
Victor Stinnerf0124502011-11-21 23:12:56 +01009236 ch = '0' + decimal;
Benjamin Peterson7e303732013-06-10 09:19:46 -07009237 maxchar = Py_MAX(maxchar, ch);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009238 }
9239 }
Victor Stinnerf0124502011-11-21 23:12:56 +01009240
9241 /* Copy to a new string */
9242 decimal = PyUnicode_New(length, maxchar);
9243 if (decimal == NULL)
9244 return decimal;
9245 kind = PyUnicode_KIND(decimal);
9246 data = PyUnicode_DATA(decimal);
9247 /* Iterate over code points */
9248 for (i = 0; i < length; i++) {
Victor Stinner12174a52014-08-15 23:17:38 +02009249 Py_UCS4 ch = s[i];
Victor Stinnerf0124502011-11-21 23:12:56 +01009250 if (ch > 127) {
9251 int decimal = Py_UNICODE_TODECIMAL(ch);
9252 if (decimal >= 0)
9253 ch = '0' + decimal;
9254 }
9255 PyUnicode_WRITE(kind, data, i, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009256 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01009257 return unicode_result(decimal);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009258}
Guido van Rossum9e896b32000-04-05 20:11:21 +00009259/* --- Decimal Encoder ---------------------------------------------------- */
9260
Alexander Belopolsky40018472011-02-26 01:02:56 +00009261int
9262PyUnicode_EncodeDecimal(Py_UNICODE *s,
9263 Py_ssize_t length,
9264 char *output,
9265 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00009266{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01009267 PyObject *unicode;
Victor Stinner6345be92011-11-25 20:09:01 +01009268 Py_ssize_t i;
Victor Stinner42bf7752011-11-21 22:52:58 +01009269 enum PyUnicode_Kind kind;
9270 void *data;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009271
9272 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009273 PyErr_BadArgument();
9274 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009275 }
9276
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02009277 unicode = PyUnicode_FromWideChar(s, length);
Victor Stinner42bf7752011-11-21 22:52:58 +01009278 if (unicode == NULL)
9279 return -1;
9280
Victor Stinner42bf7752011-11-21 22:52:58 +01009281 kind = PyUnicode_KIND(unicode);
9282 data = PyUnicode_DATA(unicode);
9283
Victor Stinnerb84d7232011-11-22 01:50:07 +01009284 for (i=0; i < length; ) {
Victor Stinner6345be92011-11-25 20:09:01 +01009285 PyObject *exc;
9286 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00009287 int decimal;
Victor Stinner6345be92011-11-25 20:09:01 +01009288 Py_ssize_t startpos;
9289
9290 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +00009291
Benjamin Peterson29060642009-01-31 22:14:21 +00009292 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009293 *output++ = ' ';
Victor Stinnerb84d7232011-11-22 01:50:07 +01009294 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009295 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009296 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009297 decimal = Py_UNICODE_TODECIMAL(ch);
9298 if (decimal >= 0) {
9299 *output++ = '0' + decimal;
Victor Stinnerb84d7232011-11-22 01:50:07 +01009300 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009301 continue;
9302 }
9303 if (0 < ch && ch < 256) {
9304 *output++ = (char)ch;
Victor Stinnerb84d7232011-11-22 01:50:07 +01009305 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009306 continue;
9307 }
Victor Stinner6345be92011-11-25 20:09:01 +01009308
Victor Stinner42bf7752011-11-21 22:52:58 +01009309 startpos = i;
Victor Stinner6345be92011-11-25 20:09:01 +01009310 exc = NULL;
9311 raise_encode_exception(&exc, "decimal", unicode,
9312 startpos, startpos+1,
9313 "invalid decimal Unicode string");
9314 Py_XDECREF(exc);
9315 Py_DECREF(unicode);
9316 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009317 }
9318 /* 0-terminate the output string */
9319 *output++ = '\0';
Victor Stinner42bf7752011-11-21 22:52:58 +01009320 Py_DECREF(unicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00009321 return 0;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009322}
9323
Guido van Rossumd57fd912000-03-10 22:53:23 +00009324/* --- Helpers ------------------------------------------------------------ */
9325
/* Helper macro that normalises slice bounds in place.
   `end` is clamped to `len`; a negative `end` or `start` has `len` added
   and is then clamped to 0.  `start` is never clamped from above.
   `start` and `end` must be assignable lvalues; all three arguments are
   evaluated more than once.  The expansion is a plain statement sequence
   (not a single statement), so it must not be used as the unbraced body of
   an if/else or loop. */
#define ADJUST_INDICES(start, end, len)         \
    if (end > len) {                            \
        end = len;                              \
    }                                           \
    else if (end < 0) {                         \
        end += len;                             \
        if (end < 0) {                          \
            end = 0;                            \
        }                                       \
    }                                           \
    if (start < 0) {                            \
        start += len;                           \
        if (start < 0) {                        \
            start = 0;                          \
        }                                       \
    }
9340
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009341static Py_ssize_t
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009342any_find_slice(PyObject* s1, PyObject* s2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009343 Py_ssize_t start,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009344 Py_ssize_t end,
9345 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009346{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009347 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009348 void *buf1, *buf2;
9349 Py_ssize_t len1, len2, result;
9350
9351 kind1 = PyUnicode_KIND(s1);
9352 kind2 = PyUnicode_KIND(s2);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009353 if (kind1 < kind2)
9354 return -1;
9355
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009356 len1 = PyUnicode_GET_LENGTH(s1);
9357 len2 = PyUnicode_GET_LENGTH(s2);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009358 ADJUST_INDICES(start, end, len1);
9359 if (end - start < len2)
9360 return -1;
9361
9362 buf1 = PyUnicode_DATA(s1);
9363 buf2 = PyUnicode_DATA(s2);
9364 if (len2 == 1) {
9365 Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
9366 result = findchar((const char *)buf1 + kind1*start,
9367 kind1, end - start, ch, direction);
9368 if (result == -1)
9369 return -1;
9370 else
9371 return start + result;
9372 }
9373
9374 if (kind2 != kind1) {
9375 buf2 = _PyUnicode_AsKind(s2, kind1);
9376 if (!buf2)
9377 return -2;
9378 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009379
Victor Stinner794d5672011-10-10 03:21:36 +02009380 if (direction > 0) {
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009381 switch (kind1) {
Victor Stinner794d5672011-10-10 03:21:36 +02009382 case PyUnicode_1BYTE_KIND:
9383 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9384 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
9385 else
9386 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
9387 break;
9388 case PyUnicode_2BYTE_KIND:
9389 result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
9390 break;
9391 case PyUnicode_4BYTE_KIND:
9392 result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
9393 break;
9394 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07009395 Py_UNREACHABLE();
Victor Stinner794d5672011-10-10 03:21:36 +02009396 }
9397 }
9398 else {
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009399 switch (kind1) {
Victor Stinner794d5672011-10-10 03:21:36 +02009400 case PyUnicode_1BYTE_KIND:
9401 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9402 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
9403 else
9404 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9405 break;
9406 case PyUnicode_2BYTE_KIND:
9407 result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9408 break;
9409 case PyUnicode_4BYTE_KIND:
9410 result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9411 break;
9412 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07009413 Py_UNREACHABLE();
Victor Stinner794d5672011-10-10 03:21:36 +02009414 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009415 }
9416
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009417 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009418 PyMem_Free(buf2);
9419
9420 return result;
9421}
9422
Victor Stinner59423e32018-11-26 13:40:01 +01009423/* _PyUnicode_InsertThousandsGrouping() helper functions */
9424#include "stringlib/localeutil.h"
9425
9426/**
9427 * InsertThousandsGrouping:
9428 * @writer: Unicode writer.
9429 * @n_buffer: Number of characters in @buffer.
9430 * @digits: Digits we're reading from. If count is non-NULL, this is unused.
9431 * @d_pos: Start of digits string.
9432 * @n_digits: The number of digits in the string, in which we want
9433 * to put the grouping chars.
9434 * @min_width: The minimum width of the digits in the output string.
9435 * Output will be zero-padded on the left to fill.
9436 * @grouping: see definition in localeconv().
9437 * @thousands_sep: see definition in localeconv().
9438 *
9439 * There are 2 modes: counting and filling. If @writer is NULL,
9440 * we are in counting mode, else filling mode.
9441 * If counting, the required buffer size is returned.
9442 * If filling, we know the buffer will be large enough, so we don't
9443 * need to pass in the buffer size.
9444 * Inserts thousand grouping characters (as defined by grouping and
9445 * thousands_sep) into @writer.
9446 *
9447 * Return value: -1 on error, number of characters otherwise.
9448 **/
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009449Py_ssize_t
Victor Stinner41a863c2012-02-24 00:37:51 +01009450_PyUnicode_InsertThousandsGrouping(
Victor Stinner59423e32018-11-26 13:40:01 +01009451 _PyUnicodeWriter *writer,
Victor Stinner41a863c2012-02-24 00:37:51 +01009452 Py_ssize_t n_buffer,
Victor Stinner59423e32018-11-26 13:40:01 +01009453 PyObject *digits,
9454 Py_ssize_t d_pos,
9455 Py_ssize_t n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009456 Py_ssize_t min_width,
Victor Stinner59423e32018-11-26 13:40:01 +01009457 const char *grouping,
9458 PyObject *thousands_sep,
Victor Stinner41a863c2012-02-24 00:37:51 +01009459 Py_UCS4 *maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009460{
Xtreak3f7983a2019-01-07 20:39:14 +05309461 min_width = Py_MAX(0, min_width);
Victor Stinner59423e32018-11-26 13:40:01 +01009462 if (writer) {
9463 assert(digits != NULL);
9464 assert(maxchar == NULL);
Victor Stinner41a863c2012-02-24 00:37:51 +01009465 }
9466 else {
Victor Stinner59423e32018-11-26 13:40:01 +01009467 assert(digits == NULL);
9468 assert(maxchar != NULL);
Victor Stinner41a863c2012-02-24 00:37:51 +01009469 }
Victor Stinner59423e32018-11-26 13:40:01 +01009470 assert(0 <= d_pos);
9471 assert(0 <= n_digits);
Victor Stinner59423e32018-11-26 13:40:01 +01009472 assert(grouping != NULL);
9473
9474 if (digits != NULL) {
9475 if (PyUnicode_READY(digits) == -1) {
9476 return -1;
Victor Stinner90f50d42012-02-24 01:44:47 +01009477 }
Victor Stinner59423e32018-11-26 13:40:01 +01009478 }
9479 if (PyUnicode_READY(thousands_sep) == -1) {
9480 return -1;
Victor Stinner41a863c2012-02-24 00:37:51 +01009481 }
9482
Victor Stinner59423e32018-11-26 13:40:01 +01009483 Py_ssize_t count = 0;
9484 Py_ssize_t n_zeros;
9485 int loop_broken = 0;
9486 int use_separator = 0; /* First time through, don't append the
9487 separator. They only go between
9488 groups. */
9489 Py_ssize_t buffer_pos;
9490 Py_ssize_t digits_pos;
9491 Py_ssize_t len;
9492 Py_ssize_t n_chars;
9493 Py_ssize_t remaining = n_digits; /* Number of chars remaining to
9494 be looked at */
9495 /* A generator that returns all of the grouping widths, until it
9496 returns 0. */
9497 GroupGenerator groupgen;
9498 GroupGenerator_init(&groupgen, grouping);
9499 const Py_ssize_t thousands_sep_len = PyUnicode_GET_LENGTH(thousands_sep);
9500
9501 /* if digits are not grouped, thousands separator
9502 should be an empty string */
9503 assert(!(grouping[0] == CHAR_MAX && thousands_sep_len != 0));
9504
9505 digits_pos = d_pos + n_digits;
9506 if (writer) {
9507 buffer_pos = writer->pos + n_buffer;
9508 assert(buffer_pos <= PyUnicode_GET_LENGTH(writer->buffer));
9509 assert(digits_pos <= PyUnicode_GET_LENGTH(digits));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009510 }
Victor Stinner59423e32018-11-26 13:40:01 +01009511 else {
9512 buffer_pos = n_buffer;
Victor Stinner90f50d42012-02-24 01:44:47 +01009513 }
Victor Stinner59423e32018-11-26 13:40:01 +01009514
9515 if (!writer) {
Victor Stinner41a863c2012-02-24 00:37:51 +01009516 *maxchar = 127;
Victor Stinner41a863c2012-02-24 00:37:51 +01009517 }
Victor Stinner59423e32018-11-26 13:40:01 +01009518
9519 while ((len = GroupGenerator_next(&groupgen)) > 0) {
9520 len = Py_MIN(len, Py_MAX(Py_MAX(remaining, min_width), 1));
9521 n_zeros = Py_MAX(0, len - remaining);
9522 n_chars = Py_MAX(0, Py_MIN(remaining, len));
9523
9524 /* Use n_zero zero's and n_chars chars */
9525
9526 /* Count only, don't do anything. */
9527 count += (use_separator ? thousands_sep_len : 0) + n_zeros + n_chars;
9528
9529 /* Copy into the writer. */
9530 InsertThousandsGrouping_fill(writer, &buffer_pos,
9531 digits, &digits_pos,
9532 n_chars, n_zeros,
9533 use_separator ? thousands_sep : NULL,
9534 thousands_sep_len, maxchar);
9535
9536 /* Use a separator next time. */
9537 use_separator = 1;
9538
9539 remaining -= n_chars;
9540 min_width -= len;
9541
9542 if (remaining <= 0 && min_width <= 0) {
9543 loop_broken = 1;
9544 break;
9545 }
9546 min_width -= thousands_sep_len;
9547 }
9548 if (!loop_broken) {
9549 /* We left the loop without using a break statement. */
9550
9551 len = Py_MAX(Py_MAX(remaining, min_width), 1);
9552 n_zeros = Py_MAX(0, len - remaining);
9553 n_chars = Py_MAX(0, Py_MIN(remaining, len));
9554
9555 /* Use n_zero zero's and n_chars chars */
9556 count += (use_separator ? thousands_sep_len : 0) + n_zeros + n_chars;
9557
9558 /* Copy into the writer. */
9559 InsertThousandsGrouping_fill(writer, &buffer_pos,
9560 digits, &digits_pos,
9561 n_chars, n_zeros,
9562 use_separator ? thousands_sep : NULL,
9563 thousands_sep_len, maxchar);
9564 }
9565 return count;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009566}
9567
9568
Alexander Belopolsky40018472011-02-26 01:02:56 +00009569Py_ssize_t
9570PyUnicode_Count(PyObject *str,
9571 PyObject *substr,
9572 Py_ssize_t start,
9573 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009574{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009575 Py_ssize_t result;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009576 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009577 void *buf1 = NULL, *buf2 = NULL;
9578 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00009579
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009580 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009581 return -1;
Tim Petersced69f82003-09-16 20:30:58 +00009582
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009583 kind1 = PyUnicode_KIND(str);
9584 kind2 = PyUnicode_KIND(substr);
9585 if (kind1 < kind2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009586 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009587
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009588 len1 = PyUnicode_GET_LENGTH(str);
9589 len2 = PyUnicode_GET_LENGTH(substr);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009590 ADJUST_INDICES(start, end, len1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009591 if (end - start < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009592 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009593
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009594 buf1 = PyUnicode_DATA(str);
9595 buf2 = PyUnicode_DATA(substr);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009596 if (kind2 != kind1) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009597 buf2 = _PyUnicode_AsKind(substr, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009598 if (!buf2)
9599 goto onError;
9600 }
9601
9602 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009603 case PyUnicode_1BYTE_KIND:
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009604 if (PyUnicode_IS_ASCII(str) && PyUnicode_IS_ASCII(substr))
Victor Stinnerc3cec782011-10-05 21:24:08 +02009605 result = asciilib_count(
9606 ((Py_UCS1*)buf1) + start, end - start,
9607 buf2, len2, PY_SSIZE_T_MAX
9608 );
9609 else
9610 result = ucs1lib_count(
9611 ((Py_UCS1*)buf1) + start, end - start,
9612 buf2, len2, PY_SSIZE_T_MAX
9613 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009614 break;
9615 case PyUnicode_2BYTE_KIND:
9616 result = ucs2lib_count(
9617 ((Py_UCS2*)buf1) + start, end - start,
9618 buf2, len2, PY_SSIZE_T_MAX
9619 );
9620 break;
9621 case PyUnicode_4BYTE_KIND:
9622 result = ucs4lib_count(
9623 ((Py_UCS4*)buf1) + start, end - start,
9624 buf2, len2, PY_SSIZE_T_MAX
9625 );
9626 break;
9627 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07009628 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009629 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009630
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009631 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009632 PyMem_Free(buf2);
9633
Guido van Rossumd57fd912000-03-10 22:53:23 +00009634 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009635 onError:
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009636 if (kind2 != kind1 && buf2)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009637 PyMem_Free(buf2);
9638 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009639}
9640
Alexander Belopolsky40018472011-02-26 01:02:56 +00009641Py_ssize_t
9642PyUnicode_Find(PyObject *str,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009643 PyObject *substr,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009644 Py_ssize_t start,
9645 Py_ssize_t end,
9646 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009647{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009648 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009649 return -2;
Tim Petersced69f82003-09-16 20:30:58 +00009650
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009651 return any_find_slice(str, substr, start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009652}
9653
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009654Py_ssize_t
9655PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
9656 Py_ssize_t start, Py_ssize_t end,
9657 int direction)
9658{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009659 int kind;
Xiang Zhangb2110682016-12-20 22:52:33 +08009660 Py_ssize_t len, result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009661 if (PyUnicode_READY(str) == -1)
9662 return -2;
Xiang Zhangb2110682016-12-20 22:52:33 +08009663 len = PyUnicode_GET_LENGTH(str);
9664 ADJUST_INDICES(start, end, len);
9665 if (end - start < 1)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009666 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009667 kind = PyUnicode_KIND(str);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009668 result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
9669 kind, end-start, ch, direction);
9670 if (result == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009671 return -1;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009672 else
9673 return start + result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009674}
9675
Alexander Belopolsky40018472011-02-26 01:02:56 +00009676static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009677tailmatch(PyObject *self,
9678 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009679 Py_ssize_t start,
9680 Py_ssize_t end,
9681 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009682{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009683 int kind_self;
9684 int kind_sub;
9685 void *data_self;
9686 void *data_sub;
9687 Py_ssize_t offset;
9688 Py_ssize_t i;
9689 Py_ssize_t end_sub;
9690
9691 if (PyUnicode_READY(self) == -1 ||
9692 PyUnicode_READY(substring) == -1)
Victor Stinner18aa4472013-01-03 03:18:09 +01009693 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009694
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009695 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
9696 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009697 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00009698 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009699
Serhiy Storchakad4ea03c2015-05-31 09:15:51 +03009700 if (PyUnicode_GET_LENGTH(substring) == 0)
9701 return 1;
9702
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009703 kind_self = PyUnicode_KIND(self);
9704 data_self = PyUnicode_DATA(self);
9705 kind_sub = PyUnicode_KIND(substring);
9706 data_sub = PyUnicode_DATA(substring);
9707 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
9708
9709 if (direction > 0)
9710 offset = end;
9711 else
9712 offset = start;
9713
9714 if (PyUnicode_READ(kind_self, data_self, offset) ==
9715 PyUnicode_READ(kind_sub, data_sub, 0) &&
9716 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
9717 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
9718 /* If both are of the same kind, memcmp is sufficient */
9719 if (kind_self == kind_sub) {
9720 return ! memcmp((char *)data_self +
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009721 (offset * PyUnicode_KIND(substring)),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009722 data_sub,
9723 PyUnicode_GET_LENGTH(substring) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009724 PyUnicode_KIND(substring));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009725 }
Martin Pantere26da7c2016-06-02 10:07:09 +00009726 /* otherwise we have to compare each character by first accessing it */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009727 else {
9728 /* We do not need to compare 0 and len(substring)-1 because
9729 the if statement above ensured already that they are equal
9730 when we end up here. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009731 for (i = 1; i < end_sub; ++i) {
9732 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
9733 PyUnicode_READ(kind_sub, data_sub, i))
9734 return 0;
9735 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009736 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009737 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009738 }
9739
9740 return 0;
9741}
9742
Alexander Belopolsky40018472011-02-26 01:02:56 +00009743Py_ssize_t
9744PyUnicode_Tailmatch(PyObject *str,
9745 PyObject *substr,
9746 Py_ssize_t start,
9747 Py_ssize_t end,
9748 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009749{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009750 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009751 return -1;
Tim Petersced69f82003-09-16 20:30:58 +00009752
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009753 return tailmatch(str, substr, start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009754}
9755
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009756static PyObject *
9757ascii_upper_or_lower(PyObject *self, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009758{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009759 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9760 char *resdata, *data = PyUnicode_DATA(self);
9761 PyObject *res;
Tim Petersced69f82003-09-16 20:30:58 +00009762
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009763 res = PyUnicode_New(len, 127);
9764 if (res == NULL)
9765 return NULL;
9766 resdata = PyUnicode_DATA(res);
9767 if (lower)
9768 _Py_bytes_lower(resdata, data, len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009769 else
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009770 _Py_bytes_upper(resdata, data, len);
9771 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009772}
9773
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009774static Py_UCS4
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009775handle_capital_sigma(int kind, void *data, Py_ssize_t length, Py_ssize_t i)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009776{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009777 Py_ssize_t j;
9778 int final_sigma;
Victor Stinner0c39b1b2015-03-18 15:02:06 +01009779 Py_UCS4 c = 0; /* initialize to prevent gcc warning */
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009780 /* U+03A3 is in the Final_Sigma context when, it is found like this:
Tim Petersced69f82003-09-16 20:30:58 +00009781
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009782 \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased})
9783
9784 where ! is a negation and \p{xxx} is a character with property xxx.
9785 */
9786 for (j = i - 1; j >= 0; j--) {
9787 c = PyUnicode_READ(kind, data, j);
9788 if (!_PyUnicode_IsCaseIgnorable(c))
9789 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009790 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009791 final_sigma = j >= 0 && _PyUnicode_IsCased(c);
9792 if (final_sigma) {
9793 for (j = i + 1; j < length; j++) {
9794 c = PyUnicode_READ(kind, data, j);
9795 if (!_PyUnicode_IsCaseIgnorable(c))
9796 break;
9797 }
9798 final_sigma = j == length || !_PyUnicode_IsCased(c);
9799 }
9800 return (final_sigma) ? 0x3C2 : 0x3C3;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009801}
9802
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009803static int
9804lower_ucs4(int kind, void *data, Py_ssize_t length, Py_ssize_t i,
9805 Py_UCS4 c, Py_UCS4 *mapped)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009806{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009807 /* Obscure special case. */
9808 if (c == 0x3A3) {
9809 mapped[0] = handle_capital_sigma(kind, data, length, i);
9810 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009811 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009812 return _PyUnicode_ToLowerFull(c, mapped);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009813}
9814
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009815static Py_ssize_t
9816do_capitalize(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009817{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009818 Py_ssize_t i, k = 0;
9819 int n_res, j;
9820 Py_UCS4 c, mapped[3];
Tim Petersced69f82003-09-16 20:30:58 +00009821
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009822 c = PyUnicode_READ(kind, data, 0);
Kingsley Mb015fc82019-04-12 16:35:39 +01009823 n_res = _PyUnicode_ToTitleFull(c, mapped);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009824 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009825 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009826 res[k++] = mapped[j];
Guido van Rossumd57fd912000-03-10 22:53:23 +00009827 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009828 for (i = 1; i < length; i++) {
9829 c = PyUnicode_READ(kind, data, i);
9830 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9831 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009832 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009833 res[k++] = mapped[j];
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009834 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009835 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009836 return k;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009837}
9838
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009839static Py_ssize_t
9840do_swapcase(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) {
9841 Py_ssize_t i, k = 0;
9842
9843 for (i = 0; i < length; i++) {
9844 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9845 int n_res, j;
9846 if (Py_UNICODE_ISUPPER(c)) {
9847 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9848 }
9849 else if (Py_UNICODE_ISLOWER(c)) {
9850 n_res = _PyUnicode_ToUpperFull(c, mapped);
9851 }
9852 else {
9853 n_res = 1;
9854 mapped[0] = c;
9855 }
9856 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009857 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009858 res[k++] = mapped[j];
9859 }
9860 }
9861 return k;
9862}
9863
9864static Py_ssize_t
9865do_upper_or_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res,
9866 Py_UCS4 *maxchar, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009867{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009868 Py_ssize_t i, k = 0;
9869
9870 for (i = 0; i < length; i++) {
9871 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9872 int n_res, j;
9873 if (lower)
9874 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9875 else
9876 n_res = _PyUnicode_ToUpperFull(c, mapped);
9877 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009878 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009879 res[k++] = mapped[j];
9880 }
9881 }
9882 return k;
9883}
9884
9885static Py_ssize_t
9886do_upper(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9887{
9888 return do_upper_or_lower(kind, data, length, res, maxchar, 0);
9889}
9890
9891static Py_ssize_t
9892do_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9893{
9894 return do_upper_or_lower(kind, data, length, res, maxchar, 1);
9895}
9896
Benjamin Petersone51757f2012-01-12 21:10:29 -05009897static Py_ssize_t
Benjamin Petersond5890c82012-01-14 13:23:30 -05009898do_casefold(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9899{
9900 Py_ssize_t i, k = 0;
9901
9902 for (i = 0; i < length; i++) {
9903 Py_UCS4 c = PyUnicode_READ(kind, data, i);
9904 Py_UCS4 mapped[3];
9905 int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
9906 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009907 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersond5890c82012-01-14 13:23:30 -05009908 res[k++] = mapped[j];
9909 }
9910 }
9911 return k;
9912}
9913
9914static Py_ssize_t
Benjamin Petersone51757f2012-01-12 21:10:29 -05009915do_title(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9916{
9917 Py_ssize_t i, k = 0;
9918 int previous_is_cased;
9919
9920 previous_is_cased = 0;
9921 for (i = 0; i < length; i++) {
9922 const Py_UCS4 c = PyUnicode_READ(kind, data, i);
9923 Py_UCS4 mapped[3];
9924 int n_res, j;
9925
9926 if (previous_is_cased)
9927 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9928 else
9929 n_res = _PyUnicode_ToTitleFull(c, mapped);
9930
9931 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009932 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersone51757f2012-01-12 21:10:29 -05009933 res[k++] = mapped[j];
9934 }
9935
9936 previous_is_cased = _PyUnicode_IsCased(c);
9937 }
9938 return k;
9939}
9940
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009941static PyObject *
9942case_operation(PyObject *self,
9943 Py_ssize_t (*perform)(int, void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *))
9944{
9945 PyObject *res = NULL;
9946 Py_ssize_t length, newlength = 0;
9947 int kind, outkind;
9948 void *data, *outdata;
9949 Py_UCS4 maxchar = 0, *tmp, *tmpend;
9950
Benjamin Petersoneea48462012-01-16 14:28:50 -05009951 assert(PyUnicode_IS_READY(self));
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009952
9953 kind = PyUnicode_KIND(self);
9954 data = PyUnicode_DATA(self);
9955 length = PyUnicode_GET_LENGTH(self);
Antoine Pitrou4e334242014-10-15 23:14:53 +02009956 if ((size_t) length > PY_SSIZE_T_MAX / (3 * sizeof(Py_UCS4))) {
Benjamin Petersone1bd38c2014-10-15 11:47:36 -04009957 PyErr_SetString(PyExc_OverflowError, "string is too long");
9958 return NULL;
9959 }
Benjamin Peterson1e211ff2014-10-15 12:17:21 -04009960 tmp = PyMem_MALLOC(sizeof(Py_UCS4) * 3 * length);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009961 if (tmp == NULL)
9962 return PyErr_NoMemory();
9963 newlength = perform(kind, data, length, tmp, &maxchar);
9964 res = PyUnicode_New(newlength, maxchar);
9965 if (res == NULL)
9966 goto leave;
9967 tmpend = tmp + newlength;
9968 outdata = PyUnicode_DATA(res);
9969 outkind = PyUnicode_KIND(res);
9970 switch (outkind) {
9971 case PyUnicode_1BYTE_KIND:
9972 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata);
9973 break;
9974 case PyUnicode_2BYTE_KIND:
9975 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata);
9976 break;
9977 case PyUnicode_4BYTE_KIND:
9978 memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength);
9979 break;
9980 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07009981 Py_UNREACHABLE();
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009982 }
9983 leave:
9984 PyMem_FREE(tmp);
9985 return res;
9986}
9987
Tim Peters8ce9f162004-08-27 01:49:32 +00009988PyObject *
9989PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009990{
Serhiy Storchakaea525a22016-09-06 22:07:53 +03009991 PyObject *res;
9992 PyObject *fseq;
9993 Py_ssize_t seqlen;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009994 PyObject **items;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009995
Benjamin Peterson9743b2c2014-02-15 13:02:52 -05009996 fseq = PySequence_Fast(seq, "can only join an iterable");
Tim Peters05eba1f2004-08-27 21:32:02 +00009997 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009998 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00009999 }
10000
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010001 /* NOTE: the following code can't call back into Python code,
10002 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +000010003 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010004
Serhiy Storchakaea525a22016-09-06 22:07:53 +030010005 items = PySequence_Fast_ITEMS(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +000010006 seqlen = PySequence_Fast_GET_SIZE(fseq);
Serhiy Storchakaea525a22016-09-06 22:07:53 +030010007 res = _PyUnicode_JoinArray(separator, items, seqlen);
10008 Py_DECREF(fseq);
10009 return res;
10010}
10011
10012PyObject *
Serhiy Storchakaa5552f02017-12-15 13:11:11 +020010013_PyUnicode_JoinArray(PyObject *separator, PyObject *const *items, Py_ssize_t seqlen)
Serhiy Storchakaea525a22016-09-06 22:07:53 +030010014{
10015 PyObject *res = NULL; /* the result */
10016 PyObject *sep = NULL;
10017 Py_ssize_t seplen;
10018 PyObject *item;
10019 Py_ssize_t sz, i, res_offset;
10020 Py_UCS4 maxchar;
10021 Py_UCS4 item_maxchar;
10022 int use_memcpy;
10023 unsigned char *res_data = NULL, *sep_data = NULL;
10024 PyObject *last_obj;
10025 unsigned int kind = 0;
10026
Tim Peters05eba1f2004-08-27 21:32:02 +000010027 /* If empty sequence, return u"". */
10028 if (seqlen == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020010029 _Py_RETURN_UNICODE_EMPTY();
Tim Peters05eba1f2004-08-27 21:32:02 +000010030 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020010031
Tim Peters05eba1f2004-08-27 21:32:02 +000010032 /* If singleton sequence with an exact Unicode, return that. */
Victor Stinnerdd077322011-10-07 17:02:31 +020010033 last_obj = NULL;
Victor Stinneracf47b82011-10-06 12:32:37 +020010034 if (seqlen == 1) {
10035 if (PyUnicode_CheckExact(items[0])) {
10036 res = items[0];
10037 Py_INCREF(res);
Victor Stinneracf47b82011-10-06 12:32:37 +020010038 return res;
10039 }
Victor Stinnerdd077322011-10-07 17:02:31 +020010040 seplen = 0;
Victor Stinnerc6f0df72011-10-06 15:58:54 +020010041 maxchar = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +000010042 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010043 else {
Victor Stinneracf47b82011-10-06 12:32:37 +020010044 /* Set up sep and seplen */
10045 if (separator == NULL) {
10046 /* fall back to a blank space separator */
10047 sep = PyUnicode_FromOrdinal(' ');
10048 if (!sep)
10049 goto onError;
Victor Stinnerdd077322011-10-07 17:02:31 +020010050 seplen = 1;
Victor Stinneracf47b82011-10-06 12:32:37 +020010051 maxchar = 32;
Tim Peters05eba1f2004-08-27 21:32:02 +000010052 }
Victor Stinneracf47b82011-10-06 12:32:37 +020010053 else {
10054 if (!PyUnicode_Check(separator)) {
10055 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020010056 "separator: expected str instance,"
10057 " %.80s found",
10058 Py_TYPE(separator)->tp_name);
Victor Stinneracf47b82011-10-06 12:32:37 +020010059 goto onError;
10060 }
10061 if (PyUnicode_READY(separator))
10062 goto onError;
10063 sep = separator;
10064 seplen = PyUnicode_GET_LENGTH(separator);
10065 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
10066 /* inc refcount to keep this code path symmetric with the
10067 above case of a blank separator */
10068 Py_INCREF(sep);
10069 }
Victor Stinnerdd077322011-10-07 17:02:31 +020010070 last_obj = sep;
Tim Peters05eba1f2004-08-27 21:32:02 +000010071 }
10072
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010073 /* There are at least two things to join, or else we have a subclass
10074 * of str in the sequence.
10075 * Do a pre-pass to figure out the total amount of space we'll
10076 * need (sz), and see whether all argument are strings.
10077 */
10078 sz = 0;
Victor Stinnerdd077322011-10-07 17:02:31 +020010079#ifdef Py_DEBUG
10080 use_memcpy = 0;
10081#else
10082 use_memcpy = 1;
10083#endif
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010084 for (i = 0; i < seqlen; i++) {
Xiang Zhangb0541f42017-01-10 10:52:00 +080010085 size_t add_sz;
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010086 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +000010087 if (!PyUnicode_Check(item)) {
10088 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020010089 "sequence item %zd: expected str instance,"
10090 " %.80s found",
10091 i, Py_TYPE(item)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000010092 goto onError;
10093 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010094 if (PyUnicode_READY(item) == -1)
10095 goto onError;
Xiang Zhangb0541f42017-01-10 10:52:00 +080010096 add_sz = PyUnicode_GET_LENGTH(item);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010097 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010098 maxchar = Py_MAX(maxchar, item_maxchar);
Xiang Zhangb0541f42017-01-10 10:52:00 +080010099 if (i != 0) {
10100 add_sz += seplen;
10101 }
10102 if (add_sz > (size_t)(PY_SSIZE_T_MAX - sz)) {
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010103 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010104 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010105 goto onError;
10106 }
Xiang Zhangb0541f42017-01-10 10:52:00 +080010107 sz += add_sz;
Victor Stinnerdd077322011-10-07 17:02:31 +020010108 if (use_memcpy && last_obj != NULL) {
10109 if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
10110 use_memcpy = 0;
10111 }
10112 last_obj = item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010113 }
Tim Petersced69f82003-09-16 20:30:58 +000010114
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010115 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010116 if (res == NULL)
10117 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +000010118
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010119 /* Catenate everything. */
Victor Stinnerdd077322011-10-07 17:02:31 +020010120#ifdef Py_DEBUG
10121 use_memcpy = 0;
10122#else
10123 if (use_memcpy) {
10124 res_data = PyUnicode_1BYTE_DATA(res);
10125 kind = PyUnicode_KIND(res);
10126 if (seplen != 0)
10127 sep_data = PyUnicode_1BYTE_DATA(sep);
10128 }
10129#endif
Victor Stinner4560f9c2013-04-14 18:56:46 +020010130 if (use_memcpy) {
10131 for (i = 0; i < seqlen; ++i) {
10132 Py_ssize_t itemlen;
10133 item = items[i];
10134
10135 /* Copy item, and maybe the separator. */
10136 if (i && seplen != 0) {
Christian Heimesf051e432016-09-13 20:22:02 +020010137 memcpy(res_data,
Victor Stinnerdd077322011-10-07 17:02:31 +020010138 sep_data,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010139 kind * seplen);
10140 res_data += kind * seplen;
Victor Stinnerdd077322011-10-07 17:02:31 +020010141 }
Victor Stinner4560f9c2013-04-14 18:56:46 +020010142
10143 itemlen = PyUnicode_GET_LENGTH(item);
10144 if (itemlen != 0) {
Christian Heimesf051e432016-09-13 20:22:02 +020010145 memcpy(res_data,
Victor Stinnerdd077322011-10-07 17:02:31 +020010146 PyUnicode_DATA(item),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010147 kind * itemlen);
10148 res_data += kind * itemlen;
Victor Stinnerdd077322011-10-07 17:02:31 +020010149 }
Victor Stinner4560f9c2013-04-14 18:56:46 +020010150 }
10151 assert(res_data == PyUnicode_1BYTE_DATA(res)
10152 + kind * PyUnicode_GET_LENGTH(res));
10153 }
10154 else {
10155 for (i = 0, res_offset = 0; i < seqlen; ++i) {
10156 Py_ssize_t itemlen;
10157 item = items[i];
10158
10159 /* Copy item, and maybe the separator. */
10160 if (i && seplen != 0) {
10161 _PyUnicode_FastCopyCharacters(res, res_offset, sep, 0, seplen);
10162 res_offset += seplen;
10163 }
10164
10165 itemlen = PyUnicode_GET_LENGTH(item);
10166 if (itemlen != 0) {
Victor Stinnerd3f08822012-05-29 12:57:52 +020010167 _PyUnicode_FastCopyCharacters(res, res_offset, item, 0, itemlen);
Victor Stinnerdd077322011-10-07 17:02:31 +020010168 res_offset += itemlen;
10169 }
Victor Stinner9ce5a832011-10-03 23:36:02 +020010170 }
Victor Stinnerdd077322011-10-07 17:02:31 +020010171 assert(res_offset == PyUnicode_GET_LENGTH(res));
Victor Stinner4560f9c2013-04-14 18:56:46 +020010172 }
Tim Peters8ce9f162004-08-27 01:49:32 +000010173
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010174 Py_XDECREF(sep);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010175 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010176 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010177
Benjamin Peterson29060642009-01-31 22:14:21 +000010178 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010179 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +000010180 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010181 return NULL;
10182}
10183
Victor Stinnerd3f08822012-05-29 12:57:52 +020010184void
10185_PyUnicode_FastFill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10186 Py_UCS4 fill_char)
10187{
10188 const enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
Victor Stinner163403a2018-11-27 12:41:17 +010010189 void *data = PyUnicode_DATA(unicode);
Victor Stinnerd3f08822012-05-29 12:57:52 +020010190 assert(PyUnicode_IS_READY(unicode));
10191 assert(unicode_modifiable(unicode));
10192 assert(fill_char <= PyUnicode_MAX_CHAR_VALUE(unicode));
10193 assert(start >= 0);
10194 assert(start + length <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner59423e32018-11-26 13:40:01 +010010195 unicode_fill(kind, data, fill_char, start, length);
Victor Stinnerd3f08822012-05-29 12:57:52 +020010196}
10197
Victor Stinner3fe55312012-01-04 00:33:50 +010010198Py_ssize_t
10199PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10200 Py_UCS4 fill_char)
10201{
10202 Py_ssize_t maxlen;
Victor Stinner3fe55312012-01-04 00:33:50 +010010203
10204 if (!PyUnicode_Check(unicode)) {
10205 PyErr_BadInternalCall();
10206 return -1;
10207 }
10208 if (PyUnicode_READY(unicode) == -1)
10209 return -1;
10210 if (unicode_check_modifiable(unicode))
10211 return -1;
10212
Victor Stinnerd3f08822012-05-29 12:57:52 +020010213 if (start < 0) {
10214 PyErr_SetString(PyExc_IndexError, "string index out of range");
10215 return -1;
10216 }
Victor Stinner3fe55312012-01-04 00:33:50 +010010217 if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) {
10218 PyErr_SetString(PyExc_ValueError,
10219 "fill character is bigger than "
10220 "the string maximum character");
10221 return -1;
10222 }
10223
10224 maxlen = PyUnicode_GET_LENGTH(unicode) - start;
10225 length = Py_MIN(maxlen, length);
10226 if (length <= 0)
10227 return 0;
10228
Victor Stinnerd3f08822012-05-29 12:57:52 +020010229 _PyUnicode_FastFill(unicode, start, length, fill_char);
Victor Stinner3fe55312012-01-04 00:33:50 +010010230 return length;
10231}
10232
Victor Stinner9310abb2011-10-05 00:59:23 +020010233static PyObject *
10234pad(PyObject *self,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010235 Py_ssize_t left,
10236 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010237 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010238{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010239 PyObject *u;
10240 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010241 int kind;
10242 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010243
10244 if (left < 0)
10245 left = 0;
10246 if (right < 0)
10247 right = 0;
10248
Victor Stinnerc4b49542011-12-11 22:44:26 +010010249 if (left == 0 && right == 0)
10250 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010251
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010252 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
10253 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +000010254 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
10255 return NULL;
10256 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010257 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010258 maxchar = Py_MAX(maxchar, fill);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010259 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010260 if (!u)
10261 return NULL;
10262
10263 kind = PyUnicode_KIND(u);
10264 data = PyUnicode_DATA(u);
10265 if (left)
Victor Stinner59423e32018-11-26 13:40:01 +010010266 unicode_fill(kind, data, fill, 0, left);
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010267 if (right)
Victor Stinner59423e32018-11-26 13:40:01 +010010268 unicode_fill(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinnerd3f08822012-05-29 12:57:52 +020010269 _PyUnicode_FastCopyCharacters(u, left, self, 0, _PyUnicode_LENGTH(self));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010270 assert(_PyUnicode_CheckConsistency(u, 1));
10271 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010272}
10273
Alexander Belopolsky40018472011-02-26 01:02:56 +000010274PyObject *
10275PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010276{
Guido van Rossumd57fd912000-03-10 22:53:23 +000010277 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010278
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010279 if (ensure_unicode(string) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010280 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010281
Benjamin Petersonead6b532011-12-20 17:23:42 -060010282 switch (PyUnicode_KIND(string)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010283 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010284 if (PyUnicode_IS_ASCII(string))
10285 list = asciilib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010286 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010287 PyUnicode_GET_LENGTH(string), keepends);
10288 else
10289 list = ucs1lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010290 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010291 PyUnicode_GET_LENGTH(string), keepends);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010292 break;
10293 case PyUnicode_2BYTE_KIND:
10294 list = ucs2lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010295 string, PyUnicode_2BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010296 PyUnicode_GET_LENGTH(string), keepends);
10297 break;
10298 case PyUnicode_4BYTE_KIND:
10299 list = ucs4lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010300 string, PyUnicode_4BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010301 PyUnicode_GET_LENGTH(string), keepends);
10302 break;
10303 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010304 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010305 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010306 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010307}
10308
Alexander Belopolsky40018472011-02-26 01:02:56 +000010309static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010310split(PyObject *self,
10311 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010312 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010313{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010314 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010315 void *buf1, *buf2;
10316 Py_ssize_t len1, len2;
10317 PyObject* out;
10318
Guido van Rossumd57fd912000-03-10 22:53:23 +000010319 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010320 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010321
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010322 if (PyUnicode_READY(self) == -1)
10323 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010324
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010325 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010326 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010327 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010328 if (PyUnicode_IS_ASCII(self))
10329 return asciilib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010330 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010331 PyUnicode_GET_LENGTH(self), maxcount
10332 );
10333 else
10334 return ucs1lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010335 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010336 PyUnicode_GET_LENGTH(self), maxcount
10337 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010338 case PyUnicode_2BYTE_KIND:
10339 return ucs2lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010340 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010341 PyUnicode_GET_LENGTH(self), maxcount
10342 );
10343 case PyUnicode_4BYTE_KIND:
10344 return ucs4lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010345 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010346 PyUnicode_GET_LENGTH(self), maxcount
10347 );
10348 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010349 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010350 }
10351
10352 if (PyUnicode_READY(substring) == -1)
10353 return NULL;
10354
10355 kind1 = PyUnicode_KIND(self);
10356 kind2 = PyUnicode_KIND(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010357 len1 = PyUnicode_GET_LENGTH(self);
10358 len2 = PyUnicode_GET_LENGTH(substring);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010359 if (kind1 < kind2 || len1 < len2) {
10360 out = PyList_New(1);
10361 if (out == NULL)
10362 return NULL;
10363 Py_INCREF(self);
10364 PyList_SET_ITEM(out, 0, self);
10365 return out;
10366 }
10367 buf1 = PyUnicode_DATA(self);
10368 buf2 = PyUnicode_DATA(substring);
10369 if (kind2 != kind1) {
10370 buf2 = _PyUnicode_AsKind(substring, kind1);
10371 if (!buf2)
10372 return NULL;
10373 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010374
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010375 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010376 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010377 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10378 out = asciilib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010379 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010380 else
10381 out = ucs1lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010382 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010383 break;
10384 case PyUnicode_2BYTE_KIND:
10385 out = ucs2lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010386 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010387 break;
10388 case PyUnicode_4BYTE_KIND:
10389 out = ucs4lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010390 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010391 break;
10392 default:
10393 out = NULL;
10394 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010395 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010396 PyMem_Free(buf2);
10397 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010398}
10399
Alexander Belopolsky40018472011-02-26 01:02:56 +000010400static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010401rsplit(PyObject *self,
10402 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010403 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010404{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010405 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010406 void *buf1, *buf2;
10407 Py_ssize_t len1, len2;
10408 PyObject* out;
10409
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010410 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010411 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010412
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010413 if (PyUnicode_READY(self) == -1)
10414 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010415
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010416 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010417 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010418 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010419 if (PyUnicode_IS_ASCII(self))
10420 return asciilib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010421 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010422 PyUnicode_GET_LENGTH(self), maxcount
10423 );
10424 else
10425 return ucs1lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010426 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010427 PyUnicode_GET_LENGTH(self), maxcount
10428 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010429 case PyUnicode_2BYTE_KIND:
10430 return ucs2lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010431 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010432 PyUnicode_GET_LENGTH(self), maxcount
10433 );
10434 case PyUnicode_4BYTE_KIND:
10435 return ucs4lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010436 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010437 PyUnicode_GET_LENGTH(self), maxcount
10438 );
10439 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010440 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010441 }
10442
10443 if (PyUnicode_READY(substring) == -1)
10444 return NULL;
10445
10446 kind1 = PyUnicode_KIND(self);
10447 kind2 = PyUnicode_KIND(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010448 len1 = PyUnicode_GET_LENGTH(self);
10449 len2 = PyUnicode_GET_LENGTH(substring);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010450 if (kind1 < kind2 || len1 < len2) {
10451 out = PyList_New(1);
10452 if (out == NULL)
10453 return NULL;
10454 Py_INCREF(self);
10455 PyList_SET_ITEM(out, 0, self);
10456 return out;
10457 }
10458 buf1 = PyUnicode_DATA(self);
10459 buf2 = PyUnicode_DATA(substring);
10460 if (kind2 != kind1) {
10461 buf2 = _PyUnicode_AsKind(substring, kind1);
10462 if (!buf2)
10463 return NULL;
10464 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010465
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010466 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010467 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010468 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10469 out = asciilib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010470 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010471 else
10472 out = ucs1lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010473 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010474 break;
10475 case PyUnicode_2BYTE_KIND:
10476 out = ucs2lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010477 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010478 break;
10479 case PyUnicode_4BYTE_KIND:
10480 out = ucs4lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010481 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010482 break;
10483 default:
10484 out = NULL;
10485 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010486 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010487 PyMem_Free(buf2);
10488 return out;
10489}
10490
10491static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010492anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1,
10493 PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010494{
Benjamin Petersonead6b532011-12-20 17:23:42 -060010495 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010496 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010497 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
10498 return asciilib_find(buf1, len1, buf2, len2, offset);
10499 else
10500 return ucs1lib_find(buf1, len1, buf2, len2, offset);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010501 case PyUnicode_2BYTE_KIND:
10502 return ucs2lib_find(buf1, len1, buf2, len2, offset);
10503 case PyUnicode_4BYTE_KIND:
10504 return ucs4lib_find(buf1, len1, buf2, len2, offset);
10505 }
Barry Warsawb2e57942017-09-14 18:13:16 -070010506 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010507}
10508
10509static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010510anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen,
10511 PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010512{
Benjamin Petersonc0b95d12011-12-20 17:24:05 -060010513 switch (kind) {
10514 case PyUnicode_1BYTE_KIND:
10515 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
10516 return asciilib_count(sbuf, slen, buf1, len1, maxcount);
10517 else
10518 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
10519 case PyUnicode_2BYTE_KIND:
10520 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
10521 case PyUnicode_4BYTE_KIND:
10522 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
10523 }
Barry Warsawb2e57942017-09-14 18:13:16 -070010524 Py_UNREACHABLE();
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010525}
10526
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010527static void
10528replace_1char_inplace(PyObject *u, Py_ssize_t pos,
10529 Py_UCS4 u1, Py_UCS4 u2, Py_ssize_t maxcount)
10530{
10531 int kind = PyUnicode_KIND(u);
10532 void *data = PyUnicode_DATA(u);
10533 Py_ssize_t len = PyUnicode_GET_LENGTH(u);
10534 if (kind == PyUnicode_1BYTE_KIND) {
10535 ucs1lib_replace_1char_inplace((Py_UCS1 *)data + pos,
10536 (Py_UCS1 *)data + len,
10537 u1, u2, maxcount);
10538 }
10539 else if (kind == PyUnicode_2BYTE_KIND) {
10540 ucs2lib_replace_1char_inplace((Py_UCS2 *)data + pos,
10541 (Py_UCS2 *)data + len,
10542 u1, u2, maxcount);
10543 }
10544 else {
10545 assert(kind == PyUnicode_4BYTE_KIND);
10546 ucs4lib_replace_1char_inplace((Py_UCS4 *)data + pos,
10547 (Py_UCS4 *)data + len,
10548 u1, u2, maxcount);
10549 }
10550}
10551
Alexander Belopolsky40018472011-02-26 01:02:56 +000010552static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010553replace(PyObject *self, PyObject *str1,
10554 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010555{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010556 PyObject *u;
10557 char *sbuf = PyUnicode_DATA(self);
10558 char *buf1 = PyUnicode_DATA(str1);
10559 char *buf2 = PyUnicode_DATA(str2);
10560 int srelease = 0, release1 = 0, release2 = 0;
10561 int skind = PyUnicode_KIND(self);
10562 int kind1 = PyUnicode_KIND(str1);
10563 int kind2 = PyUnicode_KIND(str2);
10564 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
10565 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
10566 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010567 int mayshrink;
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010568 Py_UCS4 maxchar, maxchar_str1, maxchar_str2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010569
Serhiy Storchaka865c3b22019-10-30 12:03:53 +020010570 if (slen < len1)
10571 goto nothing;
10572
Guido van Rossumd57fd912000-03-10 22:53:23 +000010573 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010574 maxcount = PY_SSIZE_T_MAX;
Serhiy Storchaka865c3b22019-10-30 12:03:53 +020010575 else if (maxcount == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010576 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010577
Victor Stinner59de0ee2011-10-07 10:01:28 +020010578 if (str1 == str2)
10579 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010580
Victor Stinner49a0a212011-10-12 23:46:10 +020010581 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010582 maxchar_str1 = PyUnicode_MAX_CHAR_VALUE(str1);
10583 if (maxchar < maxchar_str1)
10584 /* substring too wide to be present */
10585 goto nothing;
Victor Stinner49a0a212011-10-12 23:46:10 +020010586 maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
10587 /* Replacing str1 with str2 may cause a maxchar reduction in the
10588 result string. */
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010589 mayshrink = (maxchar_str2 < maxchar_str1) && (maxchar == maxchar_str1);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010590 maxchar = Py_MAX(maxchar, maxchar_str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010591
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010592 if (len1 == len2) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010593 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010594 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010595 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010596 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010597 /* replace characters */
Victor Stinner49a0a212011-10-12 23:46:10 +020010598 Py_UCS4 u1, u2;
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010599 Py_ssize_t pos;
Victor Stinnerf6441102011-12-18 02:43:08 +010010600
Victor Stinner69ed0f42013-04-09 21:48:24 +020010601 u1 = PyUnicode_READ(kind1, buf1, 0);
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010602 pos = findchar(sbuf, skind, slen, u1, 1);
Victor Stinnerf6441102011-12-18 02:43:08 +010010603 if (pos < 0)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010604 goto nothing;
Victor Stinner69ed0f42013-04-09 21:48:24 +020010605 u2 = PyUnicode_READ(kind2, buf2, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010606 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010607 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010608 goto error;
Victor Stinnerf6441102011-12-18 02:43:08 +010010609
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010610 _PyUnicode_FastCopyCharacters(u, 0, self, 0, slen);
10611 replace_1char_inplace(u, pos, u1, u2, maxcount);
Victor Stinner49a0a212011-10-12 23:46:10 +020010612 }
10613 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010614 int rkind = skind;
10615 char *res;
Victor Stinnerf6441102011-12-18 02:43:08 +010010616 Py_ssize_t i;
Victor Stinner25a4b292011-10-06 12:31:55 +020010617
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010618 if (kind1 < rkind) {
10619 /* widen substring */
10620 buf1 = _PyUnicode_AsKind(str1, rkind);
10621 if (!buf1) goto error;
10622 release1 = 1;
10623 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010624 i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010625 if (i < 0)
10626 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010627 if (rkind > kind2) {
10628 /* widen replacement */
10629 buf2 = _PyUnicode_AsKind(str2, rkind);
10630 if (!buf2) goto error;
10631 release2 = 1;
10632 }
10633 else if (rkind < kind2) {
10634 /* widen self and buf1 */
10635 rkind = kind2;
10636 if (release1) PyMem_Free(buf1);
Antoine Pitrou6d5ad222012-11-17 23:28:17 +010010637 release1 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010638 sbuf = _PyUnicode_AsKind(self, rkind);
10639 if (!sbuf) goto error;
10640 srelease = 1;
10641 buf1 = _PyUnicode_AsKind(str1, rkind);
10642 if (!buf1) goto error;
10643 release1 = 1;
10644 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010645 u = PyUnicode_New(slen, maxchar);
10646 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010647 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010648 assert(PyUnicode_KIND(u) == rkind);
10649 res = PyUnicode_DATA(u);
Victor Stinner25a4b292011-10-06 12:31:55 +020010650
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010651 memcpy(res, sbuf, rkind * slen);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010652 /* change everything in-place, starting with this one */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010653 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010654 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010655 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010656 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010657
10658 while ( --maxcount > 0) {
Victor Stinnerc3cec782011-10-05 21:24:08 +020010659 i = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010660 sbuf+rkind*i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010661 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010662 if (i == -1)
10663 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010664 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010665 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010666 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010667 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010668 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010669 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010670 }
10671 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010672 Py_ssize_t n, i, j, ires;
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010673 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010674 int rkind = skind;
10675 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010676
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010677 if (kind1 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010678 /* widen substring */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010679 buf1 = _PyUnicode_AsKind(str1, rkind);
10680 if (!buf1) goto error;
10681 release1 = 1;
10682 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010683 n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010684 if (n == 0)
10685 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010686 if (kind2 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010687 /* widen replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010688 buf2 = _PyUnicode_AsKind(str2, rkind);
10689 if (!buf2) goto error;
10690 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010691 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010692 else if (kind2 > rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010693 /* widen self and buf1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010694 rkind = kind2;
10695 sbuf = _PyUnicode_AsKind(self, rkind);
10696 if (!sbuf) goto error;
10697 srelease = 1;
10698 if (release1) PyMem_Free(buf1);
Antoine Pitrou6d5ad222012-11-17 23:28:17 +010010699 release1 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010700 buf1 = _PyUnicode_AsKind(str1, rkind);
10701 if (!buf1) goto error;
10702 release1 = 1;
10703 }
10704 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10705 PyUnicode_GET_LENGTH(str1))); */
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010706 if (len1 < len2 && len2 - len1 > (PY_SSIZE_T_MAX - slen) / n) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010707 PyErr_SetString(PyExc_OverflowError,
10708 "replace string is too long");
10709 goto error;
10710 }
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010711 new_size = slen + n * (len2 - len1);
Victor Stinner49a0a212011-10-12 23:46:10 +020010712 if (new_size == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020010713 _Py_INCREF_UNICODE_EMPTY();
10714 if (!unicode_empty)
10715 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010716 u = unicode_empty;
10717 goto done;
10718 }
Xiang Zhangb0541f42017-01-10 10:52:00 +080010719 if (new_size > (PY_SSIZE_T_MAX / rkind)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010720 PyErr_SetString(PyExc_OverflowError,
10721 "replace string is too long");
10722 goto error;
10723 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010724 u = PyUnicode_New(new_size, maxchar);
10725 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010726 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010727 assert(PyUnicode_KIND(u) == rkind);
10728 res = PyUnicode_DATA(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010729 ires = i = 0;
10730 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010731 while (n-- > 0) {
10732 /* look for next match */
Victor Stinnerc3cec782011-10-05 21:24:08 +020010733 j = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010734 sbuf + rkind * i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010735 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010736 if (j == -1)
10737 break;
10738 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010739 /* copy unchanged part [i:j] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010740 memcpy(res + rkind * ires,
10741 sbuf + rkind * i,
10742 rkind * (j-i));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010743 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010744 }
10745 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010746 if (len2 > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010747 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010748 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010749 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010750 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010751 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010752 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010753 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010754 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010755 /* copy tail [i:] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010756 memcpy(res + rkind * ires,
10757 sbuf + rkind * i,
10758 rkind * (slen-i));
Victor Stinner49a0a212011-10-12 23:46:10 +020010759 }
10760 else {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010761 /* interleave */
10762 while (n > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010763 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010764 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010765 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010766 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010767 if (--n <= 0)
10768 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010769 memcpy(res + rkind * ires,
10770 sbuf + rkind * i,
10771 rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010772 ires++;
10773 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010774 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010775 memcpy(res + rkind * ires,
10776 sbuf + rkind * i,
10777 rkind * (slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +000010778 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010779 }
10780
10781 if (mayshrink) {
Victor Stinner25a4b292011-10-06 12:31:55 +020010782 unicode_adjust_maxchar(&u);
10783 if (u == NULL)
10784 goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010785 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010786
10787 done:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010788 if (srelease)
10789 PyMem_FREE(sbuf);
10790 if (release1)
10791 PyMem_FREE(buf1);
10792 if (release2)
10793 PyMem_FREE(buf2);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010794 assert(_PyUnicode_CheckConsistency(u, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010795 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010796
Benjamin Peterson29060642009-01-31 22:14:21 +000010797 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +000010798 /* nothing to replace; return original string (when possible) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010799 if (srelease)
10800 PyMem_FREE(sbuf);
10801 if (release1)
10802 PyMem_FREE(buf1);
10803 if (release2)
10804 PyMem_FREE(buf2);
Victor Stinnerc4b49542011-12-11 22:44:26 +010010805 return unicode_result_unchanged(self);
10806
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010807 error:
10808 if (srelease && sbuf)
10809 PyMem_FREE(sbuf);
10810 if (release1 && buf1)
10811 PyMem_FREE(buf1);
10812 if (release2 && buf2)
10813 PyMem_FREE(buf2);
10814 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010815}
10816
10817/* --- Unicode Object Methods --------------------------------------------- */
10818
INADA Naoki3ae20562017-01-16 20:41:20 +090010819/*[clinic input]
10820str.title as unicode_title
Guido van Rossumd57fd912000-03-10 22:53:23 +000010821
INADA Naoki3ae20562017-01-16 20:41:20 +090010822Return a version of the string where each word is titlecased.
10823
10824More specifically, words start with uppercased characters and all remaining
10825cased characters have lower case.
10826[clinic start generated code]*/
10827
10828static PyObject *
10829unicode_title_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090010830/*[clinic end generated code: output=c75ae03809574902 input=fa945d669b26e683]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000010831{
Benjamin Petersoneea48462012-01-16 14:28:50 -050010832 if (PyUnicode_READY(self) == -1)
10833 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010834 return case_operation(self, do_title);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010835}
10836
INADA Naoki3ae20562017-01-16 20:41:20 +090010837/*[clinic input]
10838str.capitalize as unicode_capitalize
Guido van Rossumd57fd912000-03-10 22:53:23 +000010839
INADA Naoki3ae20562017-01-16 20:41:20 +090010840Return a capitalized version of the string.
10841
10842More specifically, make the first character have upper case and the rest lower
10843case.
10844[clinic start generated code]*/
10845
10846static PyObject *
10847unicode_capitalize_impl(PyObject *self)
10848/*[clinic end generated code: output=e49a4c333cdb7667 input=f4cbf1016938da6d]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000010849{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010850 if (PyUnicode_READY(self) == -1)
10851 return NULL;
10852 if (PyUnicode_GET_LENGTH(self) == 0)
10853 return unicode_result_unchanged(self);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010854 return case_operation(self, do_capitalize);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010855}
10856
INADA Naoki3ae20562017-01-16 20:41:20 +090010857/*[clinic input]
10858str.casefold as unicode_casefold
10859
10860Return a version of the string suitable for caseless comparisons.
10861[clinic start generated code]*/
Benjamin Petersond5890c82012-01-14 13:23:30 -050010862
10863static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090010864unicode_casefold_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090010865/*[clinic end generated code: output=0120daf657ca40af input=384d66cc2ae30daf]*/
Benjamin Petersond5890c82012-01-14 13:23:30 -050010866{
10867 if (PyUnicode_READY(self) == -1)
10868 return NULL;
10869 if (PyUnicode_IS_ASCII(self))
10870 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010871 return case_operation(self, do_casefold);
Benjamin Petersond5890c82012-01-14 13:23:30 -050010872}
10873
10874
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010875/* Argument converter. Accepts a single Unicode character. */
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010876
10877static int
10878convert_uc(PyObject *obj, void *addr)
10879{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010880 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010881
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010882 if (!PyUnicode_Check(obj)) {
10883 PyErr_Format(PyExc_TypeError,
10884 "The fill character must be a unicode character, "
Victor Stinner998b8062018-09-12 00:23:25 +020010885 "not %.100s", Py_TYPE(obj)->tp_name);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010886 return 0;
10887 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010888 if (PyUnicode_READY(obj) < 0)
10889 return 0;
10890 if (PyUnicode_GET_LENGTH(obj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010891 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010892 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010893 return 0;
10894 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010895 *fillcharloc = PyUnicode_READ_CHAR(obj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010896 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010897}
10898
INADA Naoki3ae20562017-01-16 20:41:20 +090010899/*[clinic input]
10900str.center as unicode_center
10901
10902 width: Py_ssize_t
10903 fillchar: Py_UCS4 = ' '
10904 /
10905
10906Return a centered string of length width.
10907
10908Padding is done using the specified fill character (default is a space).
10909[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000010910
10911static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090010912unicode_center_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
10913/*[clinic end generated code: output=420c8859effc7c0c input=b42b247eb26e6519]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000010914{
Martin v. Löwis18e16552006-02-15 17:27:45 +000010915 Py_ssize_t marg, left;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010916
Benjamin Petersonbac79492012-01-14 13:34:47 -050010917 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010918 return NULL;
10919
Victor Stinnerc4b49542011-12-11 22:44:26 +010010920 if (PyUnicode_GET_LENGTH(self) >= width)
10921 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010922
Victor Stinnerc4b49542011-12-11 22:44:26 +010010923 marg = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010924 left = marg / 2 + (marg & width & 1);
10925
Victor Stinner9310abb2011-10-05 00:59:23 +020010926 return pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010927}
10928
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010929/* This function assumes that str1 and str2 are readied by the caller. */
10930
Marc-André Lemburge5034372000-08-08 08:04:29 +000010931static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010932unicode_compare(PyObject *str1, PyObject *str2)
Marc-André Lemburge5034372000-08-08 08:04:29 +000010933{
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010934#define COMPARE(TYPE1, TYPE2) \
10935 do { \
10936 TYPE1* p1 = (TYPE1 *)data1; \
10937 TYPE2* p2 = (TYPE2 *)data2; \
10938 TYPE1* end = p1 + len; \
10939 Py_UCS4 c1, c2; \
10940 for (; p1 != end; p1++, p2++) { \
10941 c1 = *p1; \
10942 c2 = *p2; \
10943 if (c1 != c2) \
10944 return (c1 < c2) ? -1 : 1; \
10945 } \
10946 } \
10947 while (0)
10948
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010949 int kind1, kind2;
10950 void *data1, *data2;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010951 Py_ssize_t len1, len2, len;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010952
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010953 kind1 = PyUnicode_KIND(str1);
10954 kind2 = PyUnicode_KIND(str2);
10955 data1 = PyUnicode_DATA(str1);
10956 data2 = PyUnicode_DATA(str2);
10957 len1 = PyUnicode_GET_LENGTH(str1);
10958 len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner770e19e2012-10-04 22:59:45 +020010959 len = Py_MIN(len1, len2);
Marc-André Lemburge5034372000-08-08 08:04:29 +000010960
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010961 switch(kind1) {
10962 case PyUnicode_1BYTE_KIND:
10963 {
10964 switch(kind2) {
10965 case PyUnicode_1BYTE_KIND:
10966 {
10967 int cmp = memcmp(data1, data2, len);
10968 /* normalize result of memcmp() into the range [-1; 1] */
10969 if (cmp < 0)
10970 return -1;
10971 if (cmp > 0)
10972 return 1;
10973 break;
Victor Stinner770e19e2012-10-04 22:59:45 +020010974 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010975 case PyUnicode_2BYTE_KIND:
10976 COMPARE(Py_UCS1, Py_UCS2);
10977 break;
10978 case PyUnicode_4BYTE_KIND:
10979 COMPARE(Py_UCS1, Py_UCS4);
10980 break;
10981 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010982 Py_UNREACHABLE();
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010983 }
10984 break;
10985 }
10986 case PyUnicode_2BYTE_KIND:
10987 {
10988 switch(kind2) {
10989 case PyUnicode_1BYTE_KIND:
10990 COMPARE(Py_UCS2, Py_UCS1);
10991 break;
10992 case PyUnicode_2BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020010993 {
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010994 COMPARE(Py_UCS2, Py_UCS2);
10995 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020010996 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010997 case PyUnicode_4BYTE_KIND:
10998 COMPARE(Py_UCS2, Py_UCS4);
10999 break;
11000 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011001 Py_UNREACHABLE();
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011002 }
11003 break;
11004 }
11005 case PyUnicode_4BYTE_KIND:
11006 {
11007 switch(kind2) {
11008 case PyUnicode_1BYTE_KIND:
11009 COMPARE(Py_UCS4, Py_UCS1);
11010 break;
11011 case PyUnicode_2BYTE_KIND:
11012 COMPARE(Py_UCS4, Py_UCS2);
11013 break;
11014 case PyUnicode_4BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020011015 {
11016#if defined(HAVE_WMEMCMP) && SIZEOF_WCHAR_T == 4
11017 int cmp = wmemcmp((wchar_t *)data1, (wchar_t *)data2, len);
11018 /* normalize result of wmemcmp() into the range [-1; 1] */
11019 if (cmp < 0)
11020 return -1;
11021 if (cmp > 0)
11022 return 1;
11023#else
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011024 COMPARE(Py_UCS4, Py_UCS4);
Victor Stinnercd777ea2013-04-08 22:43:44 +020011025#endif
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011026 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020011027 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011028 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011029 Py_UNREACHABLE();
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011030 }
11031 break;
11032 }
11033 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011034 Py_UNREACHABLE();
Marc-André Lemburge5034372000-08-08 08:04:29 +000011035 }
11036
Victor Stinner770e19e2012-10-04 22:59:45 +020011037 if (len1 == len2)
11038 return 0;
11039 if (len1 < len2)
11040 return -1;
11041 else
11042 return 1;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011043
11044#undef COMPARE
Marc-André Lemburge5034372000-08-08 08:04:29 +000011045}
11046
Benjamin Peterson621b4302016-09-09 13:54:34 -070011047static int
Victor Stinnere5567ad2012-10-23 02:48:49 +020011048unicode_compare_eq(PyObject *str1, PyObject *str2)
11049{
11050 int kind;
11051 void *data1, *data2;
11052 Py_ssize_t len;
11053 int cmp;
11054
Victor Stinnere5567ad2012-10-23 02:48:49 +020011055 len = PyUnicode_GET_LENGTH(str1);
11056 if (PyUnicode_GET_LENGTH(str2) != len)
11057 return 0;
11058 kind = PyUnicode_KIND(str1);
11059 if (PyUnicode_KIND(str2) != kind)
11060 return 0;
11061 data1 = PyUnicode_DATA(str1);
11062 data2 = PyUnicode_DATA(str2);
11063
11064 cmp = memcmp(data1, data2, len * kind);
11065 return (cmp == 0);
11066}
11067
11068
Alexander Belopolsky40018472011-02-26 01:02:56 +000011069int
11070PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011071{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011072 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
11073 if (PyUnicode_READY(left) == -1 ||
11074 PyUnicode_READY(right) == -1)
11075 return -1;
Victor Stinnerf0c7b2a2013-11-04 11:27:14 +010011076
11077 /* a string is equal to itself */
11078 if (left == right)
11079 return 0;
11080
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011081 return unicode_compare(left, right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011082 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +000011083 PyErr_Format(PyExc_TypeError,
11084 "Can't compare %.100s and %.100s",
Victor Stinner58ac7002020-02-07 03:04:21 +010011085 Py_TYPE(left)->tp_name,
11086 Py_TYPE(right)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011087 return -1;
11088}
11089
Martin v. Löwis5b222132007-06-10 09:51:05 +000011090int
11091PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
11092{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011093 Py_ssize_t i;
11094 int kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011095 Py_UCS4 chr;
Serhiy Storchaka419967b2016-12-06 00:13:34 +020011096 const unsigned char *ustr = (const unsigned char *)str;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011097
Victor Stinner910337b2011-10-03 03:20:16 +020011098 assert(_PyUnicode_CHECK(uni));
Serhiy Storchaka419967b2016-12-06 00:13:34 +020011099 if (!PyUnicode_IS_READY(uni)) {
11100 const wchar_t *ws = _PyUnicode_WSTR(uni);
11101 /* Compare Unicode string and source character set string */
11102 for (i = 0; (chr = ws[i]) && ustr[i]; i++) {
11103 if (chr != ustr[i])
11104 return (chr < ustr[i]) ? -1 : 1;
11105 }
11106 /* This check keeps Python strings that end in '\0' from comparing equal
11107 to C strings identical up to that point. */
11108 if (_PyUnicode_WSTR_LENGTH(uni) != i || chr)
11109 return 1; /* uni is longer */
11110 if (ustr[i])
11111 return -1; /* str is longer */
11112 return 0;
11113 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011114 kind = PyUnicode_KIND(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010011115 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnera6b9b072013-10-30 18:27:13 +010011116 const void *data = PyUnicode_1BYTE_DATA(uni);
Victor Stinnere1b15922013-11-03 13:53:12 +010011117 size_t len1 = (size_t)PyUnicode_GET_LENGTH(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010011118 size_t len, len2 = strlen(str);
11119 int cmp;
11120
11121 len = Py_MIN(len1, len2);
11122 cmp = memcmp(data, str, len);
Victor Stinner21ea21e2013-11-04 11:28:26 +010011123 if (cmp != 0) {
11124 if (cmp < 0)
11125 return -1;
11126 else
11127 return 1;
11128 }
Victor Stinner602f7cf2013-10-29 23:31:50 +010011129 if (len1 > len2)
11130 return 1; /* uni is longer */
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011131 if (len1 < len2)
Victor Stinner602f7cf2013-10-29 23:31:50 +010011132 return -1; /* str is longer */
11133 return 0;
11134 }
11135 else {
11136 void *data = PyUnicode_DATA(uni);
11137 /* Compare Unicode string and source character set string */
11138 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
Victor Stinner12174a52014-08-15 23:17:38 +020011139 if (chr != (unsigned char)str[i])
Victor Stinner602f7cf2013-10-29 23:31:50 +010011140 return (chr < (unsigned char)(str[i])) ? -1 : 1;
11141 /* This check keeps Python strings that end in '\0' from comparing equal
11142 to C strings identical up to that point. */
11143 if (PyUnicode_GET_LENGTH(uni) != i || chr)
11144 return 1; /* uni is longer */
11145 if (str[i])
11146 return -1; /* str is longer */
11147 return 0;
11148 }
Martin v. Löwis5b222132007-06-10 09:51:05 +000011149}
11150
Serhiy Storchakaf4934ea2016-11-16 10:17:58 +020011151static int
11152non_ready_unicode_equal_to_ascii_string(PyObject *unicode, const char *str)
11153{
11154 size_t i, len;
11155 const wchar_t *p;
11156 len = (size_t)_PyUnicode_WSTR_LENGTH(unicode);
11157 if (strlen(str) != len)
11158 return 0;
11159 p = _PyUnicode_WSTR(unicode);
11160 assert(p);
11161 for (i = 0; i < len; i++) {
11162 unsigned char c = (unsigned char)str[i];
Serhiy Storchaka292dd1b2016-11-16 16:12:34 +020011163 if (c >= 128 || p[i] != (wchar_t)c)
Serhiy Storchakaf4934ea2016-11-16 10:17:58 +020011164 return 0;
11165 }
11166 return 1;
11167}
11168
11169int
11170_PyUnicode_EqualToASCIIString(PyObject *unicode, const char *str)
11171{
11172 size_t len;
11173 assert(_PyUnicode_CHECK(unicode));
Serhiy Storchakaa83a6a32016-11-16 20:02:44 +020011174 assert(str);
11175#ifndef NDEBUG
11176 for (const char *p = str; *p; p++) {
11177 assert((unsigned char)*p < 128);
11178 }
11179#endif
Serhiy Storchakaf4934ea2016-11-16 10:17:58 +020011180 if (PyUnicode_READY(unicode) == -1) {
11181 /* Memory error or bad data */
11182 PyErr_Clear();
11183 return non_ready_unicode_equal_to_ascii_string(unicode, str);
11184 }
11185 if (!PyUnicode_IS_ASCII(unicode))
11186 return 0;
11187 len = (size_t)PyUnicode_GET_LENGTH(unicode);
11188 return strlen(str) == len &&
11189 memcmp(PyUnicode_1BYTE_DATA(unicode), str, len) == 0;
11190}
11191
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011192int
11193_PyUnicode_EqualToASCIIId(PyObject *left, _Py_Identifier *right)
11194{
11195 PyObject *right_uni;
11196 Py_hash_t hash;
11197
11198 assert(_PyUnicode_CHECK(left));
11199 assert(right->string);
Serhiy Storchakaa83a6a32016-11-16 20:02:44 +020011200#ifndef NDEBUG
11201 for (const char *p = right->string; *p; p++) {
11202 assert((unsigned char)*p < 128);
11203 }
11204#endif
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011205
11206 if (PyUnicode_READY(left) == -1) {
11207 /* memory error or bad data */
11208 PyErr_Clear();
11209 return non_ready_unicode_equal_to_ascii_string(left, right->string);
11210 }
11211
11212 if (!PyUnicode_IS_ASCII(left))
11213 return 0;
11214
11215 right_uni = _PyUnicode_FromId(right); /* borrowed */
11216 if (right_uni == NULL) {
11217 /* memory error or bad data */
11218 PyErr_Clear();
11219 return _PyUnicode_EqualToASCIIString(left, right->string);
11220 }
11221
11222 if (left == right_uni)
11223 return 1;
11224
11225 if (PyUnicode_CHECK_INTERNED(left))
11226 return 0;
11227
INADA Naoki7cc95f52018-01-28 02:07:09 +090011228 assert(_PyUnicode_HASH(right_uni) != -1);
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011229 hash = _PyUnicode_HASH(left);
11230 if (hash != -1 && hash != _PyUnicode_HASH(right_uni))
11231 return 0;
11232
11233 return unicode_compare_eq(left, right_uni);
11234}
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000011235
Alexander Belopolsky40018472011-02-26 01:02:56 +000011236PyObject *
11237PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011238{
11239 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011240
Victor Stinnere5567ad2012-10-23 02:48:49 +020011241 if (!PyUnicode_Check(left) || !PyUnicode_Check(right))
11242 Py_RETURN_NOTIMPLEMENTED;
11243
11244 if (PyUnicode_READY(left) == -1 ||
11245 PyUnicode_READY(right) == -1)
11246 return NULL;
11247
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010011248 if (left == right) {
11249 switch (op) {
11250 case Py_EQ:
11251 case Py_LE:
11252 case Py_GE:
11253 /* a string is equal to itself */
stratakise8b19652017-11-02 11:32:54 +010011254 Py_RETURN_TRUE;
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010011255 case Py_NE:
11256 case Py_LT:
11257 case Py_GT:
stratakise8b19652017-11-02 11:32:54 +010011258 Py_RETURN_FALSE;
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010011259 default:
11260 PyErr_BadArgument();
11261 return NULL;
11262 }
11263 }
11264 else if (op == Py_EQ || op == Py_NE) {
Victor Stinnere5567ad2012-10-23 02:48:49 +020011265 result = unicode_compare_eq(left, right);
Victor Stinnerc8bc5372013-11-04 11:08:10 +010011266 result ^= (op == Py_NE);
stratakise8b19652017-11-02 11:32:54 +010011267 return PyBool_FromLong(result);
Victor Stinnere5567ad2012-10-23 02:48:49 +020011268 }
11269 else {
Victor Stinner90db9c42012-10-04 21:53:50 +020011270 result = unicode_compare(left, right);
stratakise8b19652017-11-02 11:32:54 +010011271 Py_RETURN_RICHCOMPARE(result, 0, op);
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011272 }
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011273}
11274
Alexander Belopolsky40018472011-02-26 01:02:56 +000011275int
Raymond Hettingerac2ef652015-07-04 16:04:44 -070011276_PyUnicode_EQ(PyObject *aa, PyObject *bb)
11277{
11278 return unicode_eq(aa, bb);
11279}
11280
11281int
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011282PyUnicode_Contains(PyObject *str, PyObject *substr)
Guido van Rossum403d68b2000-03-13 15:55:09 +000011283{
Victor Stinner77282cb2013-04-14 19:22:47 +020011284 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011285 void *buf1, *buf2;
11286 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011287 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011288
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011289 if (!PyUnicode_Check(substr)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011290 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020011291 "'in <string>' requires string as left operand, not %.100s",
11292 Py_TYPE(substr)->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011293 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011294 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011295 if (PyUnicode_READY(substr) == -1)
Thomas Wouters477c8d52006-05-27 19:21:47 +000011296 return -1;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011297 if (ensure_unicode(str) < 0)
11298 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011299
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011300 kind1 = PyUnicode_KIND(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011301 kind2 = PyUnicode_KIND(substr);
11302 if (kind1 < kind2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011303 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011304 len1 = PyUnicode_GET_LENGTH(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011305 len2 = PyUnicode_GET_LENGTH(substr);
11306 if (len1 < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011307 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011308 buf1 = PyUnicode_DATA(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011309 buf2 = PyUnicode_DATA(substr);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011310 if (len2 == 1) {
11311 Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
11312 result = findchar((const char *)buf1, kind1, len1, ch, 1) != -1;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011313 return result;
11314 }
11315 if (kind2 != kind1) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011316 buf2 = _PyUnicode_AsKind(substr, kind1);
11317 if (!buf2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011318 return -1;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011319 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011320
Victor Stinner77282cb2013-04-14 19:22:47 +020011321 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011322 case PyUnicode_1BYTE_KIND:
11323 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
11324 break;
11325 case PyUnicode_2BYTE_KIND:
11326 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
11327 break;
11328 case PyUnicode_4BYTE_KIND:
11329 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
11330 break;
11331 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011332 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011333 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011334
Victor Stinner77282cb2013-04-14 19:22:47 +020011335 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011336 PyMem_Free(buf2);
11337
Guido van Rossum403d68b2000-03-13 15:55:09 +000011338 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011339}
11340
Guido van Rossumd57fd912000-03-10 22:53:23 +000011341/* Concat to string or Unicode object giving a new Unicode object. */
11342
Alexander Belopolsky40018472011-02-26 01:02:56 +000011343PyObject *
11344PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011345{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011346 PyObject *result;
Victor Stinner127226b2011-10-13 01:12:34 +020011347 Py_UCS4 maxchar, maxchar2;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011348 Py_ssize_t left_len, right_len, new_len;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011349
Serhiy Storchaka004e03f2017-03-19 19:38:42 +020011350 if (ensure_unicode(left) < 0)
11351 return NULL;
11352
11353 if (!PyUnicode_Check(right)) {
11354 PyErr_Format(PyExc_TypeError,
11355 "can only concatenate str (not \"%.200s\") to str",
Victor Stinner58ac7002020-02-07 03:04:21 +010011356 Py_TYPE(right)->tp_name);
Serhiy Storchaka004e03f2017-03-19 19:38:42 +020011357 return NULL;
11358 }
11359 if (PyUnicode_READY(right) < 0)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011360 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011361
11362 /* Shortcuts */
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011363 if (left == unicode_empty)
11364 return PyUnicode_FromObject(right);
11365 if (right == unicode_empty)
11366 return PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011367
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011368 left_len = PyUnicode_GET_LENGTH(left);
11369 right_len = PyUnicode_GET_LENGTH(right);
11370 if (left_len > PY_SSIZE_T_MAX - right_len) {
Victor Stinner488fa492011-12-12 00:01:39 +010011371 PyErr_SetString(PyExc_OverflowError,
11372 "strings are too large to concat");
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011373 return NULL;
Victor Stinner488fa492011-12-12 00:01:39 +010011374 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011375 new_len = left_len + right_len;
Victor Stinner488fa492011-12-12 00:01:39 +010011376
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011377 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11378 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Benjamin Peterson7e303732013-06-10 09:19:46 -070011379 maxchar = Py_MAX(maxchar, maxchar2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011380
Guido van Rossumd57fd912000-03-10 22:53:23 +000011381 /* Concat the two Unicode strings */
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011382 result = PyUnicode_New(new_len, maxchar);
11383 if (result == NULL)
11384 return NULL;
11385 _PyUnicode_FastCopyCharacters(result, 0, left, 0, left_len);
11386 _PyUnicode_FastCopyCharacters(result, left_len, right, 0, right_len);
11387 assert(_PyUnicode_CheckConsistency(result, 1));
11388 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011389}
11390
Walter Dörwald1ab83302007-05-18 17:15:44 +000011391void
Victor Stinner23e56682011-10-03 03:54:37 +020011392PyUnicode_Append(PyObject **p_left, PyObject *right)
Walter Dörwald1ab83302007-05-18 17:15:44 +000011393{
Victor Stinner23e56682011-10-03 03:54:37 +020011394 PyObject *left, *res;
Victor Stinner488fa492011-12-12 00:01:39 +010011395 Py_UCS4 maxchar, maxchar2;
11396 Py_ssize_t left_len, right_len, new_len;
Victor Stinner23e56682011-10-03 03:54:37 +020011397
11398 if (p_left == NULL) {
11399 if (!PyErr_Occurred())
11400 PyErr_BadInternalCall();
Benjamin Peterson14339b62009-01-31 16:36:08 +000011401 return;
11402 }
Victor Stinner23e56682011-10-03 03:54:37 +020011403 left = *p_left;
Victor Stinnerf0335102013-04-14 19:13:03 +020011404 if (right == NULL || left == NULL
11405 || !PyUnicode_Check(left) || !PyUnicode_Check(right)) {
Victor Stinner23e56682011-10-03 03:54:37 +020011406 if (!PyErr_Occurred())
11407 PyErr_BadInternalCall();
11408 goto error;
11409 }
11410
Benjamin Petersonbac79492012-01-14 13:34:47 -050011411 if (PyUnicode_READY(left) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011412 goto error;
Benjamin Petersonbac79492012-01-14 13:34:47 -050011413 if (PyUnicode_READY(right) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011414 goto error;
11415
Victor Stinner488fa492011-12-12 00:01:39 +010011416 /* Shortcuts */
11417 if (left == unicode_empty) {
11418 Py_DECREF(left);
11419 Py_INCREF(right);
11420 *p_left = right;
11421 return;
11422 }
11423 if (right == unicode_empty)
11424 return;
11425
11426 left_len = PyUnicode_GET_LENGTH(left);
11427 right_len = PyUnicode_GET_LENGTH(right);
11428 if (left_len > PY_SSIZE_T_MAX - right_len) {
11429 PyErr_SetString(PyExc_OverflowError,
11430 "strings are too large to concat");
11431 goto error;
11432 }
11433 new_len = left_len + right_len;
11434
11435 if (unicode_modifiable(left)
11436 && PyUnicode_CheckExact(right)
11437 && PyUnicode_KIND(right) <= PyUnicode_KIND(left)
Victor Stinnerb0923652011-10-04 01:17:31 +020011438 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
11439 to change the structure size, but characters are stored just after
Georg Brandl7597add2011-10-05 16:36:47 +020011440 the structure, and so it requires to move all characters which is
Victor Stinnerb0923652011-10-04 01:17:31 +020011441 not so different than duplicating the string. */
Victor Stinner488fa492011-12-12 00:01:39 +010011442 && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
11443 {
11444 /* append inplace */
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011445 if (unicode_resize(p_left, new_len) != 0)
Victor Stinner488fa492011-12-12 00:01:39 +010011446 goto error;
Victor Stinnerf0335102013-04-14 19:13:03 +020011447
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011448 /* copy 'right' into the newly allocated area of 'left' */
11449 _PyUnicode_FastCopyCharacters(*p_left, left_len, right, 0, right_len);
Victor Stinner23e56682011-10-03 03:54:37 +020011450 }
Victor Stinner488fa492011-12-12 00:01:39 +010011451 else {
11452 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11453 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Benjamin Peterson7e303732013-06-10 09:19:46 -070011454 maxchar = Py_MAX(maxchar, maxchar2);
Victor Stinner23e56682011-10-03 03:54:37 +020011455
Victor Stinner488fa492011-12-12 00:01:39 +010011456 /* Concat the two Unicode strings */
11457 res = PyUnicode_New(new_len, maxchar);
11458 if (res == NULL)
11459 goto error;
Victor Stinnerd3f08822012-05-29 12:57:52 +020011460 _PyUnicode_FastCopyCharacters(res, 0, left, 0, left_len);
11461 _PyUnicode_FastCopyCharacters(res, left_len, right, 0, right_len);
Victor Stinner488fa492011-12-12 00:01:39 +010011462 Py_DECREF(left);
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011463 *p_left = res;
Victor Stinner488fa492011-12-12 00:01:39 +010011464 }
11465 assert(_PyUnicode_CheckConsistency(*p_left, 1));
Victor Stinner23e56682011-10-03 03:54:37 +020011466 return;
11467
11468error:
Victor Stinner488fa492011-12-12 00:01:39 +010011469 Py_CLEAR(*p_left);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011470}
11471
11472void
11473PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
11474{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011475 PyUnicode_Append(pleft, right);
11476 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011477}
11478
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011479/*
11480Wraps stringlib_parse_args_finds() and additionally ensures that the
11481first argument is a unicode object.
11482*/
11483
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070011484static inline int
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011485parse_args_finds_unicode(const char * function_name, PyObject *args,
11486 PyObject **substring,
11487 Py_ssize_t *start, Py_ssize_t *end)
11488{
11489 if(stringlib_parse_args_finds(function_name, args, substring,
11490 start, end)) {
11491 if (ensure_unicode(*substring) < 0)
11492 return 0;
11493 return 1;
11494 }
11495 return 0;
11496}
11497
/* Text of count__doc__: signature line plus description for
   S.count(sub[, start[, end]]).  The literal is program text, so its
   wording and spacing are kept exactly as found. */
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011498PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011499 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011500\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011501Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011502string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011503interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011504
11505static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011506unicode_count(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011507{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011508 PyObject *substring = NULL; /* initialize to fix a compiler warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +000011509 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011510 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011511 PyObject *result;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011512 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011513 void *buf1, *buf2;
11514 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011515
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011516 if (!parse_args_finds_unicode("count", args, &substring, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000011517 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000011518
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011519 kind1 = PyUnicode_KIND(self);
11520 kind2 = PyUnicode_KIND(substring);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011521 if (kind1 < kind2)
Benjamin Petersonb63f49f2012-05-03 18:31:07 -040011522 return PyLong_FromLong(0);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011523
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011524 len1 = PyUnicode_GET_LENGTH(self);
11525 len2 = PyUnicode_GET_LENGTH(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011526 ADJUST_INDICES(start, end, len1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011527 if (end - start < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011528 return PyLong_FromLong(0);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011529
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011530 buf1 = PyUnicode_DATA(self);
11531 buf2 = PyUnicode_DATA(substring);
11532 if (kind2 != kind1) {
11533 buf2 = _PyUnicode_AsKind(substring, kind1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011534 if (!buf2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011535 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011536 }
11537 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011538 case PyUnicode_1BYTE_KIND:
11539 iresult = ucs1lib_count(
11540 ((Py_UCS1*)buf1) + start, end - start,
11541 buf2, len2, PY_SSIZE_T_MAX
11542 );
11543 break;
11544 case PyUnicode_2BYTE_KIND:
11545 iresult = ucs2lib_count(
11546 ((Py_UCS2*)buf1) + start, end - start,
11547 buf2, len2, PY_SSIZE_T_MAX
11548 );
11549 break;
11550 case PyUnicode_4BYTE_KIND:
11551 iresult = ucs4lib_count(
11552 ((Py_UCS4*)buf1) + start, end - start,
11553 buf2, len2, PY_SSIZE_T_MAX
11554 );
11555 break;
11556 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011557 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011558 }
11559
11560 result = PyLong_FromSsize_t(iresult);
11561
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011562 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011563 PyMem_Free(buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011564
Guido van Rossumd57fd912000-03-10 22:53:23 +000011565 return result;
11566}
11567
INADA Naoki3ae20562017-01-16 20:41:20 +090011568/*[clinic input]
11569str.encode as unicode_encode
11570
11571 encoding: str(c_default="NULL") = 'utf-8'
11572 The encoding in which to encode the string.
11573 errors: str(c_default="NULL") = 'strict'
11574 The error handling scheme to use for encoding errors.
11575 The default is 'strict' meaning that encoding errors raise a
11576 UnicodeEncodeError. Other possible values are 'ignore', 'replace' and
11577 'xmlcharrefreplace' as well as any other name registered with
11578 codecs.register_error that can handle UnicodeEncodeErrors.
11579
11580Encode the string using the codec registered for encoding.
11581[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011582
11583static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090011584unicode_encode_impl(PyObject *self, const char *encoding, const char *errors)
INADA Naoki15f94592017-01-16 21:49:13 +090011585/*[clinic end generated code: output=bf78b6e2a9470e3c input=f0a9eb293d08fe02]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011586{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011587 return PyUnicode_AsEncodedString(self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +000011588}
11589
INADA Naoki3ae20562017-01-16 20:41:20 +090011590/*[clinic input]
11591str.expandtabs as unicode_expandtabs
Guido van Rossumd57fd912000-03-10 22:53:23 +000011592
INADA Naoki3ae20562017-01-16 20:41:20 +090011593 tabsize: int = 8
11594
11595Return a copy where all tab characters are expanded using spaces.
11596
11597If tabsize is not given, a tab size of 8 characters is assumed.
11598[clinic start generated code]*/
11599
11600static PyObject *
11601unicode_expandtabs_impl(PyObject *self, int tabsize)
11602/*[clinic end generated code: output=3457c5dcee26928f input=8a01914034af4c85]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011603{
Antoine Pitroue71d5742011-10-04 15:55:09 +020011604 Py_ssize_t i, j, line_pos, src_len, incr;
11605 Py_UCS4 ch;
11606 PyObject *u;
11607 void *src_data, *dest_data;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011608 int kind;
Antoine Pitroue19aa382011-10-04 16:04:01 +020011609 int found;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011610
Antoine Pitrou22425222011-10-04 19:10:51 +020011611 if (PyUnicode_READY(self) == -1)
11612 return NULL;
11613
Thomas Wouters7e474022000-07-16 12:04:32 +000011614 /* First pass: determine size of output string */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011615 src_len = PyUnicode_GET_LENGTH(self);
11616 i = j = line_pos = 0;
11617 kind = PyUnicode_KIND(self);
11618 src_data = PyUnicode_DATA(self);
Antoine Pitroue19aa382011-10-04 16:04:01 +020011619 found = 0;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011620 for (; i < src_len; i++) {
11621 ch = PyUnicode_READ(kind, src_data, i);
11622 if (ch == '\t') {
Antoine Pitroue19aa382011-10-04 16:04:01 +020011623 found = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000011624 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011625 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
Benjamin Peterson29060642009-01-31 22:14:21 +000011626 if (j > PY_SSIZE_T_MAX - incr)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011627 goto overflow;
11628 line_pos += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011629 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011630 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011631 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011632 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000011633 if (j > PY_SSIZE_T_MAX - 1)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011634 goto overflow;
11635 line_pos++;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011636 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011637 if (ch == '\n' || ch == '\r')
11638 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011639 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011640 }
Victor Stinnerc4b49542011-12-11 22:44:26 +010011641 if (!found)
11642 return unicode_result_unchanged(self);
Guido van Rossumcd16bf62007-06-13 18:07:49 +000011643
Guido van Rossumd57fd912000-03-10 22:53:23 +000011644 /* Second pass: create output string and fill it */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011645 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011646 if (!u)
11647 return NULL;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011648 dest_data = PyUnicode_DATA(u);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011649
Antoine Pitroue71d5742011-10-04 15:55:09 +020011650 i = j = line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011651
Antoine Pitroue71d5742011-10-04 15:55:09 +020011652 for (; i < src_len; i++) {
11653 ch = PyUnicode_READ(kind, src_data, i);
11654 if (ch == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000011655 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011656 incr = tabsize - (line_pos % tabsize);
11657 line_pos += incr;
Victor Stinner59423e32018-11-26 13:40:01 +010011658 unicode_fill(kind, dest_data, ' ', j, incr);
Victor Stinnerda79e632012-02-22 13:37:04 +010011659 j += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011660 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011661 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011662 else {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011663 line_pos++;
11664 PyUnicode_WRITE(kind, dest_data, j, ch);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011665 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011666 if (ch == '\n' || ch == '\r')
11667 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011668 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011669 }
11670 assert (j == PyUnicode_GET_LENGTH(u));
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010011671 return unicode_result(u);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011672
Antoine Pitroue71d5742011-10-04 15:55:09 +020011673 overflow:
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011674 PyErr_SetString(PyExc_OverflowError, "new string is too long");
11675 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011676}
11677
/* Text of find__doc__: signature line plus description for
   S.find(sub[, start[, end]]).  The literal is program text, so its
   wording and spacing are kept exactly as found. */
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011678PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011679 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011680\n\
11681Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080011682such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011683arguments start and end are interpreted as in slice notation.\n\
11684\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011685Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011686
11687static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011688unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011689{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011690 /* initialize variables to prevent gcc warning */
11691 PyObject *substring = NULL;
11692 Py_ssize_t start = 0;
11693 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011694 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011695
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011696 if (!parse_args_finds_unicode("find", args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011697 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011698
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011699 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011700 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011701
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011702 result = any_find_slice(self, substring, start, end, 1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011703
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011704 if (result == -2)
11705 return NULL;
11706
Christian Heimes217cfd12007-12-02 14:31:20 +000011707 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011708}
11709
11710static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011711unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011712{
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011713 void *data;
11714 enum PyUnicode_Kind kind;
11715 Py_UCS4 ch;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011716
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +030011717 if (!PyUnicode_Check(self)) {
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011718 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011719 return NULL;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011720 }
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +030011721 if (PyUnicode_READY(self) == -1) {
11722 return NULL;
11723 }
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011724 if (index < 0 || index >= PyUnicode_GET_LENGTH(self)) {
11725 PyErr_SetString(PyExc_IndexError, "string index out of range");
11726 return NULL;
11727 }
11728 kind = PyUnicode_KIND(self);
11729 data = PyUnicode_DATA(self);
11730 ch = PyUnicode_READ(kind, data, index);
Victor Stinner985a82a2014-01-03 12:53:47 +010011731 return unicode_char(ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011732}
11733
Guido van Rossumc2504932007-09-18 19:42:40 +000011734/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +010011735 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +000011736static Py_hash_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011737unicode_hash(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011738{
Gregory P. Smith27cbcd62012-12-10 18:15:46 -080011739 Py_uhash_t x; /* Unsigned for defined overflow behavior. */
Guido van Rossumc2504932007-09-18 19:42:40 +000011740
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011741#ifdef Py_DEBUG
Benjamin Peterson69e97272012-02-21 11:08:50 -050011742 assert(_Py_HashSecret_Initialized);
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011743#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011744 if (_PyUnicode_HASH(self) != -1)
11745 return _PyUnicode_HASH(self);
11746 if (PyUnicode_READY(self) == -1)
11747 return -1;
animalizea1d14252019-01-02 20:16:06 +080011748
Christian Heimes985ecdc2013-11-20 11:46:18 +010011749 x = _Py_HashBytes(PyUnicode_DATA(self),
11750 PyUnicode_GET_LENGTH(self) * PyUnicode_KIND(self));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011751 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +000011752 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011753}
11754
/* Docstring text for S.index(); see unicode_index() below. */
PyDoc_STRVAR(index__doc__,
             "S.index(sub[, start[, end]]) -> int\n\
\n\
Return the lowest index in S where substring sub is found,\n\
such that sub is contained within S[start:end]. Optional\n\
arguments start and end are interpreted as in slice notation.\n\
\n\
Raises ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011763
11764static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011765unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011766{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011767 /* initialize variables to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +000011768 Py_ssize_t result;
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011769 PyObject *substring = NULL;
11770 Py_ssize_t start = 0;
11771 Py_ssize_t end = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011772
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011773 if (!parse_args_finds_unicode("index", args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011774 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011775
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011776 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011777 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011778
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011779 result = any_find_slice(self, substring, start, end, 1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011780
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011781 if (result == -2)
11782 return NULL;
11783
Guido van Rossumd57fd912000-03-10 22:53:23 +000011784 if (result < 0) {
11785 PyErr_SetString(PyExc_ValueError, "substring not found");
11786 return NULL;
11787 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011788
Christian Heimes217cfd12007-12-02 14:31:20 +000011789 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011790}
11791
INADA Naoki3ae20562017-01-16 20:41:20 +090011792/*[clinic input]
INADA Naokia49ac992018-01-27 14:06:21 +090011793str.isascii as unicode_isascii
11794
11795Return True if all characters in the string are ASCII, False otherwise.
11796
11797ASCII characters have code points in the range U+0000-U+007F.
11798Empty string is ASCII too.
11799[clinic start generated code]*/
11800
11801static PyObject *
11802unicode_isascii_impl(PyObject *self)
11803/*[clinic end generated code: output=c5910d64b5a8003f input=5a43cbc6399621d5]*/
11804{
11805 if (PyUnicode_READY(self) == -1) {
11806 return NULL;
11807 }
11808 return PyBool_FromLong(PyUnicode_IS_ASCII(self));
11809}
11810
11811/*[clinic input]
INADA Naoki3ae20562017-01-16 20:41:20 +090011812str.islower as unicode_islower
Guido van Rossumd57fd912000-03-10 22:53:23 +000011813
INADA Naoki3ae20562017-01-16 20:41:20 +090011814Return True if the string is a lowercase string, False otherwise.
11815
11816A string is lowercase if all cased characters in the string are lowercase and
11817there is at least one cased character in the string.
11818[clinic start generated code]*/
11819
11820static PyObject *
11821unicode_islower_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011822/*[clinic end generated code: output=dbd41995bd005b81 input=acec65ac6821ae47]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011823{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011824 Py_ssize_t i, length;
11825 int kind;
11826 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011827 int cased;
11828
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011829 if (PyUnicode_READY(self) == -1)
11830 return NULL;
11831 length = PyUnicode_GET_LENGTH(self);
11832 kind = PyUnicode_KIND(self);
11833 data = PyUnicode_DATA(self);
11834
Guido van Rossumd57fd912000-03-10 22:53:23 +000011835 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011836 if (length == 1)
11837 return PyBool_FromLong(
11838 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011839
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011840 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011841 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011842 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011843
Guido van Rossumd57fd912000-03-10 22:53:23 +000011844 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011845 for (i = 0; i < length; i++) {
11846 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011847
Benjamin Peterson29060642009-01-31 22:14:21 +000011848 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011849 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000011850 else if (!cased && Py_UNICODE_ISLOWER(ch))
11851 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011852 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011853 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011854}
11855
INADA Naoki3ae20562017-01-16 20:41:20 +090011856/*[clinic input]
11857str.isupper as unicode_isupper
Guido van Rossumd57fd912000-03-10 22:53:23 +000011858
INADA Naoki3ae20562017-01-16 20:41:20 +090011859Return True if the string is an uppercase string, False otherwise.
11860
11861A string is uppercase if all cased characters in the string are uppercase and
11862there is at least one cased character in the string.
11863[clinic start generated code]*/
11864
11865static PyObject *
11866unicode_isupper_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011867/*[clinic end generated code: output=049209c8e7f15f59 input=e9b1feda5d17f2d3]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011868{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011869 Py_ssize_t i, length;
11870 int kind;
11871 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011872 int cased;
11873
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011874 if (PyUnicode_READY(self) == -1)
11875 return NULL;
11876 length = PyUnicode_GET_LENGTH(self);
11877 kind = PyUnicode_KIND(self);
11878 data = PyUnicode_DATA(self);
11879
Guido van Rossumd57fd912000-03-10 22:53:23 +000011880 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011881 if (length == 1)
11882 return PyBool_FromLong(
11883 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011884
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011885 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011886 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011887 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011888
Guido van Rossumd57fd912000-03-10 22:53:23 +000011889 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011890 for (i = 0; i < length; i++) {
11891 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011892
Benjamin Peterson29060642009-01-31 22:14:21 +000011893 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011894 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000011895 else if (!cased && Py_UNICODE_ISUPPER(ch))
11896 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011897 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011898 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011899}
11900
INADA Naoki3ae20562017-01-16 20:41:20 +090011901/*[clinic input]
11902str.istitle as unicode_istitle
Guido van Rossumd57fd912000-03-10 22:53:23 +000011903
INADA Naoki3ae20562017-01-16 20:41:20 +090011904Return True if the string is a title-cased string, False otherwise.
11905
11906In a title-cased string, upper- and title-case characters may only
11907follow uncased characters and lowercase characters only cased ones.
11908[clinic start generated code]*/
11909
11910static PyObject *
11911unicode_istitle_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011912/*[clinic end generated code: output=e9bf6eb91f5d3f0e input=98d32bd2e1f06f8c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011913{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011914 Py_ssize_t i, length;
11915 int kind;
11916 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011917 int cased, previous_is_cased;
11918
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011919 if (PyUnicode_READY(self) == -1)
11920 return NULL;
11921 length = PyUnicode_GET_LENGTH(self);
11922 kind = PyUnicode_KIND(self);
11923 data = PyUnicode_DATA(self);
11924
Guido van Rossumd57fd912000-03-10 22:53:23 +000011925 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011926 if (length == 1) {
11927 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11928 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
11929 (Py_UNICODE_ISUPPER(ch) != 0));
11930 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011931
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011932 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011933 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011934 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011935
Guido van Rossumd57fd912000-03-10 22:53:23 +000011936 cased = 0;
11937 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011938 for (i = 0; i < length; i++) {
11939 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011940
Benjamin Peterson29060642009-01-31 22:14:21 +000011941 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
11942 if (previous_is_cased)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011943 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000011944 previous_is_cased = 1;
11945 cased = 1;
11946 }
11947 else if (Py_UNICODE_ISLOWER(ch)) {
11948 if (!previous_is_cased)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011949 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000011950 previous_is_cased = 1;
11951 cased = 1;
11952 }
11953 else
11954 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011955 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011956 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011957}
11958
INADA Naoki3ae20562017-01-16 20:41:20 +090011959/*[clinic input]
11960str.isspace as unicode_isspace
Guido van Rossumd57fd912000-03-10 22:53:23 +000011961
INADA Naoki3ae20562017-01-16 20:41:20 +090011962Return True if the string is a whitespace string, False otherwise.
11963
11964A string is whitespace if all characters in the string are whitespace and there
11965is at least one character in the string.
11966[clinic start generated code]*/
11967
11968static PyObject *
11969unicode_isspace_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011970/*[clinic end generated code: output=163a63bfa08ac2b9 input=fe462cb74f8437d8]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011971{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011972 Py_ssize_t i, length;
11973 int kind;
11974 void *data;
11975
11976 if (PyUnicode_READY(self) == -1)
11977 return NULL;
11978 length = PyUnicode_GET_LENGTH(self);
11979 kind = PyUnicode_KIND(self);
11980 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011981
Guido van Rossumd57fd912000-03-10 22:53:23 +000011982 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011983 if (length == 1)
11984 return PyBool_FromLong(
11985 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011986
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011987 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011988 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011989 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011990
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011991 for (i = 0; i < length; i++) {
11992 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011993 if (!Py_UNICODE_ISSPACE(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011994 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011995 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011996 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011997}
11998
INADA Naoki3ae20562017-01-16 20:41:20 +090011999/*[clinic input]
12000str.isalpha as unicode_isalpha
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012001
INADA Naoki3ae20562017-01-16 20:41:20 +090012002Return True if the string is an alphabetic string, False otherwise.
12003
12004A string is alphabetic if all characters in the string are alphabetic and there
12005is at least one character in the string.
12006[clinic start generated code]*/
12007
12008static PyObject *
12009unicode_isalpha_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012010/*[clinic end generated code: output=cc81b9ac3883ec4f input=d0fd18a96cbca5eb]*/
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012011{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012012 Py_ssize_t i, length;
12013 int kind;
12014 void *data;
12015
12016 if (PyUnicode_READY(self) == -1)
12017 return NULL;
12018 length = PyUnicode_GET_LENGTH(self);
12019 kind = PyUnicode_KIND(self);
12020 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012021
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012022 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012023 if (length == 1)
12024 return PyBool_FromLong(
12025 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012026
12027 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012028 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012029 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012030
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012031 for (i = 0; i < length; i++) {
12032 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012033 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012034 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012035 Py_RETURN_TRUE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012036}
12037
INADA Naoki3ae20562017-01-16 20:41:20 +090012038/*[clinic input]
12039str.isalnum as unicode_isalnum
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012040
INADA Naoki3ae20562017-01-16 20:41:20 +090012041Return True if the string is an alpha-numeric string, False otherwise.
12042
12043A string is alpha-numeric if all characters in the string are alpha-numeric and
12044there is at least one character in the string.
12045[clinic start generated code]*/
12046
12047static PyObject *
12048unicode_isalnum_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012049/*[clinic end generated code: output=a5a23490ffc3660c input=5c6579bf2e04758c]*/
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012050{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012051 int kind;
12052 void *data;
12053 Py_ssize_t len, i;
12054
12055 if (PyUnicode_READY(self) == -1)
12056 return NULL;
12057
12058 kind = PyUnicode_KIND(self);
12059 data = PyUnicode_DATA(self);
12060 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012061
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012062 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012063 if (len == 1) {
12064 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
12065 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
12066 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012067
12068 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012069 if (len == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012070 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012071
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012072 for (i = 0; i < len; i++) {
12073 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030012074 if (!Py_UNICODE_ISALNUM(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012075 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012076 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012077 Py_RETURN_TRUE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012078}
12079
INADA Naoki3ae20562017-01-16 20:41:20 +090012080/*[clinic input]
12081str.isdecimal as unicode_isdecimal
Guido van Rossumd57fd912000-03-10 22:53:23 +000012082
INADA Naoki3ae20562017-01-16 20:41:20 +090012083Return True if the string is a decimal string, False otherwise.
12084
12085A string is a decimal string if all characters in the string are decimal and
12086there is at least one character in the string.
12087[clinic start generated code]*/
12088
12089static PyObject *
12090unicode_isdecimal_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012091/*[clinic end generated code: output=fb2dcdb62d3fc548 input=336bc97ab4c8268f]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012092{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012093 Py_ssize_t i, length;
12094 int kind;
12095 void *data;
12096
12097 if (PyUnicode_READY(self) == -1)
12098 return NULL;
12099 length = PyUnicode_GET_LENGTH(self);
12100 kind = PyUnicode_KIND(self);
12101 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012102
Guido van Rossumd57fd912000-03-10 22:53:23 +000012103 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012104 if (length == 1)
12105 return PyBool_FromLong(
12106 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012107
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012108 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012109 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012110 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012111
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012112 for (i = 0; i < length; i++) {
12113 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012114 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012115 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012116 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012117}
12118
INADA Naoki3ae20562017-01-16 20:41:20 +090012119/*[clinic input]
12120str.isdigit as unicode_isdigit
Guido van Rossumd57fd912000-03-10 22:53:23 +000012121
INADA Naoki3ae20562017-01-16 20:41:20 +090012122Return True if the string is a digit string, False otherwise.
12123
12124A string is a digit string if all characters in the string are digits and there
12125is at least one character in the string.
12126[clinic start generated code]*/
12127
12128static PyObject *
12129unicode_isdigit_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012130/*[clinic end generated code: output=10a6985311da6858 input=901116c31deeea4c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012131{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012132 Py_ssize_t i, length;
12133 int kind;
12134 void *data;
12135
12136 if (PyUnicode_READY(self) == -1)
12137 return NULL;
12138 length = PyUnicode_GET_LENGTH(self);
12139 kind = PyUnicode_KIND(self);
12140 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012141
Guido van Rossumd57fd912000-03-10 22:53:23 +000012142 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012143 if (length == 1) {
12144 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
12145 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
12146 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012147
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012148 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012149 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012150 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012151
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012152 for (i = 0; i < length; i++) {
12153 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012154 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012155 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012156 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012157}
12158
INADA Naoki3ae20562017-01-16 20:41:20 +090012159/*[clinic input]
12160str.isnumeric as unicode_isnumeric
Guido van Rossumd57fd912000-03-10 22:53:23 +000012161
INADA Naoki3ae20562017-01-16 20:41:20 +090012162Return True if the string is a numeric string, False otherwise.
12163
12164A string is numeric if all characters in the string are numeric and there is at
12165least one character in the string.
12166[clinic start generated code]*/
12167
12168static PyObject *
12169unicode_isnumeric_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012170/*[clinic end generated code: output=9172a32d9013051a input=722507db976f826c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012171{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012172 Py_ssize_t i, length;
12173 int kind;
12174 void *data;
12175
12176 if (PyUnicode_READY(self) == -1)
12177 return NULL;
12178 length = PyUnicode_GET_LENGTH(self);
12179 kind = PyUnicode_KIND(self);
12180 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012181
Guido van Rossumd57fd912000-03-10 22:53:23 +000012182 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012183 if (length == 1)
12184 return PyBool_FromLong(
12185 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012186
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012187 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012188 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012189 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012190
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012191 for (i = 0; i < length; i++) {
12192 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012193 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012194 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012195 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012196}
12197
Martin v. Löwis47383402007-08-15 07:32:56 +000012198int
12199PyUnicode_IsIdentifier(PyObject *self)
12200{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012201 Py_ssize_t i;
Victor Stinnerf3e7ea52020-02-11 14:29:33 +010012202 int ready = PyUnicode_IS_READY(self);
Martin v. Löwis47383402007-08-15 07:32:56 +000012203
Victor Stinnerf3e7ea52020-02-11 14:29:33 +010012204 Py_ssize_t len = ready ? PyUnicode_GET_LENGTH(self) : PyUnicode_GET_SIZE(self);
12205 if (len == 0) {
12206 /* an empty string is not a valid identifier */
Benjamin Peterson29060642009-01-31 22:14:21 +000012207 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012208 }
12209
Victor Stinnerf3e7ea52020-02-11 14:29:33 +010012210 int kind;
12211 void *data;
12212 wchar_t *wstr;
12213 if (ready) {
12214 kind = PyUnicode_KIND(self);
12215 data = PyUnicode_DATA(self);
12216 }
12217 else {
12218 wstr = _PyUnicode_WSTR(self);
12219 }
Martin v. Löwis47383402007-08-15 07:32:56 +000012220
Victor Stinnerf3e7ea52020-02-11 14:29:33 +010012221 Py_UCS4 ch;
12222 if (ready) {
12223 ch = PyUnicode_READ(kind, data, 0);
12224 }
12225 else {
12226 ch = wstr[0];
12227 }
Martin v. Löwis47383402007-08-15 07:32:56 +000012228 /* PEP 3131 says that the first character must be in
12229 XID_Start and subsequent characters in XID_Continue,
12230 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000012231 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000012232 letters, digits, underscore). However, given the current
12233 definition of XID_Start and XID_Continue, it is sufficient
12234 to check just for these, except that _ must be allowed
12235 as starting an identifier. */
Victor Stinnerf3e7ea52020-02-11 14:29:33 +010012236 if (!_PyUnicode_IsXidStart(ch) && ch != 0x5F /* LOW LINE */) {
Martin v. Löwis47383402007-08-15 07:32:56 +000012237 return 0;
Victor Stinnerf3e7ea52020-02-11 14:29:33 +010012238 }
Martin v. Löwis47383402007-08-15 07:32:56 +000012239
Victor Stinnerf3e7ea52020-02-11 14:29:33 +010012240 for (i = 1; i < len; i++) {
12241 if (ready) {
12242 ch = PyUnicode_READ(kind, data, i);
12243 }
12244 else {
12245 ch = wstr[i];
12246 }
12247 if (!_PyUnicode_IsXidContinue(ch)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012248 return 0;
Victor Stinnerf3e7ea52020-02-11 14:29:33 +010012249 }
12250 }
Martin v. Löwis47383402007-08-15 07:32:56 +000012251 return 1;
12252}
12253
INADA Naoki3ae20562017-01-16 20:41:20 +090012254/*[clinic input]
12255str.isidentifier as unicode_isidentifier
Martin v. Löwis47383402007-08-15 07:32:56 +000012256
INADA Naoki3ae20562017-01-16 20:41:20 +090012257Return True if the string is a valid Python identifier, False otherwise.
12258
Sanyam Khuranaffc5a142018-10-08 12:23:32 +053012259Call keyword.iskeyword(s) to test whether string s is a reserved identifier,
Emanuele Gaifasfc8205c2018-10-08 12:44:47 +020012260such as "def" or "class".
INADA Naoki3ae20562017-01-16 20:41:20 +090012261[clinic start generated code]*/
12262
12263static PyObject *
12264unicode_isidentifier_impl(PyObject *self)
Emanuele Gaifasfc8205c2018-10-08 12:44:47 +020012265/*[clinic end generated code: output=fe585a9666572905 input=2d807a104f21c0c5]*/
Martin v. Löwis47383402007-08-15 07:32:56 +000012266{
12267 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
12268}
12269
INADA Naoki3ae20562017-01-16 20:41:20 +090012270/*[clinic input]
12271str.isprintable as unicode_isprintable
Georg Brandl559e5d72008-06-11 18:37:52 +000012272
INADA Naoki3ae20562017-01-16 20:41:20 +090012273Return True if the string is printable, False otherwise.
12274
12275A string is printable if all of its characters are considered printable in
12276repr() or if it is empty.
12277[clinic start generated code]*/
12278
12279static PyObject *
12280unicode_isprintable_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012281/*[clinic end generated code: output=3ab9626cd32dd1a0 input=98a0e1c2c1813209]*/
Georg Brandl559e5d72008-06-11 18:37:52 +000012282{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012283 Py_ssize_t i, length;
12284 int kind;
12285 void *data;
12286
12287 if (PyUnicode_READY(self) == -1)
12288 return NULL;
12289 length = PyUnicode_GET_LENGTH(self);
12290 kind = PyUnicode_KIND(self);
12291 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000012292
12293 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012294 if (length == 1)
12295 return PyBool_FromLong(
12296 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000012297
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012298 for (i = 0; i < length; i++) {
12299 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000012300 Py_RETURN_FALSE;
12301 }
12302 }
12303 Py_RETURN_TRUE;
12304}
12305
INADA Naoki3ae20562017-01-16 20:41:20 +090012306/*[clinic input]
12307str.join as unicode_join
Guido van Rossumd57fd912000-03-10 22:53:23 +000012308
INADA Naoki3ae20562017-01-16 20:41:20 +090012309 iterable: object
12310 /
12311
12312Concatenate any number of strings.
12313
Martin Panter91a88662017-01-24 00:30:06 +000012314The string whose method is called is inserted in between each given string.
INADA Naoki3ae20562017-01-16 20:41:20 +090012315The result is returned as a new string.
12316
12317Example: '.'.join(['ab', 'pq', 'rs']) -> 'ab.pq.rs'
12318[clinic start generated code]*/
12319
12320static PyObject *
12321unicode_join(PyObject *self, PyObject *iterable)
Martin Panter91a88662017-01-24 00:30:06 +000012322/*[clinic end generated code: output=6857e7cecfe7bf98 input=2f70422bfb8fa189]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012323{
INADA Naoki3ae20562017-01-16 20:41:20 +090012324 return PyUnicode_Join(self, iterable);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012325}
12326
Martin v. Löwis18e16552006-02-15 17:27:45 +000012327static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012328unicode_length(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012329{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012330 if (PyUnicode_READY(self) == -1)
12331 return -1;
12332 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012333}
12334
INADA Naoki3ae20562017-01-16 20:41:20 +090012335/*[clinic input]
12336str.ljust as unicode_ljust
12337
12338 width: Py_ssize_t
12339 fillchar: Py_UCS4 = ' '
12340 /
12341
12342Return a left-justified string of length width.
12343
12344Padding is done using the specified fill character (default is a space).
12345[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012346
12347static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012348unicode_ljust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
12349/*[clinic end generated code: output=1cce0e0e0a0b84b3 input=3ab599e335e60a32]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012350{
Benjamin Petersonbac79492012-01-14 13:34:47 -050012351 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012352 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012353
Victor Stinnerc4b49542011-12-11 22:44:26 +010012354 if (PyUnicode_GET_LENGTH(self) >= width)
12355 return unicode_result_unchanged(self);
12356
12357 return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012358}
12359
INADA Naoki3ae20562017-01-16 20:41:20 +090012360/*[clinic input]
12361str.lower as unicode_lower
Guido van Rossumd57fd912000-03-10 22:53:23 +000012362
INADA Naoki3ae20562017-01-16 20:41:20 +090012363Return a copy of the string converted to lowercase.
12364[clinic start generated code]*/
12365
12366static PyObject *
12367unicode_lower_impl(PyObject *self)
12368/*[clinic end generated code: output=84ef9ed42efad663 input=60a2984b8beff23a]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012369{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050012370 if (PyUnicode_READY(self) == -1)
12371 return NULL;
12372 if (PyUnicode_IS_ASCII(self))
12373 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010012374 return case_operation(self, do_lower);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012375}
12376
/* Selector values for the strip helpers: which end(s) of the string to
   trim.  They double as indexes into stripfuncnames[] below, so the
   numeric values must stay in sync with that table. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Arrays indexed by above.  Both the strings and the pointer table are
   const so the table cannot be modified by accident. */
static const char * const stripfuncnames[] = {"lstrip", "rstrip", "strip"};

/* Method name used in error messages for strip selector `i`. */
#define STRIPNAME(i) (stripfuncnames[i])
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012385
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012386/* externally visible for str.strip(unicode) */
12387PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012388_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012389{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012390 void *data;
12391 int kind;
12392 Py_ssize_t i, j, len;
12393 BLOOM_MASK sepmask;
Victor Stinnerb3a60142013-04-09 22:19:21 +020012394 Py_ssize_t seplen;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012395
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012396 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
12397 return NULL;
12398
12399 kind = PyUnicode_KIND(self);
12400 data = PyUnicode_DATA(self);
12401 len = PyUnicode_GET_LENGTH(self);
Victor Stinnerb3a60142013-04-09 22:19:21 +020012402 seplen = PyUnicode_GET_LENGTH(sepobj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012403 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
12404 PyUnicode_DATA(sepobj),
Victor Stinnerb3a60142013-04-09 22:19:21 +020012405 seplen);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012406
Benjamin Peterson14339b62009-01-31 16:36:08 +000012407 i = 0;
12408 if (striptype != RIGHTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020012409 while (i < len) {
12410 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12411 if (!BLOOM(sepmask, ch))
12412 break;
12413 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12414 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000012415 i++;
12416 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012417 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012418
Benjamin Peterson14339b62009-01-31 16:36:08 +000012419 j = len;
12420 if (striptype != LEFTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020012421 j--;
12422 while (j >= i) {
12423 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12424 if (!BLOOM(sepmask, ch))
12425 break;
12426 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12427 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000012428 j--;
Victor Stinnerb3a60142013-04-09 22:19:21 +020012429 }
12430
Benjamin Peterson29060642009-01-31 22:14:21 +000012431 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012432 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012433
Victor Stinner7931d9a2011-11-04 00:22:48 +010012434 return PyUnicode_Substring(self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012435}
12436
12437PyObject*
12438PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
12439{
12440 unsigned char *data;
12441 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020012442 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012443
Victor Stinnerde636f32011-10-01 03:55:54 +020012444 if (PyUnicode_READY(self) == -1)
12445 return NULL;
12446
Victor Stinner684d5fd2012-05-03 02:32:34 +020012447 length = PyUnicode_GET_LENGTH(self);
12448 end = Py_MIN(end, length);
Victor Stinnerde636f32011-10-01 03:55:54 +020012449
Victor Stinner684d5fd2012-05-03 02:32:34 +020012450 if (start == 0 && end == length)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012451 return unicode_result_unchanged(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012452
Victor Stinnerde636f32011-10-01 03:55:54 +020012453 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020012454 PyErr_SetString(PyExc_IndexError, "string index out of range");
12455 return NULL;
12456 }
Serhiy Storchaka678db842013-01-26 12:16:36 +020012457 if (start >= length || end < start)
12458 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner12bab6d2011-10-01 01:53:49 +020012459
Victor Stinner684d5fd2012-05-03 02:32:34 +020012460 length = end - start;
Victor Stinnerb9275c12011-10-05 14:01:42 +020012461 if (PyUnicode_IS_ASCII(self)) {
Victor Stinnerb9275c12011-10-05 14:01:42 +020012462 data = PyUnicode_1BYTE_DATA(self);
Victor Stinnerd3f08822012-05-29 12:57:52 +020012463 return _PyUnicode_FromASCII((char*)(data + start), length);
Victor Stinnerb9275c12011-10-05 14:01:42 +020012464 }
12465 else {
12466 kind = PyUnicode_KIND(self);
12467 data = PyUnicode_1BYTE_DATA(self);
12468 return PyUnicode_FromKindAndData(kind,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012469 data + kind * start,
Victor Stinnerb9275c12011-10-05 14:01:42 +020012470 length);
12471 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012472}
Guido van Rossumd57fd912000-03-10 22:53:23 +000012473
12474static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012475do_strip(PyObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012476{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012477 Py_ssize_t len, i, j;
12478
12479 if (PyUnicode_READY(self) == -1)
12480 return NULL;
12481
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012482 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012483
Victor Stinnercc7af722013-04-09 22:39:24 +020012484 if (PyUnicode_IS_ASCII(self)) {
12485 Py_UCS1 *data = PyUnicode_1BYTE_DATA(self);
12486
12487 i = 0;
12488 if (striptype != RIGHTSTRIP) {
12489 while (i < len) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020012490 Py_UCS1 ch = data[i];
Victor Stinnercc7af722013-04-09 22:39:24 +020012491 if (!_Py_ascii_whitespace[ch])
12492 break;
12493 i++;
12494 }
12495 }
12496
12497 j = len;
12498 if (striptype != LEFTSTRIP) {
12499 j--;
12500 while (j >= i) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020012501 Py_UCS1 ch = data[j];
Victor Stinnercc7af722013-04-09 22:39:24 +020012502 if (!_Py_ascii_whitespace[ch])
12503 break;
12504 j--;
12505 }
12506 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012507 }
12508 }
Victor Stinnercc7af722013-04-09 22:39:24 +020012509 else {
12510 int kind = PyUnicode_KIND(self);
12511 void *data = PyUnicode_DATA(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012512
Victor Stinnercc7af722013-04-09 22:39:24 +020012513 i = 0;
12514 if (striptype != RIGHTSTRIP) {
12515 while (i < len) {
12516 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12517 if (!Py_UNICODE_ISSPACE(ch))
12518 break;
12519 i++;
12520 }
Victor Stinner9c79e412013-04-09 22:21:08 +020012521 }
Victor Stinnercc7af722013-04-09 22:39:24 +020012522
12523 j = len;
12524 if (striptype != LEFTSTRIP) {
12525 j--;
12526 while (j >= i) {
12527 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12528 if (!Py_UNICODE_ISSPACE(ch))
12529 break;
12530 j--;
12531 }
12532 j++;
12533 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012534 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012535
Victor Stinner7931d9a2011-11-04 00:22:48 +010012536 return PyUnicode_Substring(self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012537}
12538
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012539
12540static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012541do_argstrip(PyObject *self, int striptype, PyObject *sep)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012542{
Serhiy Storchaka279f4462019-09-14 12:24:05 +030012543 if (sep != Py_None) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012544 if (PyUnicode_Check(sep))
12545 return _PyUnicode_XStrip(self, striptype, sep);
12546 else {
12547 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000012548 "%s arg must be None or str",
12549 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000012550 return NULL;
12551 }
12552 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012553
Benjamin Peterson14339b62009-01-31 16:36:08 +000012554 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012555}
12556
12557
INADA Naoki3ae20562017-01-16 20:41:20 +090012558/*[clinic input]
12559str.strip as unicode_strip
12560
12561 chars: object = None
12562 /
12563
Zachary Ware09895c22019-10-09 16:09:00 -050012564Return a copy of the string with leading and trailing whitespace removed.
INADA Naoki3ae20562017-01-16 20:41:20 +090012565
12566If chars is given and not None, remove characters in chars instead.
12567[clinic start generated code]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012568
12569static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012570unicode_strip_impl(PyObject *self, PyObject *chars)
Zachary Ware09895c22019-10-09 16:09:00 -050012571/*[clinic end generated code: output=ca19018454345d57 input=385289c6f423b954]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012572{
INADA Naoki3ae20562017-01-16 20:41:20 +090012573 return do_argstrip(self, BOTHSTRIP, chars);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012574}
12575
12576
INADA Naoki3ae20562017-01-16 20:41:20 +090012577/*[clinic input]
12578str.lstrip as unicode_lstrip
12579
Serhiy Storchaka279f4462019-09-14 12:24:05 +030012580 chars: object = None
INADA Naoki3ae20562017-01-16 20:41:20 +090012581 /
12582
12583Return a copy of the string with leading whitespace removed.
12584
12585If chars is given and not None, remove characters in chars instead.
12586[clinic start generated code]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012587
12588static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012589unicode_lstrip_impl(PyObject *self, PyObject *chars)
Serhiy Storchaka279f4462019-09-14 12:24:05 +030012590/*[clinic end generated code: output=3b43683251f79ca7 input=529f9f3834448671]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012591{
INADA Naoki3ae20562017-01-16 20:41:20 +090012592 return do_argstrip(self, LEFTSTRIP, chars);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012593}
12594
12595
INADA Naoki3ae20562017-01-16 20:41:20 +090012596/*[clinic input]
12597str.rstrip as unicode_rstrip
12598
Serhiy Storchaka279f4462019-09-14 12:24:05 +030012599 chars: object = None
INADA Naoki3ae20562017-01-16 20:41:20 +090012600 /
12601
12602Return a copy of the string with trailing whitespace removed.
12603
12604If chars is given and not None, remove characters in chars instead.
12605[clinic start generated code]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012606
12607static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012608unicode_rstrip_impl(PyObject *self, PyObject *chars)
Serhiy Storchaka279f4462019-09-14 12:24:05 +030012609/*[clinic end generated code: output=4a59230017cc3b7a input=62566c627916557f]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012610{
INADA Naoki3ae20562017-01-16 20:41:20 +090012611 return do_argstrip(self, RIGHTSTRIP, chars);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012612}
12613
12614
Guido van Rossumd57fd912000-03-10 22:53:23 +000012615static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012616unicode_repeat(PyObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012617{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012618 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012619 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012620
Serhiy Storchaka05997252013-01-26 12:14:02 +020012621 if (len < 1)
12622 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +000012623
Victor Stinnerc4b49542011-12-11 22:44:26 +010012624 /* no repeat, return original string */
12625 if (len == 1)
12626 return unicode_result_unchanged(str);
Tim Peters8f422462000-09-09 06:13:41 +000012627
Benjamin Petersonbac79492012-01-14 13:34:47 -050012628 if (PyUnicode_READY(str) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012629 return NULL;
12630
Victor Stinnerc759f3e2011-10-01 03:09:58 +020012631 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020012632 PyErr_SetString(PyExc_OverflowError,
12633 "repeated string is too long");
12634 return NULL;
12635 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012636 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012637
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012638 u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012639 if (!u)
12640 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012641 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012642
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012643 if (PyUnicode_GET_LENGTH(str) == 1) {
12644 const int kind = PyUnicode_KIND(str);
12645 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
Victor Stinner73f53b52011-12-18 03:26:31 +010012646 if (kind == PyUnicode_1BYTE_KIND) {
12647 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012648 memset(to, (unsigned char)fill_char, len);
Victor Stinner73f53b52011-12-18 03:26:31 +010012649 }
12650 else if (kind == PyUnicode_2BYTE_KIND) {
12651 Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012652 for (n = 0; n < len; ++n)
Victor Stinner73f53b52011-12-18 03:26:31 +010012653 ucs2[n] = fill_char;
12654 } else {
12655 Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u);
12656 assert(kind == PyUnicode_4BYTE_KIND);
12657 for (n = 0; n < len; ++n)
12658 ucs4[n] = fill_char;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012659 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012660 }
12661 else {
12662 /* number of characters copied this far */
12663 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012664 const Py_ssize_t char_size = PyUnicode_KIND(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012665 char *to = (char *) PyUnicode_DATA(u);
Christian Heimesf051e432016-09-13 20:22:02 +020012666 memcpy(to, PyUnicode_DATA(str),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012667 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000012668 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012669 n = (done <= nchars-done) ? done : nchars-done;
Christian Heimesf051e432016-09-13 20:22:02 +020012670 memcpy(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012671 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000012672 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012673 }
12674
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012675 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012676 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012677}
12678
Alexander Belopolsky40018472011-02-26 01:02:56 +000012679PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012680PyUnicode_Replace(PyObject *str,
12681 PyObject *substr,
12682 PyObject *replstr,
Alexander Belopolsky40018472011-02-26 01:02:56 +000012683 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012684{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012685 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0 ||
12686 ensure_unicode(replstr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012687 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012688 return replace(str, substr, replstr, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012689}
12690
INADA Naoki3ae20562017-01-16 20:41:20 +090012691/*[clinic input]
12692str.replace as unicode_replace
Guido van Rossumd57fd912000-03-10 22:53:23 +000012693
INADA Naoki3ae20562017-01-16 20:41:20 +090012694 old: unicode
12695 new: unicode
12696 count: Py_ssize_t = -1
12697 Maximum number of occurrences to replace.
12698 -1 (the default value) means replace all occurrences.
12699 /
12700
12701Return a copy with all occurrences of substring old replaced by new.
12702
12703If the optional argument count is given, only the first count occurrences are
12704replaced.
12705[clinic start generated code]*/
12706
12707static PyObject *
12708unicode_replace_impl(PyObject *self, PyObject *old, PyObject *new,
12709 Py_ssize_t count)
12710/*[clinic end generated code: output=b63f1a8b5eebf448 input=147d12206276ebeb]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012711{
Benjamin Peterson22a29702012-01-02 09:00:30 -060012712 if (PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000012713 return NULL;
INADA Naoki3ae20562017-01-16 20:41:20 +090012714 return replace(self, old, new, count);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012715}
12716
Alexander Belopolsky40018472011-02-26 01:02:56 +000012717static PyObject *
12718unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012719{
Walter Dörwald79e913e2007-05-12 11:08:06 +000012720 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012721 Py_ssize_t isize;
12722 Py_ssize_t osize, squote, dquote, i, o;
12723 Py_UCS4 max, quote;
Victor Stinner55c08782013-04-14 18:45:39 +020012724 int ikind, okind, unchanged;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012725 void *idata, *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000012726
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012727 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000012728 return NULL;
12729
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012730 isize = PyUnicode_GET_LENGTH(unicode);
12731 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012732
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012733 /* Compute length of output, quote characters, and
12734 maximum character */
Victor Stinner55c08782013-04-14 18:45:39 +020012735 osize = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012736 max = 127;
12737 squote = dquote = 0;
12738 ikind = PyUnicode_KIND(unicode);
12739 for (i = 0; i < isize; i++) {
12740 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Benjamin Peterson736b8012014-09-29 23:02:15 -040012741 Py_ssize_t incr = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012742 switch (ch) {
Benjamin Peterson736b8012014-09-29 23:02:15 -040012743 case '\'': squote++; break;
12744 case '"': dquote++; break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012745 case '\\': case '\t': case '\r': case '\n':
Benjamin Peterson736b8012014-09-29 23:02:15 -040012746 incr = 2;
12747 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012748 default:
12749 /* Fast-path ASCII */
12750 if (ch < ' ' || ch == 0x7f)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012751 incr = 4; /* \xHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012752 else if (ch < 0x7f)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012753 ;
12754 else if (Py_UNICODE_ISPRINTABLE(ch))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012755 max = ch > max ? ch : max;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012756 else if (ch < 0x100)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012757 incr = 4; /* \xHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012758 else if (ch < 0x10000)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012759 incr = 6; /* \uHHHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012760 else
Benjamin Peterson736b8012014-09-29 23:02:15 -040012761 incr = 10; /* \uHHHHHHHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012762 }
Benjamin Peterson736b8012014-09-29 23:02:15 -040012763 if (osize > PY_SSIZE_T_MAX - incr) {
12764 PyErr_SetString(PyExc_OverflowError,
12765 "string is too long to generate repr");
12766 return NULL;
12767 }
12768 osize += incr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012769 }
12770
12771 quote = '\'';
Victor Stinner55c08782013-04-14 18:45:39 +020012772 unchanged = (osize == isize);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012773 if (squote) {
Victor Stinner55c08782013-04-14 18:45:39 +020012774 unchanged = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012775 if (dquote)
12776 /* Both squote and dquote present. Use squote,
12777 and escape them */
12778 osize += squote;
12779 else
12780 quote = '"';
12781 }
Victor Stinner55c08782013-04-14 18:45:39 +020012782 osize += 2; /* quotes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012783
12784 repr = PyUnicode_New(osize, max);
12785 if (repr == NULL)
12786 return NULL;
12787 okind = PyUnicode_KIND(repr);
12788 odata = PyUnicode_DATA(repr);
12789
12790 PyUnicode_WRITE(okind, odata, 0, quote);
12791 PyUnicode_WRITE(okind, odata, osize-1, quote);
Victor Stinner55c08782013-04-14 18:45:39 +020012792 if (unchanged) {
12793 _PyUnicode_FastCopyCharacters(repr, 1,
12794 unicode, 0,
12795 isize);
12796 }
12797 else {
12798 for (i = 0, o = 1; i < isize; i++) {
12799 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012800
Victor Stinner55c08782013-04-14 18:45:39 +020012801 /* Escape quotes and backslashes */
12802 if ((ch == quote) || (ch == '\\')) {
Kristján Valur Jónsson55e5dc82012-06-06 21:58:08 +000012803 PyUnicode_WRITE(okind, odata, o++, '\\');
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012804 PyUnicode_WRITE(okind, odata, o++, ch);
Victor Stinner55c08782013-04-14 18:45:39 +020012805 continue;
12806 }
12807
12808 /* Map special whitespace to '\t', \n', '\r' */
12809 if (ch == '\t') {
12810 PyUnicode_WRITE(okind, odata, o++, '\\');
12811 PyUnicode_WRITE(okind, odata, o++, 't');
12812 }
12813 else if (ch == '\n') {
12814 PyUnicode_WRITE(okind, odata, o++, '\\');
12815 PyUnicode_WRITE(okind, odata, o++, 'n');
12816 }
12817 else if (ch == '\r') {
12818 PyUnicode_WRITE(okind, odata, o++, '\\');
12819 PyUnicode_WRITE(okind, odata, o++, 'r');
12820 }
12821
12822 /* Map non-printable US ASCII to '\xhh' */
12823 else if (ch < ' ' || ch == 0x7F) {
12824 PyUnicode_WRITE(okind, odata, o++, '\\');
12825 PyUnicode_WRITE(okind, odata, o++, 'x');
12826 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12827 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12828 }
12829
12830 /* Copy ASCII characters as-is */
12831 else if (ch < 0x7F) {
12832 PyUnicode_WRITE(okind, odata, o++, ch);
12833 }
12834
12835 /* Non-ASCII characters */
12836 else {
12837 /* Map Unicode whitespace and control characters
12838 (categories Z* and C* except ASCII space)
12839 */
12840 if (!Py_UNICODE_ISPRINTABLE(ch)) {
12841 PyUnicode_WRITE(okind, odata, o++, '\\');
12842 /* Map 8-bit characters to '\xhh' */
12843 if (ch <= 0xff) {
12844 PyUnicode_WRITE(okind, odata, o++, 'x');
12845 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12846 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12847 }
12848 /* Map 16-bit characters to '\uxxxx' */
12849 else if (ch <= 0xffff) {
12850 PyUnicode_WRITE(okind, odata, o++, 'u');
12851 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12852 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12853 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12854 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12855 }
12856 /* Map 21-bit characters to '\U00xxxxxx' */
12857 else {
12858 PyUnicode_WRITE(okind, odata, o++, 'U');
12859 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
12860 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
12861 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
12862 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
12863 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12864 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12865 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12866 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12867 }
12868 }
12869 /* Copy characters as-is */
12870 else {
12871 PyUnicode_WRITE(okind, odata, o++, ch);
12872 }
Georg Brandl559e5d72008-06-11 18:37:52 +000012873 }
12874 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000012875 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012876 /* Closing quote already added at the beginning */
Victor Stinner05d11892011-10-06 01:13:58 +020012877 assert(_PyUnicode_CheckConsistency(repr, 1));
Walter Dörwald79e913e2007-05-12 11:08:06 +000012878 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012879}
12880
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012881PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012882 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012883\n\
12884Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080012885such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012886arguments start and end are interpreted as in slice notation.\n\
12887\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012888Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012889
12890static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012891unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012892{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010012893 /* initialize variables to prevent gcc warning */
12894 PyObject *substring = NULL;
12895 Py_ssize_t start = 0;
12896 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012897 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012898
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030012899 if (!parse_args_finds_unicode("rfind", args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012900 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012901
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012902 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012903 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012904
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012905 result = any_find_slice(self, substring, start, end, -1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012906
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012907 if (result == -2)
12908 return NULL;
12909
Christian Heimes217cfd12007-12-02 14:31:20 +000012910 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012911}
12912
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012913PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012914 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012915\n\
Lisa Roach43ba8862017-04-04 22:36:22 -070012916Return the highest index in S where substring sub is found,\n\
12917such that sub is contained within S[start:end]. Optional\n\
12918arguments start and end are interpreted as in slice notation.\n\
12919\n\
12920Raises ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012921
12922static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012923unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012924{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010012925 /* initialize variables to prevent gcc warning */
12926 PyObject *substring = NULL;
12927 Py_ssize_t start = 0;
12928 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012929 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012930
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030012931 if (!parse_args_finds_unicode("rindex", args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012932 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012933
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012934 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012935 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012936
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012937 result = any_find_slice(self, substring, start, end, -1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012938
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012939 if (result == -2)
12940 return NULL;
12941
Guido van Rossumd57fd912000-03-10 22:53:23 +000012942 if (result < 0) {
12943 PyErr_SetString(PyExc_ValueError, "substring not found");
12944 return NULL;
12945 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012946
Christian Heimes217cfd12007-12-02 14:31:20 +000012947 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012948}
12949
INADA Naoki3ae20562017-01-16 20:41:20 +090012950/*[clinic input]
12951str.rjust as unicode_rjust
12952
12953 width: Py_ssize_t
12954 fillchar: Py_UCS4 = ' '
12955 /
12956
12957Return a right-justified string of length width.
12958
12959Padding is done using the specified fill character (default is a space).
12960[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012961
12962static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012963unicode_rjust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
12964/*[clinic end generated code: output=804a1a57fbe8d5cf input=d05f550b5beb1f72]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012965{
Benjamin Petersonbac79492012-01-14 13:34:47 -050012966 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012967 return NULL;
12968
Victor Stinnerc4b49542011-12-11 22:44:26 +010012969 if (PyUnicode_GET_LENGTH(self) >= width)
12970 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012971
Victor Stinnerc4b49542011-12-11 22:44:26 +010012972 return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012973}
12974
Alexander Belopolsky40018472011-02-26 01:02:56 +000012975PyObject *
12976PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012977{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012978 if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012979 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012980
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012981 return split(s, sep, maxsplit);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012982}
12983
INADA Naoki3ae20562017-01-16 20:41:20 +090012984/*[clinic input]
12985str.split as unicode_split
Guido van Rossumd57fd912000-03-10 22:53:23 +000012986
INADA Naoki3ae20562017-01-16 20:41:20 +090012987 sep: object = None
12988 The delimiter according which to split the string.
12989 None (the default value) means split according to any whitespace,
12990 and discard empty strings from the result.
12991 maxsplit: Py_ssize_t = -1
12992 Maximum number of splits to do.
12993 -1 (the default value) means no limit.
12994
12995Return a list of the words in the string, using sep as the delimiter string.
12996[clinic start generated code]*/
12997
12998static PyObject *
12999unicode_split_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
13000/*[clinic end generated code: output=3a65b1db356948dc input=606e750488a82359]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013001{
INADA Naoki3ae20562017-01-16 20:41:20 +090013002 if (sep == Py_None)
13003 return split(self, NULL, maxsplit);
13004 if (PyUnicode_Check(sep))
13005 return split(self, sep, maxsplit);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013006
Victor Stinner998b8062018-09-12 00:23:25 +020013007 PyErr_Format(PyExc_TypeError,
13008 "must be str or None, not %.100s",
13009 Py_TYPE(sep)->tp_name);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013010 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013011}
13012
Thomas Wouters477c8d52006-05-27 19:21:47 +000013013PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013014PyUnicode_Partition(PyObject *str_obj, PyObject *sep_obj)
Thomas Wouters477c8d52006-05-27 19:21:47 +000013015{
Thomas Wouters477c8d52006-05-27 19:21:47 +000013016 PyObject* out;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013017 int kind1, kind2;
13018 void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013019 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000013020
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013021 if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013022 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000013023
Victor Stinner14f8f022011-10-05 20:58:25 +020013024 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013025 kind2 = PyUnicode_KIND(sep_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013026 len1 = PyUnicode_GET_LENGTH(str_obj);
13027 len2 = PyUnicode_GET_LENGTH(sep_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013028 if (kind1 < kind2 || len1 < len2) {
13029 _Py_INCREF_UNICODE_EMPTY();
13030 if (!unicode_empty)
13031 out = NULL;
13032 else {
13033 out = PyTuple_Pack(3, str_obj, unicode_empty, unicode_empty);
13034 Py_DECREF(unicode_empty);
13035 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013036 return out;
13037 }
13038 buf1 = PyUnicode_DATA(str_obj);
13039 buf2 = PyUnicode_DATA(sep_obj);
13040 if (kind2 != kind1) {
13041 buf2 = _PyUnicode_AsKind(sep_obj, kind1);
13042 if (!buf2)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013043 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013044 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013045
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013046 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013047 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020013048 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
13049 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
13050 else
13051 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013052 break;
13053 case PyUnicode_2BYTE_KIND:
13054 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
13055 break;
13056 case PyUnicode_4BYTE_KIND:
13057 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
13058 break;
13059 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070013060 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013061 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000013062
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013063 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013064 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013065
13066 return out;
13067}
13068
13069
13070PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013071PyUnicode_RPartition(PyObject *str_obj, PyObject *sep_obj)
Thomas Wouters477c8d52006-05-27 19:21:47 +000013072{
Thomas Wouters477c8d52006-05-27 19:21:47 +000013073 PyObject* out;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013074 int kind1, kind2;
13075 void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013076 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000013077
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013078 if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013079 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000013080
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013081 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013082 kind2 = PyUnicode_KIND(sep_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013083 len1 = PyUnicode_GET_LENGTH(str_obj);
13084 len2 = PyUnicode_GET_LENGTH(sep_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013085 if (kind1 < kind2 || len1 < len2) {
13086 _Py_INCREF_UNICODE_EMPTY();
13087 if (!unicode_empty)
13088 out = NULL;
13089 else {
13090 out = PyTuple_Pack(3, unicode_empty, unicode_empty, str_obj);
13091 Py_DECREF(unicode_empty);
13092 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013093 return out;
13094 }
13095 buf1 = PyUnicode_DATA(str_obj);
13096 buf2 = PyUnicode_DATA(sep_obj);
13097 if (kind2 != kind1) {
13098 buf2 = _PyUnicode_AsKind(sep_obj, kind1);
13099 if (!buf2)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013100 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013101 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013102
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013103 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013104 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020013105 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
13106 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13107 else
13108 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013109 break;
13110 case PyUnicode_2BYTE_KIND:
13111 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13112 break;
13113 case PyUnicode_4BYTE_KIND:
13114 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13115 break;
13116 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070013117 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013118 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000013119
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013120 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013121 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013122
13123 return out;
13124}
13125
INADA Naoki3ae20562017-01-16 20:41:20 +090013126/*[clinic input]
13127str.partition as unicode_partition
Thomas Wouters477c8d52006-05-27 19:21:47 +000013128
INADA Naoki3ae20562017-01-16 20:41:20 +090013129 sep: object
13130 /
13131
13132Partition the string into three parts using the given separator.
13133
13134This will search for the separator in the string. If the separator is found,
13135returns a 3-tuple containing the part before the separator, the separator
13136itself, and the part after it.
13137
13138If the separator is not found, returns a 3-tuple containing the original string
13139and two empty strings.
13140[clinic start generated code]*/
13141
13142static PyObject *
13143unicode_partition(PyObject *self, PyObject *sep)
13144/*[clinic end generated code: output=e4ced7bd253ca3c4 input=f29b8d06c63e50be]*/
Thomas Wouters477c8d52006-05-27 19:21:47 +000013145{
INADA Naoki3ae20562017-01-16 20:41:20 +090013146 return PyUnicode_Partition(self, sep);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013147}
13148
INADA Naoki3ae20562017-01-16 20:41:20 +090013149/*[clinic input]
13150str.rpartition as unicode_rpartition = str.partition
Thomas Wouters477c8d52006-05-27 19:21:47 +000013151
INADA Naoki3ae20562017-01-16 20:41:20 +090013152Partition the string into three parts using the given separator.
13153
Serhiy Storchakaa2314282017-10-29 02:11:54 +030013154This will search for the separator in the string, starting at the end. If
INADA Naoki3ae20562017-01-16 20:41:20 +090013155the separator is found, returns a 3-tuple containing the part before the
13156separator, the separator itself, and the part after it.
13157
13158If the separator is not found, returns a 3-tuple containing two empty strings
13159and the original string.
13160[clinic start generated code]*/
13161
13162static PyObject *
13163unicode_rpartition(PyObject *self, PyObject *sep)
Serhiy Storchakaa2314282017-10-29 02:11:54 +030013164/*[clinic end generated code: output=1aa13cf1156572aa input=c4b7db3ef5cf336a]*/
Thomas Wouters477c8d52006-05-27 19:21:47 +000013165{
INADA Naoki3ae20562017-01-16 20:41:20 +090013166 return PyUnicode_RPartition(self, sep);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013167}
13168
Alexander Belopolsky40018472011-02-26 01:02:56 +000013169PyObject *
13170PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013171{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013172 if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
Benjamin Peterson14339b62009-01-31 16:36:08 +000013173 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013174
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013175 return rsplit(s, sep, maxsplit);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013176}
13177
INADA Naoki3ae20562017-01-16 20:41:20 +090013178/*[clinic input]
13179str.rsplit as unicode_rsplit = str.split
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013180
INADA Naoki3ae20562017-01-16 20:41:20 +090013181Return a list of the words in the string, using sep as the delimiter string.
13182
13183Splits are done starting at the end of the string and working to the front.
13184[clinic start generated code]*/
13185
13186static PyObject *
13187unicode_rsplit_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
13188/*[clinic end generated code: output=c2b815c63bcabffc input=12ad4bf57dd35f15]*/
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013189{
INADA Naoki3ae20562017-01-16 20:41:20 +090013190 if (sep == Py_None)
13191 return rsplit(self, NULL, maxsplit);
13192 if (PyUnicode_Check(sep))
13193 return rsplit(self, sep, maxsplit);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013194
Victor Stinner998b8062018-09-12 00:23:25 +020013195 PyErr_Format(PyExc_TypeError,
13196 "must be str or None, not %.100s",
13197 Py_TYPE(sep)->tp_name);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013198 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013199}
13200
INADA Naoki3ae20562017-01-16 20:41:20 +090013201/*[clinic input]
13202str.splitlines as unicode_splitlines
Guido van Rossumd57fd912000-03-10 22:53:23 +000013203
Serhiy Storchaka202fda52017-03-12 10:10:47 +020013204 keepends: bool(accept={int}) = False
INADA Naoki3ae20562017-01-16 20:41:20 +090013205
13206Return a list of the lines in the string, breaking at line boundaries.
13207
13208Line breaks are not included in the resulting list unless keepends is given and
13209true.
13210[clinic start generated code]*/
13211
13212static PyObject *
13213unicode_splitlines_impl(PyObject *self, int keepends)
Serhiy Storchaka202fda52017-03-12 10:10:47 +020013214/*[clinic end generated code: output=f664dcdad153ec40 input=b508e180459bdd8b]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013215{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013216 return PyUnicode_Splitlines(self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013217}
13218
13219static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000013220PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013221{
Victor Stinnerc4b49542011-12-11 22:44:26 +010013222 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013223}
13224
INADA Naoki3ae20562017-01-16 20:41:20 +090013225/*[clinic input]
13226str.swapcase as unicode_swapcase
Guido van Rossumd57fd912000-03-10 22:53:23 +000013227
INADA Naoki3ae20562017-01-16 20:41:20 +090013228Convert uppercase characters to lowercase and lowercase characters to uppercase.
13229[clinic start generated code]*/
13230
13231static PyObject *
13232unicode_swapcase_impl(PyObject *self)
13233/*[clinic end generated code: output=5d28966bf6d7b2af input=3f3ef96d5798a7bb]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013234{
Benjamin Petersoneea48462012-01-16 14:28:50 -050013235 if (PyUnicode_READY(self) == -1)
13236 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010013237 return case_operation(self, do_swapcase);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013238}
13239
Larry Hastings61272b72014-01-07 12:41:53 -080013240/*[clinic input]
Georg Brandlceee0772007-11-27 23:48:05 +000013241
Larry Hastings31826802013-10-19 00:09:25 -070013242@staticmethod
13243str.maketrans as unicode_maketrans
13244
13245 x: object
13246
13247 y: unicode=NULL
13248
13249 z: unicode=NULL
13250
13251 /
13252
13253Return a translation table usable for str.translate().
13254
13255If there is only one argument, it must be a dictionary mapping Unicode
13256ordinals (integers) or characters to Unicode ordinals, strings or None.
13257Character keys will be then converted to ordinals.
13258If there are two arguments, they must be strings of equal length, and
13259in the resulting dictionary, each character in x will be mapped to the
13260character at the same position in y. If there is a third argument, it
13261must be a string, whose characters will be mapped to None in the result.
Larry Hastings61272b72014-01-07 12:41:53 -080013262[clinic start generated code]*/
Larry Hastings31826802013-10-19 00:09:25 -070013263
Larry Hastings31826802013-10-19 00:09:25 -070013264static PyObject *
Larry Hastings5c661892014-01-24 06:17:25 -080013265unicode_maketrans_impl(PyObject *x, PyObject *y, PyObject *z)
Serhiy Storchaka1009bf12015-04-03 23:53:51 +030013266/*[clinic end generated code: output=a925c89452bd5881 input=7bfbf529a293c6c5]*/
Larry Hastings31826802013-10-19 00:09:25 -070013267{
Georg Brandlceee0772007-11-27 23:48:05 +000013268 PyObject *new = NULL, *key, *value;
13269 Py_ssize_t i = 0;
13270 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013271
Georg Brandlceee0772007-11-27 23:48:05 +000013272 new = PyDict_New();
13273 if (!new)
13274 return NULL;
13275 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013276 int x_kind, y_kind, z_kind;
13277 void *x_data, *y_data, *z_data;
13278
Georg Brandlceee0772007-11-27 23:48:05 +000013279 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000013280 if (!PyUnicode_Check(x)) {
13281 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
13282 "be a string if there is a second argument");
13283 goto err;
13284 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013285 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013286 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
13287 "arguments must have equal length");
13288 goto err;
13289 }
13290 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013291 x_kind = PyUnicode_KIND(x);
13292 y_kind = PyUnicode_KIND(y);
13293 x_data = PyUnicode_DATA(x);
13294 y_data = PyUnicode_DATA(y);
13295 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
13296 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060013297 if (!key)
Georg Brandlceee0772007-11-27 23:48:05 +000013298 goto err;
Benjamin Peterson822c7902011-12-20 13:32:50 -060013299 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060013300 if (!value) {
13301 Py_DECREF(key);
13302 goto err;
13303 }
Georg Brandlceee0772007-11-27 23:48:05 +000013304 res = PyDict_SetItem(new, key, value);
13305 Py_DECREF(key);
13306 Py_DECREF(value);
13307 if (res < 0)
13308 goto err;
13309 }
13310 /* create entries for deleting chars in z */
13311 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013312 z_kind = PyUnicode_KIND(z);
13313 z_data = PyUnicode_DATA(z);
Victor Stinnerc4f281e2011-10-11 22:11:42 +020013314 for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013315 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000013316 if (!key)
13317 goto err;
13318 res = PyDict_SetItem(new, key, Py_None);
13319 Py_DECREF(key);
13320 if (res < 0)
13321 goto err;
13322 }
13323 }
13324 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013325 int kind;
13326 void *data;
13327
Georg Brandlceee0772007-11-27 23:48:05 +000013328 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000013329 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013330 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
13331 "to maketrans it must be a dict");
13332 goto err;
13333 }
13334 /* copy entries into the new dict, converting string keys to int keys */
13335 while (PyDict_Next(x, &i, &key, &value)) {
13336 if (PyUnicode_Check(key)) {
13337 /* convert string keys to integer keys */
13338 PyObject *newkey;
Victor Stinnerc4f281e2011-10-11 22:11:42 +020013339 if (PyUnicode_GET_LENGTH(key) != 1) {
Georg Brandlceee0772007-11-27 23:48:05 +000013340 PyErr_SetString(PyExc_ValueError, "string keys in translate "
13341 "table must be of length 1");
13342 goto err;
13343 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013344 kind = PyUnicode_KIND(key);
13345 data = PyUnicode_DATA(key);
13346 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000013347 if (!newkey)
13348 goto err;
13349 res = PyDict_SetItem(new, newkey, value);
13350 Py_DECREF(newkey);
13351 if (res < 0)
13352 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000013353 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013354 /* just keep integer keys */
13355 if (PyDict_SetItem(new, key, value) < 0)
13356 goto err;
13357 } else {
13358 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
13359 "be strings or integers");
13360 goto err;
13361 }
13362 }
13363 }
13364 return new;
13365 err:
13366 Py_DECREF(new);
13367 return NULL;
13368}
13369
INADA Naoki3ae20562017-01-16 20:41:20 +090013370/*[clinic input]
13371str.translate as unicode_translate
Guido van Rossumd57fd912000-03-10 22:53:23 +000013372
INADA Naoki3ae20562017-01-16 20:41:20 +090013373 table: object
13374 Translation table, which must be a mapping of Unicode ordinals to
13375 Unicode ordinals, strings, or None.
13376 /
13377
13378Replace each character in the string using the given translation table.
13379
13380The table must implement lookup/indexing via __getitem__, for instance a
13381dictionary or list. If this operation raises LookupError, the character is
13382left untouched. Characters mapped to None are deleted.
13383[clinic start generated code]*/
13384
13385static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013386unicode_translate(PyObject *self, PyObject *table)
INADA Naoki3ae20562017-01-16 20:41:20 +090013387/*[clinic end generated code: output=3cb448ff2fd96bf3 input=6d38343db63d8eb0]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013388{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013389 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013390}
13391
INADA Naoki3ae20562017-01-16 20:41:20 +090013392/*[clinic input]
13393str.upper as unicode_upper
Guido van Rossumd57fd912000-03-10 22:53:23 +000013394
INADA Naoki3ae20562017-01-16 20:41:20 +090013395Return a copy of the string converted to uppercase.
13396[clinic start generated code]*/
13397
13398static PyObject *
13399unicode_upper_impl(PyObject *self)
13400/*[clinic end generated code: output=1b7ddd16bbcdc092 input=db3d55682dfe2e6c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013401{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050013402 if (PyUnicode_READY(self) == -1)
13403 return NULL;
13404 if (PyUnicode_IS_ASCII(self))
13405 return ascii_upper_or_lower(self, 0);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010013406 return case_operation(self, do_upper);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013407}
13408
INADA Naoki3ae20562017-01-16 20:41:20 +090013409/*[clinic input]
13410str.zfill as unicode_zfill
13411
13412 width: Py_ssize_t
13413 /
13414
13415Pad a numeric string with zeros on the left, to fill a field of the given width.
13416
13417The string is never truncated.
13418[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013419
13420static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090013421unicode_zfill_impl(PyObject *self, Py_ssize_t width)
INADA Naoki15f94592017-01-16 21:49:13 +090013422/*[clinic end generated code: output=e13fb6bdf8e3b9df input=c6b2f772c6f27799]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013423{
Martin v. Löwis18e16552006-02-15 17:27:45 +000013424 Py_ssize_t fill;
Victor Stinner9310abb2011-10-05 00:59:23 +020013425 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013426 int kind;
13427 void *data;
13428 Py_UCS4 chr;
13429
Benjamin Petersonbac79492012-01-14 13:34:47 -050013430 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010013431 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013432
Victor Stinnerc4b49542011-12-11 22:44:26 +010013433 if (PyUnicode_GET_LENGTH(self) >= width)
13434 return unicode_result_unchanged(self);
13435
13436 fill = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013437
13438 u = pad(self, fill, 0, '0');
13439
Walter Dörwald068325e2002-04-15 13:36:47 +000013440 if (u == NULL)
13441 return NULL;
13442
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013443 kind = PyUnicode_KIND(u);
13444 data = PyUnicode_DATA(u);
13445 chr = PyUnicode_READ(kind, data, fill);
13446
13447 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000013448 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013449 PyUnicode_WRITE(kind, data, 0, chr);
13450 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000013451 }
13452
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013453 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +010013454 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013455}
Guido van Rossumd57fd912000-03-10 22:53:23 +000013456
#if 0
/* Compiled out: thin wrapper around
   PyUnicode_TransformDecimalAndSpaceToASCII(). */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *converted = PyUnicode_TransformDecimalAndSpaceToASCII(self);
    return converted;
}
#endif
13464
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013465PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013466 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013467\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013468Return True if S starts with the specified prefix, False otherwise.\n\
13469With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013470With optional end, stop comparing S at that position.\n\
13471prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013472
13473static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013474unicode_startswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013475 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013476{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013477 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013478 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013479 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013480 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013481 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013482
Jesus Ceaac451502011-04-20 17:09:23 +020013483 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013484 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013485 if (PyTuple_Check(subobj)) {
13486 Py_ssize_t i;
13487 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013488 substring = PyTuple_GET_ITEM(subobj, i);
13489 if (!PyUnicode_Check(substring)) {
13490 PyErr_Format(PyExc_TypeError,
13491 "tuple for startswith must only contain str, "
Victor Stinner998b8062018-09-12 00:23:25 +020013492 "not %.100s",
13493 Py_TYPE(substring)->tp_name);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013494 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013495 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013496 result = tailmatch(self, substring, start, end, -1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013497 if (result == -1)
13498 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013499 if (result) {
13500 Py_RETURN_TRUE;
13501 }
13502 }
13503 /* nothing matched */
13504 Py_RETURN_FALSE;
13505 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013506 if (!PyUnicode_Check(subobj)) {
13507 PyErr_Format(PyExc_TypeError,
13508 "startswith first arg must be str or "
Victor Stinner998b8062018-09-12 00:23:25 +020013509 "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013510 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013511 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013512 result = tailmatch(self, subobj, start, end, -1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013513 if (result == -1)
13514 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013515 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013516}
13517
13518
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013519PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013520 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013521\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013522Return True if S ends with the specified suffix, False otherwise.\n\
13523With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013524With optional end, stop comparing S at that position.\n\
13525suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013526
13527static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013528unicode_endswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013529 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013530{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013531 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013532 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013533 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013534 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013535 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013536
Jesus Ceaac451502011-04-20 17:09:23 +020013537 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013538 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013539 if (PyTuple_Check(subobj)) {
13540 Py_ssize_t i;
13541 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013542 substring = PyTuple_GET_ITEM(subobj, i);
13543 if (!PyUnicode_Check(substring)) {
13544 PyErr_Format(PyExc_TypeError,
13545 "tuple for endswith must only contain str, "
Victor Stinner998b8062018-09-12 00:23:25 +020013546 "not %.100s",
13547 Py_TYPE(substring)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013548 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013549 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013550 result = tailmatch(self, substring, start, end, +1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013551 if (result == -1)
13552 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013553 if (result) {
13554 Py_RETURN_TRUE;
13555 }
13556 }
13557 Py_RETURN_FALSE;
13558 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013559 if (!PyUnicode_Check(subobj)) {
13560 PyErr_Format(PyExc_TypeError,
13561 "endswith first arg must be str or "
Victor Stinner998b8062018-09-12 00:23:25 +020013562 "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013563 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013564 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013565 result = tailmatch(self, subobj, start, end, +1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013566 if (result == -1)
13567 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013568 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013569}
13570
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070013571static inline void
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013572_PyUnicodeWriter_Update(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013573{
Victor Stinnereb36fda2015-10-03 01:55:51 +020013574 writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer);
13575 writer->data = PyUnicode_DATA(writer->buffer);
13576
13577 if (!writer->readonly) {
13578 writer->kind = PyUnicode_KIND(writer->buffer);
Victor Stinner8f674cc2013-04-17 23:02:17 +020013579 writer->size = PyUnicode_GET_LENGTH(writer->buffer);
Victor Stinnereb36fda2015-10-03 01:55:51 +020013580 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013581 else {
Victor Stinnereb36fda2015-10-03 01:55:51 +020013582 /* use a value smaller than PyUnicode_1BYTE_KIND() so
13583 _PyUnicodeWriter_PrepareKind() will copy the buffer. */
13584 writer->kind = PyUnicode_WCHAR_KIND;
13585 assert(writer->kind <= PyUnicode_1BYTE_KIND);
13586
Victor Stinner8f674cc2013-04-17 23:02:17 +020013587 /* Copy-on-write mode: set buffer size to 0 so
13588 * _PyUnicodeWriter_Prepare() will copy (and enlarge) the buffer on
13589 * next write. */
13590 writer->size = 0;
13591 }
Victor Stinner202fdca2012-05-07 12:47:02 +020013592}
13593
Victor Stinnerd3f08822012-05-29 12:57:52 +020013594void
Victor Stinner8f674cc2013-04-17 23:02:17 +020013595_PyUnicodeWriter_Init(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013596{
Victor Stinnerd3f08822012-05-29 12:57:52 +020013597 memset(writer, 0, sizeof(*writer));
Victor Stinnereb36fda2015-10-03 01:55:51 +020013598
13599 /* ASCII is the bare minimum */
Victor Stinner8f674cc2013-04-17 23:02:17 +020013600 writer->min_char = 127;
Victor Stinnereb36fda2015-10-03 01:55:51 +020013601
13602 /* use a value smaller than PyUnicode_1BYTE_KIND() so
13603 _PyUnicodeWriter_PrepareKind() will copy the buffer. */
13604 writer->kind = PyUnicode_WCHAR_KIND;
13605 assert(writer->kind <= PyUnicode_1BYTE_KIND);
Victor Stinner202fdca2012-05-07 12:47:02 +020013606}
13607
Inada Naoki770847a2019-06-24 12:30:24 +090013608// Initialize _PyUnicodeWriter with initial buffer
13609static inline void
13610_PyUnicodeWriter_InitWithBuffer(_PyUnicodeWriter *writer, PyObject *buffer)
13611{
13612 memset(writer, 0, sizeof(*writer));
13613 writer->buffer = buffer;
13614 _PyUnicodeWriter_Update(writer);
13615 writer->min_length = writer->size;
13616}
13617
Victor Stinnerd3f08822012-05-29 12:57:52 +020013618int
13619_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
13620 Py_ssize_t length, Py_UCS4 maxchar)
Victor Stinner202fdca2012-05-07 12:47:02 +020013621{
13622 Py_ssize_t newlen;
13623 PyObject *newbuffer;
13624
Victor Stinner2740e462016-09-06 16:58:36 -070013625 assert(maxchar <= MAX_UNICODE);
13626
Victor Stinnerca9381e2015-09-22 00:58:32 +020013627 /* ensure that the _PyUnicodeWriter_Prepare macro was used */
Victor Stinner61744742015-09-22 01:01:17 +020013628 assert((maxchar > writer->maxchar && length >= 0)
13629 || length > 0);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013630
Victor Stinner202fdca2012-05-07 12:47:02 +020013631 if (length > PY_SSIZE_T_MAX - writer->pos) {
13632 PyErr_NoMemory();
13633 return -1;
13634 }
13635 newlen = writer->pos + length;
13636
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070013637 maxchar = Py_MAX(maxchar, writer->min_char);
Victor Stinner8f674cc2013-04-17 23:02:17 +020013638
Victor Stinnerd3f08822012-05-29 12:57:52 +020013639 if (writer->buffer == NULL) {
Victor Stinner8f674cc2013-04-17 23:02:17 +020013640 assert(!writer->readonly);
Victor Stinner6989ba02013-11-18 21:08:39 +010013641 if (writer->overallocate
13642 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13643 /* overallocate to limit the number of realloc() */
13644 newlen += newlen / OVERALLOCATE_FACTOR;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013645 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013646 if (newlen < writer->min_length)
13647 newlen = writer->min_length;
13648
Victor Stinnerd3f08822012-05-29 12:57:52 +020013649 writer->buffer = PyUnicode_New(newlen, maxchar);
13650 if (writer->buffer == NULL)
13651 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013652 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013653 else if (newlen > writer->size) {
Victor Stinner6989ba02013-11-18 21:08:39 +010013654 if (writer->overallocate
13655 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13656 /* overallocate to limit the number of realloc() */
13657 newlen += newlen / OVERALLOCATE_FACTOR;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013658 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013659 if (newlen < writer->min_length)
13660 newlen = writer->min_length;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013661
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013662 if (maxchar > writer->maxchar || writer->readonly) {
Victor Stinner202fdca2012-05-07 12:47:02 +020013663 /* resize + widen */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +030013664 maxchar = Py_MAX(maxchar, writer->maxchar);
Victor Stinner202fdca2012-05-07 12:47:02 +020013665 newbuffer = PyUnicode_New(newlen, maxchar);
13666 if (newbuffer == NULL)
13667 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013668 _PyUnicode_FastCopyCharacters(newbuffer, 0,
13669 writer->buffer, 0, writer->pos);
Victor Stinner202fdca2012-05-07 12:47:02 +020013670 Py_DECREF(writer->buffer);
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013671 writer->readonly = 0;
Victor Stinner202fdca2012-05-07 12:47:02 +020013672 }
13673 else {
13674 newbuffer = resize_compact(writer->buffer, newlen);
13675 if (newbuffer == NULL)
13676 return -1;
13677 }
13678 writer->buffer = newbuffer;
Victor Stinner202fdca2012-05-07 12:47:02 +020013679 }
13680 else if (maxchar > writer->maxchar) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013681 assert(!writer->readonly);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013682 newbuffer = PyUnicode_New(writer->size, maxchar);
13683 if (newbuffer == NULL)
Victor Stinner202fdca2012-05-07 12:47:02 +020013684 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013685 _PyUnicode_FastCopyCharacters(newbuffer, 0,
13686 writer->buffer, 0, writer->pos);
Serhiy Storchaka57a01d32016-04-10 18:05:40 +030013687 Py_SETREF(writer->buffer, newbuffer);
Victor Stinner202fdca2012-05-07 12:47:02 +020013688 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013689 _PyUnicodeWriter_Update(writer);
Victor Stinner202fdca2012-05-07 12:47:02 +020013690 return 0;
Victor Stinner6989ba02013-11-18 21:08:39 +010013691
13692#undef OVERALLOCATE_FACTOR
Victor Stinner202fdca2012-05-07 12:47:02 +020013693}
13694
Victor Stinnerca9381e2015-09-22 00:58:32 +020013695int
13696_PyUnicodeWriter_PrepareKindInternal(_PyUnicodeWriter *writer,
13697 enum PyUnicode_Kind kind)
13698{
13699 Py_UCS4 maxchar;
13700
13701 /* ensure that the _PyUnicodeWriter_PrepareKind macro was used */
13702 assert(writer->kind < kind);
13703
13704 switch (kind)
13705 {
13706 case PyUnicode_1BYTE_KIND: maxchar = 0xff; break;
13707 case PyUnicode_2BYTE_KIND: maxchar = 0xffff; break;
13708 case PyUnicode_4BYTE_KIND: maxchar = 0x10ffff; break;
13709 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070013710 Py_UNREACHABLE();
Victor Stinnerca9381e2015-09-22 00:58:32 +020013711 }
13712
13713 return _PyUnicodeWriter_PrepareInternal(writer, 0, maxchar);
13714}
13715
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070013716static inline int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020013717_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch)
Victor Stinnera0dd0212013-04-11 22:09:04 +020013718{
Victor Stinner2740e462016-09-06 16:58:36 -070013719 assert(ch <= MAX_UNICODE);
Victor Stinnera0dd0212013-04-11 22:09:04 +020013720 if (_PyUnicodeWriter_Prepare(writer, 1, ch) < 0)
13721 return -1;
13722 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, ch);
13723 writer->pos++;
13724 return 0;
13725}
13726
/* Append the character `ch` to the writer.

   Non-inline entry point that simply forwards to
   _PyUnicodeWriter_WriteCharInline() and returns its result
   (0 on success, -1 on error). */
int
_PyUnicodeWriter_WriteChar(_PyUnicodeWriter *writer, Py_UCS4 ch)
{
    return _PyUnicodeWriter_WriteCharInline(writer, ch);
}
13732
13733int
Victor Stinnerd3f08822012-05-29 12:57:52 +020013734_PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str)
13735{
13736 Py_UCS4 maxchar;
13737 Py_ssize_t len;
13738
13739 if (PyUnicode_READY(str) == -1)
13740 return -1;
13741 len = PyUnicode_GET_LENGTH(str);
13742 if (len == 0)
13743 return 0;
13744 maxchar = PyUnicode_MAX_CHAR_VALUE(str);
13745 if (maxchar > writer->maxchar || len > writer->size - writer->pos) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013746 if (writer->buffer == NULL && !writer->overallocate) {
Victor Stinner1912b392015-03-26 09:37:23 +010013747 assert(_PyUnicode_CheckConsistency(str, 1));
Victor Stinner8f674cc2013-04-17 23:02:17 +020013748 writer->readonly = 1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013749 Py_INCREF(str);
13750 writer->buffer = str;
13751 _PyUnicodeWriter_Update(writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013752 writer->pos += len;
13753 return 0;
13754 }
13755 if (_PyUnicodeWriter_PrepareInternal(writer, len, maxchar) == -1)
13756 return -1;
13757 }
13758 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13759 str, 0, len);
13760 writer->pos += len;
13761 return 0;
13762}
13763
Victor Stinnere215d962012-10-06 23:03:36 +020013764int
Victor Stinnercfc4c132013-04-03 01:48:39 +020013765_PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter *writer, PyObject *str,
13766 Py_ssize_t start, Py_ssize_t end)
13767{
13768 Py_UCS4 maxchar;
13769 Py_ssize_t len;
13770
13771 if (PyUnicode_READY(str) == -1)
13772 return -1;
13773
13774 assert(0 <= start);
13775 assert(end <= PyUnicode_GET_LENGTH(str));
13776 assert(start <= end);
13777
13778 if (end == 0)
13779 return 0;
13780
13781 if (start == 0 && end == PyUnicode_GET_LENGTH(str))
13782 return _PyUnicodeWriter_WriteStr(writer, str);
13783
13784 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
13785 maxchar = _PyUnicode_FindMaxChar(str, start, end);
13786 else
13787 maxchar = writer->maxchar;
13788 len = end - start;
13789
13790 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) < 0)
13791 return -1;
13792
13793 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13794 str, start, len);
13795 writer->pos += len;
13796 return 0;
13797}
13798
13799int
Victor Stinner4a587072013-11-19 12:54:53 +010013800_PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter *writer,
13801 const char *ascii, Py_ssize_t len)
13802{
13803 if (len == -1)
13804 len = strlen(ascii);
13805
13806 assert(ucs1lib_find_max_char((Py_UCS1*)ascii, (Py_UCS1*)ascii + len) < 128);
13807
13808 if (writer->buffer == NULL && !writer->overallocate) {
13809 PyObject *str;
13810
13811 str = _PyUnicode_FromASCII(ascii, len);
13812 if (str == NULL)
13813 return -1;
13814
13815 writer->readonly = 1;
13816 writer->buffer = str;
13817 _PyUnicodeWriter_Update(writer);
13818 writer->pos += len;
13819 return 0;
13820 }
13821
13822 if (_PyUnicodeWriter_Prepare(writer, len, 127) == -1)
13823 return -1;
13824
13825 switch (writer->kind)
13826 {
13827 case PyUnicode_1BYTE_KIND:
13828 {
13829 const Py_UCS1 *str = (const Py_UCS1 *)ascii;
13830 Py_UCS1 *data = writer->data;
13831
Christian Heimesf051e432016-09-13 20:22:02 +020013832 memcpy(data + writer->pos, str, len);
Victor Stinner4a587072013-11-19 12:54:53 +010013833 break;
13834 }
13835 case PyUnicode_2BYTE_KIND:
13836 {
13837 _PyUnicode_CONVERT_BYTES(
13838 Py_UCS1, Py_UCS2,
13839 ascii, ascii + len,
13840 (Py_UCS2 *)writer->data + writer->pos);
13841 break;
13842 }
13843 case PyUnicode_4BYTE_KIND:
13844 {
13845 _PyUnicode_CONVERT_BYTES(
13846 Py_UCS1, Py_UCS4,
13847 ascii, ascii + len,
13848 (Py_UCS4 *)writer->data + writer->pos);
13849 break;
13850 }
13851 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070013852 Py_UNREACHABLE();
Victor Stinner4a587072013-11-19 12:54:53 +010013853 }
13854
13855 writer->pos += len;
13856 return 0;
13857}
13858
13859int
13860_PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter *writer,
13861 const char *str, Py_ssize_t len)
Victor Stinnere215d962012-10-06 23:03:36 +020013862{
13863 Py_UCS4 maxchar;
13864
13865 maxchar = ucs1lib_find_max_char((Py_UCS1*)str, (Py_UCS1*)str + len);
13866 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) == -1)
13867 return -1;
13868 unicode_write_cstr(writer->buffer, writer->pos, str, len);
13869 writer->pos += len;
13870 return 0;
13871}
13872
Victor Stinnerd3f08822012-05-29 12:57:52 +020013873PyObject *
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013874_PyUnicodeWriter_Finish(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013875{
Victor Stinner15a0bd32013-07-08 22:29:55 +020013876 PyObject *str;
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030013877
Victor Stinnerd3f08822012-05-29 12:57:52 +020013878 if (writer->pos == 0) {
Victor Stinner9e6b4d72013-07-09 00:37:24 +020013879 Py_CLEAR(writer->buffer);
Serhiy Storchaka678db842013-01-26 12:16:36 +020013880 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3f08822012-05-29 12:57:52 +020013881 }
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030013882
13883 str = writer->buffer;
13884 writer->buffer = NULL;
13885
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013886 if (writer->readonly) {
Victor Stinner9e6b4d72013-07-09 00:37:24 +020013887 assert(PyUnicode_GET_LENGTH(str) == writer->pos);
13888 return str;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013889 }
Victor Stinner6c2cdae2015-10-12 13:29:43 +020013890
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030013891 if (PyUnicode_GET_LENGTH(str) != writer->pos) {
13892 PyObject *str2;
13893 str2 = resize_compact(str, writer->pos);
13894 if (str2 == NULL) {
13895 Py_DECREF(str);
13896 return NULL;
Victor Stinner6c2cdae2015-10-12 13:29:43 +020013897 }
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030013898 str = str2;
Victor Stinner6c2cdae2015-10-12 13:29:43 +020013899 }
13900
Victor Stinner15a0bd32013-07-08 22:29:55 +020013901 assert(_PyUnicode_CheckConsistency(str, 1));
13902 return unicode_result_ready(str);
Victor Stinner202fdca2012-05-07 12:47:02 +020013903}
13904
/* Release the writer's buffer, if any, and reset the pointer to NULL.
   Used on error paths instead of _PyUnicodeWriter_Finish(). */
void
_PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer)
{
    Py_CLEAR(writer->buffer);
}
13910
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013911#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000013912
/* Docstrings for the "format" and "format_map" entries of the
   unicode_methods table. */
PyDoc_STRVAR(format__doc__,
             "S.format(*args, **kwargs) -> str\n\
\n\
Return a formatted version of S, using substitutions from args and kwargs.\n\
The substitutions are identified by braces ('{' and '}').");

PyDoc_STRVAR(format_map__doc__,
             "S.format_map(mapping) -> str\n\
\n\
Return a formatted version of S, using substitutions from mapping.\n\
The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000013924
INADA Naoki3ae20562017-01-16 20:41:20 +090013925/*[clinic input]
13926str.__format__ as unicode___format__
13927
13928 format_spec: unicode
13929 /
13930
13931Return a formatted version of the string as described by format_spec.
13932[clinic start generated code]*/
13933
Eric Smith4a7d76d2008-05-30 18:10:19 +000013934static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090013935unicode___format___impl(PyObject *self, PyObject *format_spec)
INADA Naoki15f94592017-01-16 21:49:13 +090013936/*[clinic end generated code: output=45fceaca6d2ba4c8 input=5e135645d167a214]*/
Eric Smith4a7d76d2008-05-30 18:10:19 +000013937{
Victor Stinnerd3f08822012-05-29 12:57:52 +020013938 _PyUnicodeWriter writer;
13939 int ret;
Eric Smith4a7d76d2008-05-30 18:10:19 +000013940
Victor Stinnerd3f08822012-05-29 12:57:52 +020013941 if (PyUnicode_READY(self) == -1)
13942 return NULL;
Victor Stinner8f674cc2013-04-17 23:02:17 +020013943 _PyUnicodeWriter_Init(&writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013944 ret = _PyUnicode_FormatAdvancedWriter(&writer,
13945 self, format_spec, 0,
13946 PyUnicode_GET_LENGTH(format_spec));
13947 if (ret == -1) {
13948 _PyUnicodeWriter_Dealloc(&writer);
13949 return NULL;
13950 }
13951 return _PyUnicodeWriter_Finish(&writer);
Eric Smith4a7d76d2008-05-30 18:10:19 +000013952}
13953
INADA Naoki3ae20562017-01-16 20:41:20 +090013954/*[clinic input]
13955str.__sizeof__ as unicode_sizeof
13956
13957Return the size of the string in memory, in bytes.
13958[clinic start generated code]*/
Eric Smith8c663262007-08-25 02:26:07 +000013959
13960static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090013961unicode_sizeof_impl(PyObject *self)
13962/*[clinic end generated code: output=6dbc2f5a408b6d4f input=6dd011c108e33fb0]*/
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013963{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013964 Py_ssize_t size;
13965
13966 /* If it's a compact object, account for base structure +
13967 character data. */
INADA Naoki3ae20562017-01-16 20:41:20 +090013968 if (PyUnicode_IS_COMPACT_ASCII(self))
13969 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(self) + 1;
13970 else if (PyUnicode_IS_COMPACT(self))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013971 size = sizeof(PyCompactUnicodeObject) +
INADA Naoki3ae20562017-01-16 20:41:20 +090013972 (PyUnicode_GET_LENGTH(self) + 1) * PyUnicode_KIND(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013973 else {
13974 /* If it is a two-block object, account for base object, and
13975 for character block if present. */
13976 size = sizeof(PyUnicodeObject);
INADA Naoki3ae20562017-01-16 20:41:20 +090013977 if (_PyUnicode_DATA_ANY(self))
13978 size += (PyUnicode_GET_LENGTH(self) + 1) *
13979 PyUnicode_KIND(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013980 }
13981 /* If the wstr pointer is present, account for it unless it is shared
Victor Stinnera3be6132011-10-03 02:16:37 +020013982 with the data pointer. Check if the data is not shared. */
INADA Naoki3ae20562017-01-16 20:41:20 +090013983 if (_PyUnicode_HAS_WSTR_MEMORY(self))
13984 size += (PyUnicode_WSTR_LENGTH(self) + 1) * sizeof(wchar_t);
13985 if (_PyUnicode_HAS_UTF8_MEMORY(self))
13986 size += PyUnicode_UTF8_LENGTH(self) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013987
13988 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013989}
13990
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013991static PyObject *
Siddhesh Poyarekar55edd0c2018-04-30 00:29:33 +053013992unicode_getnewargs(PyObject *v, PyObject *Py_UNUSED(ignored))
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013993{
Victor Stinnerbf6e5602011-12-12 01:53:47 +010013994 PyObject *copy = _PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013995 if (!copy)
13996 return NULL;
13997 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013998}
13999
/* Method table of the str type.

   The UNICODE_*_METHODDEF names are macros defined outside this table
   (NOTE(review): presumably by the Argument Clinic generated header --
   confirm).  They are not followed by a comma here, so each one has to
   expand to a complete initializer entry including its trailing comma.
   The remaining entries are written out by hand. */
static PyMethodDef unicode_methods[] = {
    UNICODE_ENCODE_METHODDEF
    UNICODE_REPLACE_METHODDEF
    UNICODE_SPLIT_METHODDEF
    UNICODE_RSPLIT_METHODDEF
    UNICODE_JOIN_METHODDEF
    UNICODE_CAPITALIZE_METHODDEF
    UNICODE_CASEFOLD_METHODDEF
    UNICODE_TITLE_METHODDEF
    UNICODE_CENTER_METHODDEF
    {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
    UNICODE_EXPANDTABS_METHODDEF
    {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
    UNICODE_PARTITION_METHODDEF
    {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
    UNICODE_LJUST_METHODDEF
    UNICODE_LOWER_METHODDEF
    UNICODE_LSTRIP_METHODDEF
    {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
    {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
    UNICODE_RJUST_METHODDEF
    UNICODE_RSTRIP_METHODDEF
    UNICODE_RPARTITION_METHODDEF
    UNICODE_SPLITLINES_METHODDEF
    UNICODE_STRIP_METHODDEF
    UNICODE_SWAPCASE_METHODDEF
    UNICODE_TRANSLATE_METHODDEF
    UNICODE_UPPER_METHODDEF
    {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
    {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
    UNICODE_ISASCII_METHODDEF
    UNICODE_ISLOWER_METHODDEF
    UNICODE_ISUPPER_METHODDEF
    UNICODE_ISTITLE_METHODDEF
    UNICODE_ISSPACE_METHODDEF
    UNICODE_ISDECIMAL_METHODDEF
    UNICODE_ISDIGIT_METHODDEF
    UNICODE_ISNUMERIC_METHODDEF
    UNICODE_ISALPHA_METHODDEF
    UNICODE_ISALNUM_METHODDEF
    UNICODE_ISIDENTIFIER_METHODDEF
    UNICODE_ISPRINTABLE_METHODDEF
    UNICODE_ZFILL_METHODDEF
    /* do_string_format takes keyword arguments as well; the intermediate
       (void(*)(void)) cast is used to convert it to PyCFunction. */
    {"format", (PyCFunction)(void(*)(void)) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
    {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
    UNICODE___FORMAT___METHODDEF
    UNICODE_MAKETRANS_METHODDEF
    UNICODE_SIZEOF_METHODDEF
#if 0
    /* These methods are just used for debugging the implementation. */
    {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
#endif

    {"__getnewargs__", unicode_getnewargs, METH_NOARGS},
    {NULL, NULL}   /* sentinel: end of table */
};
14056
Neil Schemenauerce30bc92002-11-18 16:10:18 +000014057static PyObject *
14058unicode_mod(PyObject *v, PyObject *w)
14059{
Brian Curtindfc80e32011-08-10 20:28:54 -050014060 if (!PyUnicode_Check(v))
14061 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000014062 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000014063}
14064
14065static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014066 0, /*nb_add*/
14067 0, /*nb_subtract*/
14068 0, /*nb_multiply*/
14069 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +000014070};
14071
Guido van Rossumd57fd912000-03-10 22:53:23 +000014072static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014073 (lenfunc) unicode_length, /* sq_length */
14074 PyUnicode_Concat, /* sq_concat */
14075 (ssizeargfunc) unicode_repeat, /* sq_repeat */
14076 (ssizeargfunc) unicode_getitem, /* sq_item */
14077 0, /* sq_slice */
14078 0, /* sq_ass_item */
14079 0, /* sq_ass_slice */
14080 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +000014081};
14082
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014083static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014084unicode_subscript(PyObject* self, PyObject* item)
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014085{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014086 if (PyUnicode_READY(self) == -1)
14087 return NULL;
14088
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000014089 if (PyIndex_Check(item)) {
14090 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014091 if (i == -1 && PyErr_Occurred())
14092 return NULL;
14093 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014094 i += PyUnicode_GET_LENGTH(self);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014095 return unicode_getitem(self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014096 } else if (PySlice_Check(item)) {
Zackery Spytz14514d92019-05-17 01:13:03 -060014097 Py_ssize_t start, stop, step, slicelength, i;
14098 size_t cur;
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014099 PyObject *result;
14100 void *src_data, *dest_data;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014101 int src_kind, dest_kind;
Victor Stinnerc80d6d22011-10-05 14:13:28 +020014102 Py_UCS4 ch, max_char, kind_limit;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014103
Serhiy Storchakab879fe82017-04-08 09:53:51 +030014104 if (PySlice_Unpack(item, &start, &stop, &step) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014105 return NULL;
14106 }
Serhiy Storchakab879fe82017-04-08 09:53:51 +030014107 slicelength = PySlice_AdjustIndices(PyUnicode_GET_LENGTH(self),
14108 &start, &stop, step);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014109
14110 if (slicelength <= 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020014111 _Py_RETURN_UNICODE_EMPTY();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014112 } else if (start == 0 && step == 1 &&
Victor Stinnerc4b49542011-12-11 22:44:26 +010014113 slicelength == PyUnicode_GET_LENGTH(self)) {
14114 return unicode_result_unchanged(self);
Thomas Woutersed03b412007-08-28 21:37:11 +000014115 } else if (step == 1) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014116 return PyUnicode_Substring(self,
Victor Stinner12bab6d2011-10-01 01:53:49 +020014117 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014118 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014119 /* General case */
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014120 src_kind = PyUnicode_KIND(self);
14121 src_data = PyUnicode_DATA(self);
Victor Stinner55c99112011-10-13 01:17:06 +020014122 if (!PyUnicode_IS_ASCII(self)) {
14123 kind_limit = kind_maxchar_limit(src_kind);
14124 max_char = 0;
14125 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
14126 ch = PyUnicode_READ(src_kind, src_data, cur);
14127 if (ch > max_char) {
14128 max_char = ch;
14129 if (max_char >= kind_limit)
14130 break;
14131 }
Victor Stinnerc80d6d22011-10-05 14:13:28 +020014132 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014133 }
Victor Stinner55c99112011-10-13 01:17:06 +020014134 else
14135 max_char = 127;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014136 result = PyUnicode_New(slicelength, max_char);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014137 if (result == NULL)
14138 return NULL;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014139 dest_kind = PyUnicode_KIND(result);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014140 dest_data = PyUnicode_DATA(result);
14141
14142 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014143 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
14144 PyUnicode_WRITE(dest_kind, dest_data, i, ch);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014145 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020014146 assert(_PyUnicode_CheckConsistency(result, 1));
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014147 return result;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014148 } else {
14149 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
14150 return NULL;
14151 }
14152}
14153
14154static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014155 (lenfunc)unicode_length, /* mp_length */
14156 (binaryfunc)unicode_subscript, /* mp_subscript */
14157 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014158};
14159
Guido van Rossumd57fd912000-03-10 22:53:23 +000014160
Guido van Rossumd57fd912000-03-10 22:53:23 +000014161/* Helpers for PyUnicode_Format() */
14162
/* State shared by the PyUnicode_Format() helpers for one formatting
   operation. */
struct unicode_formatter_t {
    /* The values to format.  unicode_format_getnextarg() returns this
       object itself when arglen < 0, and otherwise item `argidx` of it
       via PyTuple_GetItem(). */
    PyObject *args;
    int args_owned;     /* presumably: nonzero if `args` must be released
                           by the formatter -- TODO confirm */
    /* arglen: number of arguments (negative when `args` is a single
       value); argidx: index of the next argument to consume. */
    Py_ssize_t arglen, argidx;
    PyObject *dict;     /* presumably the mapping used for "%(name)s"
                           lookups -- TODO confirm */

    /* Format string and the reading position inside it.
       NOTE(review): exact meaning of fmtcnt/fmtpos is not visible in
       this part of the file -- confirm against the parser. */
    enum PyUnicode_Kind fmtkind;
    Py_ssize_t fmtcnt, fmtpos;
    void *fmtdata;
    PyObject *fmtstr;

    _PyUnicodeWriter writer;    /* output being built */
};
14176
/* One parsed conversion specification of a format string. */
struct unicode_format_arg_t {
    Py_UCS4 ch;         /* conversion character; formatfloat() passes it
                           to PyOS_double_to_string() as the format code */
    int flags;          /* bitmask of F_* flags (formatfloat() tests F_ALT) */
    Py_ssize_t width;   /* presumably the minimum field width -- TODO confirm */
    int prec;           /* precision; a negative value makes formatfloat()
                           fall back to 6 */
    int sign;           /* NOTE(review): not used in the visible part of
                           the file -- confirm meaning */
};
14184
Guido van Rossumd57fd912000-03-10 22:53:23 +000014185static PyObject *
Victor Stinnera47082312012-10-04 02:19:54 +020014186unicode_format_getnextarg(struct unicode_formatter_t *ctx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014187{
Victor Stinnera47082312012-10-04 02:19:54 +020014188 Py_ssize_t argidx = ctx->argidx;
14189
14190 if (argidx < ctx->arglen) {
14191 ctx->argidx++;
14192 if (ctx->arglen < 0)
14193 return ctx->args;
Benjamin Peterson29060642009-01-31 22:14:21 +000014194 else
Victor Stinnera47082312012-10-04 02:19:54 +020014195 return PyTuple_GetItem(ctx->args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014196 }
14197 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000014198 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000014199 return NULL;
14200}
14201
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014202/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000014203
Victor Stinnera47082312012-10-04 02:19:54 +020014204/* Format a float into the writer if the writer is not NULL, or into *p_output
14205 otherwise.
14206
14207 Return 0 on success, raise an exception and return -1 on error. */
Victor Stinnerd3f08822012-05-29 12:57:52 +020014208static int
Victor Stinnera47082312012-10-04 02:19:54 +020014209formatfloat(PyObject *v, struct unicode_format_arg_t *arg,
14210 PyObject **p_output,
14211 _PyUnicodeWriter *writer)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014212{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014213 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014214 double x;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014215 Py_ssize_t len;
Victor Stinnera47082312012-10-04 02:19:54 +020014216 int prec;
14217 int dtoa_flags;
Tim Petersced69f82003-09-16 20:30:58 +000014218
Guido van Rossumd57fd912000-03-10 22:53:23 +000014219 x = PyFloat_AsDouble(v);
14220 if (x == -1.0 && PyErr_Occurred())
Victor Stinnerd3f08822012-05-29 12:57:52 +020014221 return -1;
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014222
Victor Stinnera47082312012-10-04 02:19:54 +020014223 prec = arg->prec;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014224 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000014225 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000014226
Victor Stinnera47082312012-10-04 02:19:54 +020014227 if (arg->flags & F_ALT)
14228 dtoa_flags = Py_DTSF_ALT;
14229 else
14230 dtoa_flags = 0;
14231 p = PyOS_double_to_string(x, arg->ch, prec, dtoa_flags, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014232 if (p == NULL)
Victor Stinnerd3f08822012-05-29 12:57:52 +020014233 return -1;
14234 len = strlen(p);
14235 if (writer) {
Victor Stinner4a587072013-11-19 12:54:53 +010014236 if (_PyUnicodeWriter_WriteASCIIString(writer, p, len) < 0) {
Christian Heimesf4f99392012-09-10 11:48:41 +020014237 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020014238 return -1;
Christian Heimesf4f99392012-09-10 11:48:41 +020014239 }
Victor Stinnerd3f08822012-05-29 12:57:52 +020014240 }
14241 else
14242 *p_output = _PyUnicode_FromASCII(p, len);
Eric Smith0923d1d2009-04-16 20:16:10 +000014243 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020014244 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014245}
14246
Victor Stinnerd0880d52012-04-27 23:40:13 +020014247/* formatlong() emulates the format codes d, u, o, x and X, and
14248 * the F_ALT flag, for Python's long (unbounded) ints. It's not used for
14249 * Python's regular ints.
14250 * Return value: a new PyUnicodeObject*, or NULL if error.
14251 * The output string is of the form
14252 * "-"? ("0x" | "0X")? digit+
14253 * "0x"/"0X" are present only for x and X conversions, with F_ALT
14254 * set in flags. The case of hex digits will be correct,
14255 * There will be at least prec digits, zero-filled on the left if
14256 * necessary to get that many.
14257 * val object to be converted
14258 * flags bitmask of format flags; only F_ALT is looked at
14259 * prec minimum number of digits; 0-fill on left if needed
14260 * type a character in [duoxX]; u acts the same as d
14261 *
14262 * CAUTION: o, x and X conversions on regular ints can never
14263 * produce a '-' sign, but can for Python's unbounded ints.
14264 */
Ethan Furmanb95b5612015-01-23 20:05:18 -080014265PyObject *
14266_PyUnicode_FormatLong(PyObject *val, int alt, int prec, int type)
Tim Peters38fd5b62000-09-21 05:43:11 +000014267{
Victor Stinnerd0880d52012-04-27 23:40:13 +020014268 PyObject *result = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014269 char *buf;
Victor Stinnerd0880d52012-04-27 23:40:13 +020014270 Py_ssize_t i;
14271 int sign; /* 1 if '-', else 0 */
14272 int len; /* number of characters */
14273 Py_ssize_t llen;
14274 int numdigits; /* len == numnondigits + numdigits */
14275 int numnondigits = 0;
Tim Peters38fd5b62000-09-21 05:43:11 +000014276
Victor Stinnerd0880d52012-04-27 23:40:13 +020014277 /* Avoid exceeding SSIZE_T_MAX */
14278 if (prec > INT_MAX-3) {
14279 PyErr_SetString(PyExc_OverflowError,
14280 "precision too large");
Benjamin Peterson14339b62009-01-31 16:36:08 +000014281 return NULL;
Victor Stinnerd0880d52012-04-27 23:40:13 +020014282 }
14283
14284 assert(PyLong_Check(val));
14285
14286 switch (type) {
Victor Stinner621ef3d2012-10-02 00:33:47 +020014287 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070014288 Py_UNREACHABLE();
Victor Stinnerd0880d52012-04-27 23:40:13 +020014289 case 'd':
Victor Stinner621ef3d2012-10-02 00:33:47 +020014290 case 'i':
Victor Stinnerd0880d52012-04-27 23:40:13 +020014291 case 'u':
Ethan Furmanfb137212013-08-31 10:18:55 -070014292 /* int and int subclasses should print numerically when a numeric */
14293 /* format code is used (see issue18780) */
14294 result = PyNumber_ToBase(val, 10);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014295 break;
14296 case 'o':
14297 numnondigits = 2;
14298 result = PyNumber_ToBase(val, 8);
14299 break;
14300 case 'x':
14301 case 'X':
14302 numnondigits = 2;
14303 result = PyNumber_ToBase(val, 16);
14304 break;
Victor Stinnerd0880d52012-04-27 23:40:13 +020014305 }
14306 if (!result)
14307 return NULL;
14308
14309 assert(unicode_modifiable(result));
14310 assert(PyUnicode_IS_READY(result));
14311 assert(PyUnicode_IS_ASCII(result));
14312
14313 /* To modify the string in-place, there can only be one reference. */
14314 if (Py_REFCNT(result) != 1) {
Christian Heimesd47802e2013-06-29 21:33:36 +020014315 Py_DECREF(result);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014316 PyErr_BadInternalCall();
14317 return NULL;
14318 }
14319 buf = PyUnicode_DATA(result);
14320 llen = PyUnicode_GET_LENGTH(result);
14321 if (llen > INT_MAX) {
Christian Heimesd47802e2013-06-29 21:33:36 +020014322 Py_DECREF(result);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014323 PyErr_SetString(PyExc_ValueError,
Ethan Furmanb95b5612015-01-23 20:05:18 -080014324 "string too large in _PyUnicode_FormatLong");
Victor Stinnerd0880d52012-04-27 23:40:13 +020014325 return NULL;
14326 }
14327 len = (int)llen;
14328 sign = buf[0] == '-';
14329 numnondigits += sign;
14330 numdigits = len - numnondigits;
14331 assert(numdigits > 0);
14332
14333 /* Get rid of base marker unless F_ALT */
Ethan Furmanb95b5612015-01-23 20:05:18 -080014334 if (((alt) == 0 &&
Victor Stinnerd0880d52012-04-27 23:40:13 +020014335 (type == 'o' || type == 'x' || type == 'X'))) {
14336 assert(buf[sign] == '0');
14337 assert(buf[sign+1] == 'x' || buf[sign+1] == 'X' ||
14338 buf[sign+1] == 'o');
14339 numnondigits -= 2;
14340 buf += 2;
14341 len -= 2;
14342 if (sign)
14343 buf[0] = '-';
14344 assert(len == numnondigits + numdigits);
14345 assert(numdigits > 0);
14346 }
14347
14348 /* Fill with leading zeroes to meet minimum width. */
14349 if (prec > numdigits) {
14350 PyObject *r1 = PyBytes_FromStringAndSize(NULL,
14351 numnondigits + prec);
14352 char *b1;
14353 if (!r1) {
14354 Py_DECREF(result);
14355 return NULL;
14356 }
14357 b1 = PyBytes_AS_STRING(r1);
14358 for (i = 0; i < numnondigits; ++i)
14359 *b1++ = *buf++;
14360 for (i = 0; i < prec - numdigits; i++)
14361 *b1++ = '0';
14362 for (i = 0; i < numdigits; i++)
14363 *b1++ = *buf++;
14364 *b1 = '\0';
14365 Py_DECREF(result);
14366 result = r1;
14367 buf = PyBytes_AS_STRING(result);
14368 len = numnondigits + prec;
14369 }
14370
14371 /* Fix up case for hex conversions. */
14372 if (type == 'X') {
14373 /* Need to convert all lower case letters to upper case.
14374 and need to convert 0x to 0X (and -0x to -0X). */
14375 for (i = 0; i < len; i++)
14376 if (buf[i] >= 'a' && buf[i] <= 'x')
14377 buf[i] -= 'a'-'A';
14378 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014379 if (!PyUnicode_Check(result)
14380 || buf != PyUnicode_DATA(result)) {
Victor Stinnerd0880d52012-04-27 23:40:13 +020014381 PyObject *unicode;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014382 unicode = _PyUnicode_FromASCII(buf, len);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014383 Py_DECREF(result);
14384 result = unicode;
14385 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014386 else if (len != PyUnicode_GET_LENGTH(result)) {
14387 if (PyUnicode_Resize(&result, len) < 0)
14388 Py_CLEAR(result);
14389 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000014390 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000014391}
14392
Ethan Furmandf3ed242014-01-05 06:50:30 -080014393/* Format an integer or a float as an integer.
Victor Stinner621ef3d2012-10-02 00:33:47 +020014394 * Return 1 if the number has been formatted into the writer,
Victor Stinnera47082312012-10-04 02:19:54 +020014395 * 0 if the number has been formatted into *p_output
Victor Stinner621ef3d2012-10-02 00:33:47 +020014396 * -1 and raise an exception on error */
14397static int
Victor Stinnera47082312012-10-04 02:19:54 +020014398mainformatlong(PyObject *v,
14399 struct unicode_format_arg_t *arg,
14400 PyObject **p_output,
14401 _PyUnicodeWriter *writer)
Victor Stinner621ef3d2012-10-02 00:33:47 +020014402{
14403 PyObject *iobj, *res;
Victor Stinnera47082312012-10-04 02:19:54 +020014404 char type = (char)arg->ch;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014405
14406 if (!PyNumber_Check(v))
14407 goto wrongtype;
14408
Ethan Furman9ab74802014-03-21 06:38:46 -070014409 /* make sure number is a type of integer for o, x, and X */
Victor Stinner621ef3d2012-10-02 00:33:47 +020014410 if (!PyLong_Check(v)) {
Ethan Furmandf3ed242014-01-05 06:50:30 -080014411 if (type == 'o' || type == 'x' || type == 'X') {
14412 iobj = PyNumber_Index(v);
14413 if (iobj == NULL) {
Ethan Furman9ab74802014-03-21 06:38:46 -070014414 if (PyErr_ExceptionMatches(PyExc_TypeError))
14415 goto wrongtype;
Ethan Furman38d872e2014-03-19 08:38:52 -070014416 return -1;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014417 }
14418 }
14419 else {
14420 iobj = PyNumber_Long(v);
14421 if (iobj == NULL ) {
14422 if (PyErr_ExceptionMatches(PyExc_TypeError))
14423 goto wrongtype;
14424 return -1;
14425 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014426 }
14427 assert(PyLong_Check(iobj));
14428 }
14429 else {
14430 iobj = v;
14431 Py_INCREF(iobj);
14432 }
14433
14434 if (PyLong_CheckExact(v)
Victor Stinnera47082312012-10-04 02:19:54 +020014435 && arg->width == -1 && arg->prec == -1
14436 && !(arg->flags & (F_SIGN | F_BLANK))
14437 && type != 'X')
Victor Stinner621ef3d2012-10-02 00:33:47 +020014438 {
14439 /* Fast path */
Victor Stinnera47082312012-10-04 02:19:54 +020014440 int alternate = arg->flags & F_ALT;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014441 int base;
14442
Victor Stinnera47082312012-10-04 02:19:54 +020014443 switch(type)
Victor Stinner621ef3d2012-10-02 00:33:47 +020014444 {
14445 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070014446 Py_UNREACHABLE();
Victor Stinner621ef3d2012-10-02 00:33:47 +020014447 case 'd':
14448 case 'i':
14449 case 'u':
14450 base = 10;
14451 break;
14452 case 'o':
14453 base = 8;
14454 break;
14455 case 'x':
14456 case 'X':
14457 base = 16;
14458 break;
14459 }
14460
Victor Stinnerc89d28f2012-10-02 12:54:07 +020014461 if (_PyLong_FormatWriter(writer, v, base, alternate) == -1) {
14462 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014463 return -1;
Victor Stinnerc89d28f2012-10-02 12:54:07 +020014464 }
14465 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014466 return 1;
14467 }
14468
Ethan Furmanb95b5612015-01-23 20:05:18 -080014469 res = _PyUnicode_FormatLong(iobj, arg->flags & F_ALT, arg->prec, type);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014470 Py_DECREF(iobj);
14471 if (res == NULL)
14472 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014473 *p_output = res;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014474 return 0;
14475
14476wrongtype:
Ethan Furman9ab74802014-03-21 06:38:46 -070014477 switch(type)
14478 {
14479 case 'o':
14480 case 'x':
14481 case 'X':
14482 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020014483 "%%%c format: an integer is required, "
14484 "not %.200s",
14485 type, Py_TYPE(v)->tp_name);
Ethan Furman9ab74802014-03-21 06:38:46 -070014486 break;
14487 default:
14488 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020014489 "%%%c format: a number is required, "
14490 "not %.200s",
14491 type, Py_TYPE(v)->tp_name);
Ethan Furman9ab74802014-03-21 06:38:46 -070014492 break;
14493 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014494 return -1;
14495}
14496
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014497static Py_UCS4
14498formatchar(PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014499{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000014500 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000014501 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014502 if (PyUnicode_GET_LENGTH(v) == 1) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014503 return PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000014504 }
Benjamin Peterson29060642009-01-31 22:14:21 +000014505 goto onError;
14506 }
14507 else {
Ethan Furmandf3ed242014-01-05 06:50:30 -080014508 PyObject *iobj;
Benjamin Peterson29060642009-01-31 22:14:21 +000014509 long x;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014510 /* make sure number is a type of integer */
14511 if (!PyLong_Check(v)) {
14512 iobj = PyNumber_Index(v);
14513 if (iobj == NULL) {
Ethan Furman38d872e2014-03-19 08:38:52 -070014514 goto onError;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014515 }
Xiang Zhangea1cf872016-12-22 15:30:47 +080014516 x = PyLong_AsLong(iobj);
Ethan Furmandf3ed242014-01-05 06:50:30 -080014517 Py_DECREF(iobj);
14518 }
Xiang Zhangea1cf872016-12-22 15:30:47 +080014519 else {
14520 x = PyLong_AsLong(v);
14521 }
Benjamin Peterson29060642009-01-31 22:14:21 +000014522 if (x == -1 && PyErr_Occurred())
14523 goto onError;
14524
Victor Stinner8faf8212011-12-08 22:14:11 +010014525 if (x < 0 || x > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014526 PyErr_SetString(PyExc_OverflowError,
14527 "%c arg not in range(0x110000)");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014528 return (Py_UCS4) -1;
Benjamin Peterson29060642009-01-31 22:14:21 +000014529 }
14530
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014531 return (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014532 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000014533
Benjamin Peterson29060642009-01-31 22:14:21 +000014534 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000014535 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000014536 "%c requires int or char");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014537 return (Py_UCS4) -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014538}
14539
Victor Stinnera47082312012-10-04 02:19:54 +020014540/* Parse options of an argument: flags, width, precision.
14541 Handle also "%(name)" syntax.
14542
14543 Return 0 if the argument has been formatted into arg->str.
14544 Return 1 if the argument has been written into ctx->writer,
14545 Raise an exception and return -1 on error. */
14546static int
14547unicode_format_arg_parse(struct unicode_formatter_t *ctx,
14548 struct unicode_format_arg_t *arg)
14549{
14550#define FORMAT_READ(ctx) \
14551 PyUnicode_READ((ctx)->fmtkind, (ctx)->fmtdata, (ctx)->fmtpos)
14552
14553 PyObject *v;
14554
Victor Stinnera47082312012-10-04 02:19:54 +020014555 if (arg->ch == '(') {
14556 /* Get argument value from a dictionary. Example: "%(name)s". */
14557 Py_ssize_t keystart;
14558 Py_ssize_t keylen;
14559 PyObject *key;
14560 int pcount = 1;
14561
14562 if (ctx->dict == NULL) {
14563 PyErr_SetString(PyExc_TypeError,
14564 "format requires a mapping");
14565 return -1;
14566 }
14567 ++ctx->fmtpos;
14568 --ctx->fmtcnt;
14569 keystart = ctx->fmtpos;
14570 /* Skip over balanced parentheses */
14571 while (pcount > 0 && --ctx->fmtcnt >= 0) {
14572 arg->ch = FORMAT_READ(ctx);
14573 if (arg->ch == ')')
14574 --pcount;
14575 else if (arg->ch == '(')
14576 ++pcount;
14577 ctx->fmtpos++;
14578 }
14579 keylen = ctx->fmtpos - keystart - 1;
14580 if (ctx->fmtcnt < 0 || pcount > 0) {
14581 PyErr_SetString(PyExc_ValueError,
14582 "incomplete format key");
14583 return -1;
14584 }
14585 key = PyUnicode_Substring(ctx->fmtstr,
14586 keystart, keystart + keylen);
14587 if (key == NULL)
14588 return -1;
14589 if (ctx->args_owned) {
Victor Stinnera47082312012-10-04 02:19:54 +020014590 ctx->args_owned = 0;
Serhiy Storchaka191321d2015-12-27 15:41:34 +020014591 Py_DECREF(ctx->args);
Victor Stinnera47082312012-10-04 02:19:54 +020014592 }
14593 ctx->args = PyObject_GetItem(ctx->dict, key);
14594 Py_DECREF(key);
14595 if (ctx->args == NULL)
14596 return -1;
14597 ctx->args_owned = 1;
14598 ctx->arglen = -1;
14599 ctx->argidx = -2;
14600 }
14601
14602 /* Parse flags. Example: "%+i" => flags=F_SIGN. */
Victor Stinnera47082312012-10-04 02:19:54 +020014603 while (--ctx->fmtcnt >= 0) {
14604 arg->ch = FORMAT_READ(ctx);
14605 ctx->fmtpos++;
14606 switch (arg->ch) {
14607 case '-': arg->flags |= F_LJUST; continue;
14608 case '+': arg->flags |= F_SIGN; continue;
14609 case ' ': arg->flags |= F_BLANK; continue;
14610 case '#': arg->flags |= F_ALT; continue;
14611 case '0': arg->flags |= F_ZERO; continue;
14612 }
14613 break;
14614 }
14615
14616 /* Parse width. Example: "%10s" => width=10 */
Victor Stinnera47082312012-10-04 02:19:54 +020014617 if (arg->ch == '*') {
14618 v = unicode_format_getnextarg(ctx);
14619 if (v == NULL)
14620 return -1;
14621 if (!PyLong_Check(v)) {
14622 PyErr_SetString(PyExc_TypeError,
14623 "* wants int");
14624 return -1;
14625 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020014626 arg->width = PyLong_AsSsize_t(v);
Victor Stinnera47082312012-10-04 02:19:54 +020014627 if (arg->width == -1 && PyErr_Occurred())
14628 return -1;
14629 if (arg->width < 0) {
14630 arg->flags |= F_LJUST;
14631 arg->width = -arg->width;
14632 }
14633 if (--ctx->fmtcnt >= 0) {
14634 arg->ch = FORMAT_READ(ctx);
14635 ctx->fmtpos++;
14636 }
14637 }
14638 else if (arg->ch >= '0' && arg->ch <= '9') {
14639 arg->width = arg->ch - '0';
14640 while (--ctx->fmtcnt >= 0) {
14641 arg->ch = FORMAT_READ(ctx);
14642 ctx->fmtpos++;
14643 if (arg->ch < '0' || arg->ch > '9')
14644 break;
14645 /* Since arg->ch is unsigned, the RHS would end up as unsigned,
14646 mixing signed and unsigned comparison. Since arg->ch is between
14647 '0' and '9', casting to int is safe. */
14648 if (arg->width > (PY_SSIZE_T_MAX - ((int)arg->ch - '0')) / 10) {
14649 PyErr_SetString(PyExc_ValueError,
14650 "width too big");
14651 return -1;
14652 }
14653 arg->width = arg->width*10 + (arg->ch - '0');
14654 }
14655 }
14656
14657 /* Parse precision. Example: "%.3f" => prec=3 */
Victor Stinnera47082312012-10-04 02:19:54 +020014658 if (arg->ch == '.') {
14659 arg->prec = 0;
14660 if (--ctx->fmtcnt >= 0) {
14661 arg->ch = FORMAT_READ(ctx);
14662 ctx->fmtpos++;
14663 }
14664 if (arg->ch == '*') {
14665 v = unicode_format_getnextarg(ctx);
14666 if (v == NULL)
14667 return -1;
14668 if (!PyLong_Check(v)) {
14669 PyErr_SetString(PyExc_TypeError,
14670 "* wants int");
14671 return -1;
14672 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020014673 arg->prec = _PyLong_AsInt(v);
Victor Stinnera47082312012-10-04 02:19:54 +020014674 if (arg->prec == -1 && PyErr_Occurred())
14675 return -1;
14676 if (arg->prec < 0)
14677 arg->prec = 0;
14678 if (--ctx->fmtcnt >= 0) {
14679 arg->ch = FORMAT_READ(ctx);
14680 ctx->fmtpos++;
14681 }
14682 }
14683 else if (arg->ch >= '0' && arg->ch <= '9') {
14684 arg->prec = arg->ch - '0';
14685 while (--ctx->fmtcnt >= 0) {
14686 arg->ch = FORMAT_READ(ctx);
14687 ctx->fmtpos++;
14688 if (arg->ch < '0' || arg->ch > '9')
14689 break;
14690 if (arg->prec > (INT_MAX - ((int)arg->ch - '0')) / 10) {
14691 PyErr_SetString(PyExc_ValueError,
Victor Stinner3921e902012-10-06 23:05:00 +020014692 "precision too big");
Victor Stinnera47082312012-10-04 02:19:54 +020014693 return -1;
14694 }
14695 arg->prec = arg->prec*10 + (arg->ch - '0');
14696 }
14697 }
14698 }
14699
14700 /* Ignore "h", "l" and "L" format prefix (ex: "%hi" or "%ls") */
14701 if (ctx->fmtcnt >= 0) {
14702 if (arg->ch == 'h' || arg->ch == 'l' || arg->ch == 'L') {
14703 if (--ctx->fmtcnt >= 0) {
14704 arg->ch = FORMAT_READ(ctx);
14705 ctx->fmtpos++;
14706 }
14707 }
14708 }
14709 if (ctx->fmtcnt < 0) {
14710 PyErr_SetString(PyExc_ValueError,
14711 "incomplete format");
14712 return -1;
14713 }
14714 return 0;
14715
14716#undef FORMAT_READ
14717}
14718
14719/* Format one argument. Supported conversion specifiers:
14720
14721 - "s", "r", "a": any type
Ethan Furmandf3ed242014-01-05 06:50:30 -080014722 - "i", "d", "u": int or float
14723 - "o", "x", "X": int
Victor Stinnera47082312012-10-04 02:19:54 +020014724 - "e", "E", "f", "F", "g", "G": float
14725 - "c": int or str (1 character)
14726
Victor Stinner8dbd4212012-12-04 09:30:24 +010014727 When possible, the output is written directly into the Unicode writer
14728 (ctx->writer). A string is created when padding is required.
14729
Victor Stinnera47082312012-10-04 02:19:54 +020014730 Return 0 if the argument has been formatted into *p_str,
14731 1 if the argument has been written into ctx->writer,
Victor Stinner8dbd4212012-12-04 09:30:24 +010014732 -1 on error. */
Victor Stinnera47082312012-10-04 02:19:54 +020014733static int
14734unicode_format_arg_format(struct unicode_formatter_t *ctx,
14735 struct unicode_format_arg_t *arg,
14736 PyObject **p_str)
14737{
14738 PyObject *v;
14739 _PyUnicodeWriter *writer = &ctx->writer;
14740
14741 if (ctx->fmtcnt == 0)
14742 ctx->writer.overallocate = 0;
14743
Victor Stinnera47082312012-10-04 02:19:54 +020014744 v = unicode_format_getnextarg(ctx);
14745 if (v == NULL)
14746 return -1;
14747
Victor Stinnera47082312012-10-04 02:19:54 +020014748
14749 switch (arg->ch) {
Victor Stinnera47082312012-10-04 02:19:54 +020014750 case 's':
14751 case 'r':
14752 case 'a':
14753 if (PyLong_CheckExact(v) && arg->width == -1 && arg->prec == -1) {
14754 /* Fast path */
14755 if (_PyLong_FormatWriter(writer, v, 10, arg->flags & F_ALT) == -1)
14756 return -1;
14757 return 1;
14758 }
14759
14760 if (PyUnicode_CheckExact(v) && arg->ch == 's') {
14761 *p_str = v;
14762 Py_INCREF(*p_str);
14763 }
14764 else {
14765 if (arg->ch == 's')
14766 *p_str = PyObject_Str(v);
14767 else if (arg->ch == 'r')
14768 *p_str = PyObject_Repr(v);
14769 else
14770 *p_str = PyObject_ASCII(v);
14771 }
14772 break;
14773
14774 case 'i':
14775 case 'd':
14776 case 'u':
14777 case 'o':
14778 case 'x':
14779 case 'X':
14780 {
14781 int ret = mainformatlong(v, arg, p_str, writer);
14782 if (ret != 0)
14783 return ret;
14784 arg->sign = 1;
14785 break;
14786 }
14787
14788 case 'e':
14789 case 'E':
14790 case 'f':
14791 case 'F':
14792 case 'g':
14793 case 'G':
14794 if (arg->width == -1 && arg->prec == -1
14795 && !(arg->flags & (F_SIGN | F_BLANK)))
14796 {
14797 /* Fast path */
14798 if (formatfloat(v, arg, NULL, writer) == -1)
14799 return -1;
14800 return 1;
14801 }
14802
14803 arg->sign = 1;
14804 if (formatfloat(v, arg, p_str, NULL) == -1)
14805 return -1;
14806 break;
14807
14808 case 'c':
14809 {
14810 Py_UCS4 ch = formatchar(v);
14811 if (ch == (Py_UCS4) -1)
14812 return -1;
14813 if (arg->width == -1 && arg->prec == -1) {
14814 /* Fast path */
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020014815 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0)
Victor Stinnera47082312012-10-04 02:19:54 +020014816 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014817 return 1;
14818 }
14819 *p_str = PyUnicode_FromOrdinal(ch);
14820 break;
14821 }
14822
14823 default:
14824 PyErr_Format(PyExc_ValueError,
14825 "unsupported format character '%c' (0x%x) "
Victor Stinnera33bce02014-07-04 22:47:46 +020014826 "at index %zd",
Victor Stinnera47082312012-10-04 02:19:54 +020014827 (31<=arg->ch && arg->ch<=126) ? (char)arg->ch : '?',
14828 (int)arg->ch,
14829 ctx->fmtpos - 1);
14830 return -1;
14831 }
14832 if (*p_str == NULL)
14833 return -1;
14834 assert (PyUnicode_Check(*p_str));
14835 return 0;
14836}
14837
14838static int
14839unicode_format_arg_output(struct unicode_formatter_t *ctx,
14840 struct unicode_format_arg_t *arg,
14841 PyObject *str)
14842{
14843 Py_ssize_t len;
14844 enum PyUnicode_Kind kind;
14845 void *pbuf;
14846 Py_ssize_t pindex;
14847 Py_UCS4 signchar;
14848 Py_ssize_t buflen;
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014849 Py_UCS4 maxchar;
Victor Stinnera47082312012-10-04 02:19:54 +020014850 Py_ssize_t sublen;
14851 _PyUnicodeWriter *writer = &ctx->writer;
14852 Py_UCS4 fill;
14853
14854 fill = ' ';
14855 if (arg->sign && arg->flags & F_ZERO)
14856 fill = '0';
14857
14858 if (PyUnicode_READY(str) == -1)
14859 return -1;
14860
14861 len = PyUnicode_GET_LENGTH(str);
14862 if ((arg->width == -1 || arg->width <= len)
14863 && (arg->prec == -1 || arg->prec >= len)
14864 && !(arg->flags & (F_SIGN | F_BLANK)))
14865 {
14866 /* Fast path */
14867 if (_PyUnicodeWriter_WriteStr(writer, str) == -1)
14868 return -1;
14869 return 0;
14870 }
14871
14872 /* Truncate the string for "s", "r" and "a" formats
14873 if the precision is set */
14874 if (arg->ch == 's' || arg->ch == 'r' || arg->ch == 'a') {
14875 if (arg->prec >= 0 && len > arg->prec)
14876 len = arg->prec;
14877 }
14878
14879 /* Adjust sign and width */
14880 kind = PyUnicode_KIND(str);
14881 pbuf = PyUnicode_DATA(str);
14882 pindex = 0;
14883 signchar = '\0';
14884 if (arg->sign) {
14885 Py_UCS4 ch = PyUnicode_READ(kind, pbuf, pindex);
14886 if (ch == '-' || ch == '+') {
14887 signchar = ch;
14888 len--;
14889 pindex++;
14890 }
14891 else if (arg->flags & F_SIGN)
14892 signchar = '+';
14893 else if (arg->flags & F_BLANK)
14894 signchar = ' ';
14895 else
14896 arg->sign = 0;
14897 }
14898 if (arg->width < len)
14899 arg->width = len;
14900
14901 /* Prepare the writer */
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014902 maxchar = writer->maxchar;
Victor Stinnera47082312012-10-04 02:19:54 +020014903 if (!(arg->flags & F_LJUST)) {
14904 if (arg->sign) {
14905 if ((arg->width-1) > len)
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014906 maxchar = Py_MAX(maxchar, fill);
Victor Stinnera47082312012-10-04 02:19:54 +020014907 }
14908 else {
14909 if (arg->width > len)
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014910 maxchar = Py_MAX(maxchar, fill);
Victor Stinnera47082312012-10-04 02:19:54 +020014911 }
14912 }
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014913 if (PyUnicode_MAX_CHAR_VALUE(str) > maxchar) {
14914 Py_UCS4 strmaxchar = _PyUnicode_FindMaxChar(str, 0, pindex+len);
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014915 maxchar = Py_MAX(maxchar, strmaxchar);
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014916 }
14917
Victor Stinnera47082312012-10-04 02:19:54 +020014918 buflen = arg->width;
14919 if (arg->sign && len == arg->width)
14920 buflen++;
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014921 if (_PyUnicodeWriter_Prepare(writer, buflen, maxchar) == -1)
Victor Stinnera47082312012-10-04 02:19:54 +020014922 return -1;
14923
14924 /* Write the sign if needed */
14925 if (arg->sign) {
14926 if (fill != ' ') {
14927 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
14928 writer->pos += 1;
14929 }
14930 if (arg->width > len)
14931 arg->width--;
14932 }
14933
14934 /* Write the numeric prefix for "x", "X" and "o" formats
14935 if the alternate form is used.
14936 For example, write "0x" for the "%#x" format. */
14937 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
14938 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
14939 assert(PyUnicode_READ(kind, pbuf, pindex + 1) == arg->ch);
14940 if (fill != ' ') {
14941 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
14942 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
14943 writer->pos += 2;
14944 pindex += 2;
14945 }
14946 arg->width -= 2;
14947 if (arg->width < 0)
14948 arg->width = 0;
14949 len -= 2;
14950 }
14951
14952 /* Pad left with the fill character if needed */
14953 if (arg->width > len && !(arg->flags & F_LJUST)) {
14954 sublen = arg->width - len;
Victor Stinner59423e32018-11-26 13:40:01 +010014955 unicode_fill(writer->kind, writer->data, fill, writer->pos, sublen);
Victor Stinnera47082312012-10-04 02:19:54 +020014956 writer->pos += sublen;
14957 arg->width = len;
14958 }
14959
14960 /* If padding with spaces: write sign if needed and/or numeric prefix if
14961 the alternate form is used */
14962 if (fill == ' ') {
14963 if (arg->sign) {
14964 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
14965 writer->pos += 1;
14966 }
14967 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
14968 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
14969 assert(PyUnicode_READ(kind, pbuf, pindex+1) == arg->ch);
14970 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
14971 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
14972 writer->pos += 2;
14973 pindex += 2;
14974 }
14975 }
14976
14977 /* Write characters */
14978 if (len) {
14979 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
14980 str, pindex, len);
14981 writer->pos += len;
14982 }
14983
14984 /* Pad right with the fill character if needed */
14985 if (arg->width > len) {
14986 sublen = arg->width - len;
Victor Stinner59423e32018-11-26 13:40:01 +010014987 unicode_fill(writer->kind, writer->data, ' ', writer->pos, sublen);
Victor Stinnera47082312012-10-04 02:19:54 +020014988 writer->pos += sublen;
14989 }
14990 return 0;
14991}
14992
14993/* Helper of PyUnicode_Format(): format one arg.
14994 Return 0 on success, raise an exception and return -1 on error. */
14995static int
14996unicode_format_arg(struct unicode_formatter_t *ctx)
14997{
14998 struct unicode_format_arg_t arg;
14999 PyObject *str;
15000 int ret;
15001
Victor Stinner8dbd4212012-12-04 09:30:24 +010015002 arg.ch = PyUnicode_READ(ctx->fmtkind, ctx->fmtdata, ctx->fmtpos);
Serhiy Storchaka9f8ad3f2017-03-08 05:51:19 +020015003 if (arg.ch == '%') {
15004 ctx->fmtpos++;
15005 ctx->fmtcnt--;
15006 if (_PyUnicodeWriter_WriteCharInline(&ctx->writer, '%') < 0)
15007 return -1;
15008 return 0;
15009 }
Victor Stinner8dbd4212012-12-04 09:30:24 +010015010 arg.flags = 0;
15011 arg.width = -1;
15012 arg.prec = -1;
15013 arg.sign = 0;
15014 str = NULL;
15015
Victor Stinnera47082312012-10-04 02:19:54 +020015016 ret = unicode_format_arg_parse(ctx, &arg);
15017 if (ret == -1)
15018 return -1;
15019
15020 ret = unicode_format_arg_format(ctx, &arg, &str);
15021 if (ret == -1)
15022 return -1;
15023
15024 if (ret != 1) {
15025 ret = unicode_format_arg_output(ctx, &arg, str);
15026 Py_DECREF(str);
15027 if (ret == -1)
15028 return -1;
15029 }
15030
Serhiy Storchaka9f8ad3f2017-03-08 05:51:19 +020015031 if (ctx->dict && (ctx->argidx < ctx->arglen)) {
Victor Stinnera47082312012-10-04 02:19:54 +020015032 PyErr_SetString(PyExc_TypeError,
15033 "not all arguments converted during string formatting");
15034 return -1;
15035 }
15036 return 0;
15037}
15038
Alexander Belopolsky40018472011-02-26 01:02:56 +000015039PyObject *
15040PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000015041{
Victor Stinnera47082312012-10-04 02:19:54 +020015042 struct unicode_formatter_t ctx;
Tim Petersced69f82003-09-16 20:30:58 +000015043
Guido van Rossumd57fd912000-03-10 22:53:23 +000015044 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000015045 PyErr_BadInternalCall();
15046 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015047 }
Victor Stinnera47082312012-10-04 02:19:54 +020015048
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030015049 if (ensure_unicode(format) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000015050 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030015051
15052 ctx.fmtstr = format;
Victor Stinnera47082312012-10-04 02:19:54 +020015053 ctx.fmtdata = PyUnicode_DATA(ctx.fmtstr);
15054 ctx.fmtkind = PyUnicode_KIND(ctx.fmtstr);
15055 ctx.fmtcnt = PyUnicode_GET_LENGTH(ctx.fmtstr);
15056 ctx.fmtpos = 0;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020015057
Victor Stinner8f674cc2013-04-17 23:02:17 +020015058 _PyUnicodeWriter_Init(&ctx.writer);
15059 ctx.writer.min_length = ctx.fmtcnt + 100;
15060 ctx.writer.overallocate = 1;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020015061
Guido van Rossumd57fd912000-03-10 22:53:23 +000015062 if (PyTuple_Check(args)) {
Victor Stinnera47082312012-10-04 02:19:54 +020015063 ctx.arglen = PyTuple_Size(args);
15064 ctx.argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015065 }
15066 else {
Victor Stinnera47082312012-10-04 02:19:54 +020015067 ctx.arglen = -1;
15068 ctx.argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015069 }
Victor Stinnera47082312012-10-04 02:19:54 +020015070 ctx.args_owned = 0;
Benjamin Peterson28a6cfa2012-08-28 17:55:35 -040015071 if (PyMapping_Check(args) && !PyTuple_Check(args) && !PyUnicode_Check(args))
Victor Stinnera47082312012-10-04 02:19:54 +020015072 ctx.dict = args;
15073 else
15074 ctx.dict = NULL;
15075 ctx.args = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015076
Victor Stinnera47082312012-10-04 02:19:54 +020015077 while (--ctx.fmtcnt >= 0) {
15078 if (PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
Victor Stinnercfc4c132013-04-03 01:48:39 +020015079 Py_ssize_t nonfmtpos;
Victor Stinnera47082312012-10-04 02:19:54 +020015080
15081 nonfmtpos = ctx.fmtpos++;
15082 while (ctx.fmtcnt >= 0 &&
15083 PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
15084 ctx.fmtpos++;
15085 ctx.fmtcnt--;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015086 }
Victor Stinnera47082312012-10-04 02:19:54 +020015087 if (ctx.fmtcnt < 0) {
15088 ctx.fmtpos--;
15089 ctx.writer.overallocate = 0;
Victor Stinnera0494432012-10-03 23:03:46 +020015090 }
Victor Stinneree4544c2012-05-09 22:24:08 +020015091
Victor Stinnercfc4c132013-04-03 01:48:39 +020015092 if (_PyUnicodeWriter_WriteSubstring(&ctx.writer, ctx.fmtstr,
15093 nonfmtpos, ctx.fmtpos) < 0)
15094 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015095 }
15096 else {
Victor Stinnera47082312012-10-04 02:19:54 +020015097 ctx.fmtpos++;
15098 if (unicode_format_arg(&ctx) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000015099 goto onError;
Victor Stinnera47082312012-10-04 02:19:54 +020015100 }
15101 }
Victor Stinneraff3cc62012-04-30 05:19:21 +020015102
Victor Stinnera47082312012-10-04 02:19:54 +020015103 if (ctx.argidx < ctx.arglen && !ctx.dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000015104 PyErr_SetString(PyExc_TypeError,
15105 "not all arguments converted during string formatting");
15106 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015107 }
15108
Victor Stinnera47082312012-10-04 02:19:54 +020015109 if (ctx.args_owned) {
15110 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000015111 }
Victor Stinnera47082312012-10-04 02:19:54 +020015112 return _PyUnicodeWriter_Finish(&ctx.writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +000015113
Benjamin Peterson29060642009-01-31 22:14:21 +000015114 onError:
Victor Stinnera47082312012-10-04 02:19:54 +020015115 _PyUnicodeWriter_Dealloc(&ctx.writer);
15116 if (ctx.args_owned) {
15117 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000015118 }
15119 return NULL;
15120}
15121
Jeremy Hylton938ace62002-07-17 16:30:39 +000015122static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000015123unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
15124
Tim Peters6d6c1a32001-08-02 04:15:00 +000015125static PyObject *
15126unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
15127{
Benjamin Peterson29060642009-01-31 22:14:21 +000015128 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015129 static char *kwlist[] = {"object", "encoding", "errors", 0};
15130 char *encoding = NULL;
15131 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000015132
Benjamin Peterson14339b62009-01-31 16:36:08 +000015133 if (type != &PyUnicode_Type)
15134 return unicode_subtype_new(type, args, kwds);
15135 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000015136 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000015137 return NULL;
15138 if (x == NULL)
Serhiy Storchaka678db842013-01-26 12:16:36 +020015139 _Py_RETURN_UNICODE_EMPTY();
Benjamin Peterson14339b62009-01-31 16:36:08 +000015140 if (encoding == NULL && errors == NULL)
15141 return PyObject_Str(x);
15142 else
Benjamin Peterson29060642009-01-31 22:14:21 +000015143 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000015144}
15145
Guido van Rossume023fe02001-08-30 03:12:59 +000015146static PyObject *
15147unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
15148{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015149 PyObject *unicode, *self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015150 Py_ssize_t length, char_size;
15151 int share_wstr, share_utf8;
15152 unsigned int kind;
15153 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000015154
Benjamin Peterson14339b62009-01-31 16:36:08 +000015155 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015156
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015157 unicode = unicode_new(&PyUnicode_Type, args, kwds);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015158 if (unicode == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000015159 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020015160 assert(_PyUnicode_CHECK(unicode));
Benjamin Petersonbac79492012-01-14 13:34:47 -050015161 if (PyUnicode_READY(unicode) == -1) {
Benjamin Peterson22a29702012-01-02 09:00:30 -060015162 Py_DECREF(unicode);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015163 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060015164 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015165
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015166 self = type->tp_alloc(type, 0);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015167 if (self == NULL) {
15168 Py_DECREF(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015169 return NULL;
15170 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015171 kind = PyUnicode_KIND(unicode);
15172 length = PyUnicode_GET_LENGTH(unicode);
15173
15174 _PyUnicode_LENGTH(self) = length;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020015175#ifdef Py_DEBUG
15176 _PyUnicode_HASH(self) = -1;
15177#else
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015178 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020015179#endif
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015180 _PyUnicode_STATE(self).interned = 0;
15181 _PyUnicode_STATE(self).kind = kind;
15182 _PyUnicode_STATE(self).compact = 0;
Victor Stinner3cf46372011-10-03 14:42:15 +020015183 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015184 _PyUnicode_STATE(self).ready = 1;
15185 _PyUnicode_WSTR(self) = NULL;
15186 _PyUnicode_UTF8_LENGTH(self) = 0;
15187 _PyUnicode_UTF8(self) = NULL;
15188 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020015189 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015190
15191 share_utf8 = 0;
15192 share_wstr = 0;
15193 if (kind == PyUnicode_1BYTE_KIND) {
15194 char_size = 1;
15195 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
15196 share_utf8 = 1;
15197 }
15198 else if (kind == PyUnicode_2BYTE_KIND) {
15199 char_size = 2;
15200 if (sizeof(wchar_t) == 2)
15201 share_wstr = 1;
15202 }
15203 else {
15204 assert(kind == PyUnicode_4BYTE_KIND);
15205 char_size = 4;
15206 if (sizeof(wchar_t) == 4)
15207 share_wstr = 1;
15208 }
15209
15210 /* Ensure we won't overflow the length. */
15211 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
15212 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015213 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015214 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015215 data = PyObject_MALLOC((length + 1) * char_size);
15216 if (data == NULL) {
15217 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015218 goto onError;
15219 }
15220
Victor Stinnerc3c74152011-10-02 20:39:55 +020015221 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015222 if (share_utf8) {
15223 _PyUnicode_UTF8_LENGTH(self) = length;
15224 _PyUnicode_UTF8(self) = data;
15225 }
15226 if (share_wstr) {
15227 _PyUnicode_WSTR_LENGTH(self) = length;
15228 _PyUnicode_WSTR(self) = (wchar_t *)data;
15229 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015230
Christian Heimesf051e432016-09-13 20:22:02 +020015231 memcpy(data, PyUnicode_DATA(unicode),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020015232 kind * (length + 1));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020015233 assert(_PyUnicode_CheckConsistency(self, 1));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020015234#ifdef Py_DEBUG
15235 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
15236#endif
Victor Stinnerdd18d3a2011-10-22 11:08:10 +020015237 Py_DECREF(unicode);
Victor Stinner7931d9a2011-11-04 00:22:48 +010015238 return self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015239
15240onError:
15241 Py_DECREF(unicode);
15242 Py_DECREF(self);
15243 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000015244}
15245
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000015246PyDoc_STRVAR(unicode_doc,
Chris Jerdonek83fe2e12012-10-07 14:48:36 -070015247"str(object='') -> str\n\
15248str(bytes_or_buffer[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +000015249\n\
Nick Coghlan573b1fd2012-08-16 14:13:07 +100015250Create a new string object from the given object. If encoding or\n\
15251errors is specified, then the object must expose a data buffer\n\
15252that will be decoded using the given encoding and error handler.\n\
15253Otherwise, returns the result of object.__str__() (if defined)\n\
15254or repr(object).\n\
15255encoding defaults to sys.getdefaultencoding().\n\
15256errors defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000015257
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015258static PyObject *unicode_iter(PyObject *seq);
15259
Guido van Rossumd57fd912000-03-10 22:53:23 +000015260PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +000015261 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Bupfc93bd42018-06-19 03:59:55 -050015262 "str", /* tp_name */
15263 sizeof(PyUnicodeObject), /* tp_basicsize */
15264 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +000015265 /* Slots */
Bupfc93bd42018-06-19 03:59:55 -050015266 (destructor)unicode_dealloc, /* tp_dealloc */
Jeroen Demeyer530f5062019-05-31 04:13:39 +020015267 0, /* tp_vectorcall_offset */
Bupfc93bd42018-06-19 03:59:55 -050015268 0, /* tp_getattr */
15269 0, /* tp_setattr */
Jeroen Demeyer530f5062019-05-31 04:13:39 +020015270 0, /* tp_as_async */
Bupfc93bd42018-06-19 03:59:55 -050015271 unicode_repr, /* tp_repr */
15272 &unicode_as_number, /* tp_as_number */
15273 &unicode_as_sequence, /* tp_as_sequence */
15274 &unicode_as_mapping, /* tp_as_mapping */
15275 (hashfunc) unicode_hash, /* tp_hash*/
15276 0, /* tp_call*/
15277 (reprfunc) unicode_str, /* tp_str */
15278 PyObject_GenericGetAttr, /* tp_getattro */
15279 0, /* tp_setattro */
15280 0, /* tp_as_buffer */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015281 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Bupfc93bd42018-06-19 03:59:55 -050015282 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
15283 unicode_doc, /* tp_doc */
15284 0, /* tp_traverse */
15285 0, /* tp_clear */
15286 PyUnicode_RichCompare, /* tp_richcompare */
15287 0, /* tp_weaklistoffset */
15288 unicode_iter, /* tp_iter */
15289 0, /* tp_iternext */
15290 unicode_methods, /* tp_methods */
15291 0, /* tp_members */
15292 0, /* tp_getset */
15293 &PyBaseObject_Type, /* tp_base */
15294 0, /* tp_dict */
15295 0, /* tp_descr_get */
15296 0, /* tp_descr_set */
15297 0, /* tp_dictoffset */
15298 0, /* tp_init */
15299 0, /* tp_alloc */
15300 unicode_new, /* tp_new */
15301 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +000015302};
15303
15304/* Initialize the Unicode implementation */
15305
Victor Stinner331a6a52019-05-27 16:39:22 +020015306PyStatus
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015307_PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000015308{
Thomas Wouters477c8d52006-05-27 19:21:47 +000015309 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015310 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000015311 0x000A, /* LINE FEED */
15312 0x000D, /* CARRIAGE RETURN */
15313 0x001C, /* FILE SEPARATOR */
15314 0x001D, /* GROUP SEPARATOR */
15315 0x001E, /* RECORD SEPARATOR */
15316 0x0085, /* NEXT LINE */
15317 0x2028, /* LINE SEPARATOR */
15318 0x2029, /* PARAGRAPH SEPARATOR */
15319 };
15320
Fred Drakee4315f52000-05-09 19:53:39 +000015321 /* Init the implementation */
Serhiy Storchaka678db842013-01-26 12:16:36 +020015322 _Py_INCREF_UNICODE_EMPTY();
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015323 if (!unicode_empty) {
Victor Stinner331a6a52019-05-27 16:39:22 +020015324 return _PyStatus_ERR("Can't create empty string");
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015325 }
Serhiy Storchaka678db842013-01-26 12:16:36 +020015326 Py_DECREF(unicode_empty);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000015327
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015328 if (PyType_Ready(&PyUnicode_Type) < 0) {
Victor Stinner331a6a52019-05-27 16:39:22 +020015329 return _PyStatus_ERR("Can't initialize unicode type");
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015330 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000015331
15332 /* initialize the linebreak bloom filter */
15333 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015334 PyUnicode_2BYTE_KIND, linebreak,
Victor Stinner63941882011-09-29 00:42:28 +020015335 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000015336
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015337 if (PyType_Ready(&EncodingMapType) < 0) {
Victor Stinner331a6a52019-05-27 16:39:22 +020015338 return _PyStatus_ERR("Can't initialize encoding map type");
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015339 }
15340 if (PyType_Ready(&PyFieldNameIter_Type) < 0) {
Victor Stinner331a6a52019-05-27 16:39:22 +020015341 return _PyStatus_ERR("Can't initialize field name iterator type");
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015342 }
15343 if (PyType_Ready(&PyFormatterIter_Type) < 0) {
Victor Stinner331a6a52019-05-27 16:39:22 +020015344 return _PyStatus_ERR("Can't initialize formatter iter type");
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015345 }
Victor Stinner331a6a52019-05-27 16:39:22 +020015346 return _PyStatus_OK();
Guido van Rossumd57fd912000-03-10 22:53:23 +000015347}
15348
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000015349
Walter Dörwald16807132007-05-25 13:52:07 +000015350void
15351PyUnicode_InternInPlace(PyObject **p)
15352{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020015353 PyObject *s = *p;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015354 PyObject *t;
Victor Stinner4fae54c2011-10-03 02:01:52 +020015355#ifdef Py_DEBUG
15356 assert(s != NULL);
15357 assert(_PyUnicode_CHECK(s));
15358#else
Benjamin Peterson14339b62009-01-31 16:36:08 +000015359 if (s == NULL || !PyUnicode_Check(s))
Victor Stinner4fae54c2011-10-03 02:01:52 +020015360 return;
15361#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +000015362 /* If it's a subclass, we don't really know what putting
15363 it in the interned dict might do. */
15364 if (!PyUnicode_CheckExact(s))
15365 return;
15366 if (PyUnicode_CHECK_INTERNED(s))
15367 return;
15368 if (interned == NULL) {
15369 interned = PyDict_New();
15370 if (interned == NULL) {
15371 PyErr_Clear(); /* Don't leave an exception */
15372 return;
15373 }
15374 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000015375 Py_ALLOW_RECURSION
Berker Peksagced8d4c2016-07-25 04:40:39 +030015376 t = PyDict_SetDefault(interned, s, s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015377 Py_END_ALLOW_RECURSION
Berker Peksagced8d4c2016-07-25 04:40:39 +030015378 if (t == NULL) {
15379 PyErr_Clear();
15380 return;
15381 }
15382 if (t != s) {
Victor Stinnerf0335102013-04-14 19:13:03 +020015383 Py_INCREF(t);
Serhiy Storchaka57a01d32016-04-10 18:05:40 +030015384 Py_SETREF(*p, t);
Victor Stinnerf0335102013-04-14 19:13:03 +020015385 return;
15386 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000015387 /* The two references in interned are not counted by refcnt.
15388 The deallocator will take care of this */
Victor Stinnerc86a1122020-02-07 01:24:29 +010015389 Py_SET_REFCNT(s, Py_REFCNT(s) - 2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015390 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000015391}
15392
15393void
15394PyUnicode_InternImmortal(PyObject **p)
15395{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015396 PyUnicode_InternInPlace(p);
15397 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Victor Stinneraf9e4b82011-10-23 20:07:00 +020015398 _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015399 Py_INCREF(*p);
15400 }
Walter Dörwald16807132007-05-25 13:52:07 +000015401}
15402
15403PyObject *
15404PyUnicode_InternFromString(const char *cp)
15405{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015406 PyObject *s = PyUnicode_FromString(cp);
15407 if (s == NULL)
15408 return NULL;
15409 PyUnicode_InternInPlace(&s);
15410 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000015411}
15412
Victor Stinnerfecc4f22019-03-19 14:20:29 +010015413
#if defined(WITH_VALGRIND) || defined(__INSURE__)
/* Release the interned-string dict (only built for leak detectors).

   Every key of the interned dict gets its uncounted references back
   (one for immortal strings, two for mortal ones) and is marked
   SSTATE_NOT_INTERNED; the dict is then cleared and dropped.  With
   INTERNED_STATS defined, counts and total lengths are written to
   stderr. */
static void
unicode_release_interned(void)
{
    if (interned == NULL || !PyDict_Check(interned)) {
        return;
    }
    PyObject *keys = PyDict_Keys(interned);
    if (keys == NULL || !PyList_Check(keys)) {
        PyErr_Clear();
        /* Don't leak an unexpected non-list result. */
        Py_XDECREF(keys);
        return;
    }

    /* Since unicode_release_interned() is intended to help a leak
       detector, interned unicode strings are not forcibly deallocated;
       rather, we give them their stolen references back, and then clear
       and DECREF the interned dict. */

    Py_ssize_t n = PyList_GET_SIZE(keys);
#ifdef INTERNED_STATS
    fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
            n);

    Py_ssize_t immortal_size = 0, mortal_size = 0;
#endif
    for (Py_ssize_t i = 0; i < n; i++) {
        PyObject *s = PyList_GET_ITEM(keys, i);
        if (PyUnicode_READY(s) == -1) {
            Py_UNREACHABLE();
        }
        /* Use Py_SET_REFCNT() rather than assigning through Py_REFCNT(),
           matching PyUnicode_InternInPlace(). */
        switch (PyUnicode_CHECK_INTERNED(s)) {
        case SSTATE_INTERNED_IMMORTAL:
            Py_SET_REFCNT(s, Py_REFCNT(s) + 1);
#ifdef INTERNED_STATS
            immortal_size += PyUnicode_GET_LENGTH(s);
#endif
            break;
        case SSTATE_INTERNED_MORTAL:
            Py_SET_REFCNT(s, Py_REFCNT(s) + 2);
#ifdef INTERNED_STATS
            mortal_size += PyUnicode_GET_LENGTH(s);
#endif
            break;
        case SSTATE_NOT_INTERNED:
            /* fall through */
        default:
            Py_UNREACHABLE();
        }
        _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
    }
#ifdef INTERNED_STATS
    fprintf(stderr, "total size of all interned strings: "
            "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
            "mortal/immortal\n", mortal_size, immortal_size);
#endif
    Py_DECREF(keys);
    PyDict_Clear(interned);
    Py_CLEAR(interned);
}
#endif
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015474
15475
15476/********************* Unicode Iterator **************************/
15477
15478typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015479 PyObject_HEAD
15480 Py_ssize_t it_index;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015481 PyObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015482} unicodeiterobject;
15483
15484static void
15485unicodeiter_dealloc(unicodeiterobject *it)
15486{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015487 _PyObject_GC_UNTRACK(it);
15488 Py_XDECREF(it->it_seq);
15489 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015490}
15491
15492static int
15493unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
15494{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015495 Py_VISIT(it->it_seq);
15496 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015497}
15498
15499static PyObject *
15500unicodeiter_next(unicodeiterobject *it)
15501{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015502 PyObject *seq, *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015503
Benjamin Peterson14339b62009-01-31 16:36:08 +000015504 assert(it != NULL);
15505 seq = it->it_seq;
15506 if (seq == NULL)
15507 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020015508 assert(_PyUnicode_CHECK(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015509
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015510 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
15511 int kind = PyUnicode_KIND(seq);
15512 void *data = PyUnicode_DATA(seq);
15513 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
15514 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015515 if (item != NULL)
15516 ++it->it_index;
15517 return item;
15518 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015519
Benjamin Peterson14339b62009-01-31 16:36:08 +000015520 it->it_seq = NULL;
Serhiy Storchakafbb1c5e2016-03-30 20:40:02 +030015521 Py_DECREF(seq);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015522 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015523}
15524
15525static PyObject *
Siddhesh Poyarekar55edd0c2018-04-30 00:29:33 +053015526unicodeiter_len(unicodeiterobject *it, PyObject *Py_UNUSED(ignored))
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015527{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015528 Py_ssize_t len = 0;
15529 if (it->it_seq)
Victor Stinnerc4f281e2011-10-11 22:11:42 +020015530 len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015531 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015532}
15533
15534PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
15535
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015536static PyObject *
Siddhesh Poyarekar55edd0c2018-04-30 00:29:33 +053015537unicodeiter_reduce(unicodeiterobject *it, PyObject *Py_UNUSED(ignored))
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015538{
Serhiy Storchakabb86bf42018-12-11 08:28:18 +020015539 _Py_IDENTIFIER(iter);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015540 if (it->it_seq != NULL) {
Serhiy Storchakabb86bf42018-12-11 08:28:18 +020015541 return Py_BuildValue("N(O)n", _PyEval_GetBuiltinId(&PyId_iter),
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015542 it->it_seq, it->it_index);
15543 } else {
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +020015544 PyObject *u = (PyObject *)_PyUnicode_New(0);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015545 if (u == NULL)
15546 return NULL;
Serhiy Storchakabb86bf42018-12-11 08:28:18 +020015547 return Py_BuildValue("N(N)", _PyEval_GetBuiltinId(&PyId_iter), u);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015548 }
15549}
15550
15551PyDoc_STRVAR(reduce_doc, "Return state information for pickling.");
15552
15553static PyObject *
15554unicodeiter_setstate(unicodeiterobject *it, PyObject *state)
15555{
15556 Py_ssize_t index = PyLong_AsSsize_t(state);
15557 if (index == -1 && PyErr_Occurred())
15558 return NULL;
Kristján Valur Jónsson25dded02014-03-05 13:47:57 +000015559 if (it->it_seq != NULL) {
15560 if (index < 0)
15561 index = 0;
15562 else if (index > PyUnicode_GET_LENGTH(it->it_seq))
15563 index = PyUnicode_GET_LENGTH(it->it_seq); /* iterator truncated */
15564 it->it_index = index;
15565 }
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015566 Py_RETURN_NONE;
15567}
15568
15569PyDoc_STRVAR(setstate_doc, "Set state information for unpickling.");
15570
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015571static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015572 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000015573 length_hint_doc},
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015574 {"__reduce__", (PyCFunction)unicodeiter_reduce, METH_NOARGS,
15575 reduce_doc},
15576 {"__setstate__", (PyCFunction)unicodeiter_setstate, METH_O,
15577 setstate_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000015578 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015579};
15580
15581PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015582 PyVarObject_HEAD_INIT(&PyType_Type, 0)
15583 "str_iterator", /* tp_name */
15584 sizeof(unicodeiterobject), /* tp_basicsize */
15585 0, /* tp_itemsize */
15586 /* methods */
15587 (destructor)unicodeiter_dealloc, /* tp_dealloc */
Jeroen Demeyer530f5062019-05-31 04:13:39 +020015588 0, /* tp_vectorcall_offset */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015589 0, /* tp_getattr */
15590 0, /* tp_setattr */
Jeroen Demeyer530f5062019-05-31 04:13:39 +020015591 0, /* tp_as_async */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015592 0, /* tp_repr */
15593 0, /* tp_as_number */
15594 0, /* tp_as_sequence */
15595 0, /* tp_as_mapping */
15596 0, /* tp_hash */
15597 0, /* tp_call */
15598 0, /* tp_str */
15599 PyObject_GenericGetAttr, /* tp_getattro */
15600 0, /* tp_setattro */
15601 0, /* tp_as_buffer */
15602 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
15603 0, /* tp_doc */
15604 (traverseproc)unicodeiter_traverse, /* tp_traverse */
15605 0, /* tp_clear */
15606 0, /* tp_richcompare */
15607 0, /* tp_weaklistoffset */
15608 PyObject_SelfIter, /* tp_iter */
15609 (iternextfunc)unicodeiter_next, /* tp_iternext */
15610 unicodeiter_methods, /* tp_methods */
15611 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015612};
15613
15614static PyObject *
15615unicode_iter(PyObject *seq)
15616{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015617 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015618
Benjamin Peterson14339b62009-01-31 16:36:08 +000015619 if (!PyUnicode_Check(seq)) {
15620 PyErr_BadInternalCall();
15621 return NULL;
15622 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015623 if (PyUnicode_READY(seq) == -1)
15624 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015625 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
15626 if (it == NULL)
15627 return NULL;
15628 it->it_index = 0;
15629 Py_INCREF(seq);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015630 it->it_seq = seq;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015631 _PyObject_GC_TRACK(it);
15632 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015633}
15634
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015635
/* Length of the null-terminated Py_UNICODE string `u`, not counting
   the terminator (delegates to wcslen()). */
size_t
Py_UNICODE_strlen(const Py_UNICODE *u)
{
    const size_t length = wcslen(u);
    return length;
}
15641
/* Copy the null-terminated string `s2`, terminator included, into
   `s1`.  Returns `s1`. */
Py_UNICODE*
Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
{
    Py_UNICODE *dst = s1;

    for (;;) {
        Py_UNICODE ch = *s2++;
        *dst++ = ch;
        if (ch == 0) {
            break;
        }
    }
    return s1;
}
15649
/* Copy at most `n` elements of the null-terminated string `s2` into
   `s1`, stopping after the terminator has been copied.  Returns `s1`.

   Never writes more than `n` elements: when `s2` holds `n` or more
   characters the result is not null-terminated, and nothing is written
   when `n` is 0.  Unlike the C library's strncpy(), the remainder of
   `s1` is not padded with nulls. */
Py_UNICODE*
Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
{
    Py_UNICODE *dst = s1;

    /* Test the budget before each store so that slot n is never
       touched. */
    for (; n != 0; n--) {
        if ((*dst++ = *s2++) == 0) {
            break;
        }
    }
    return s1;
}
15659
/* Append the null-terminated string `s2`, terminator included, to the
   end of the null-terminated string `s1`.  Returns `s1`. */
Py_UNICODE*
Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
{
    Py_UNICODE *dst = s1 + wcslen(s1);

    for (;;) {
        Py_UNICODE ch = *s2++;
        *dst++ = ch;
        if (ch == 0) {
            break;
        }
    }
    return s1;
}
15668
/* Compare two null-terminated strings.  Returns -1, 0 or +1.

   The first differing pair of non-null elements decides the result.
   If one string is a proper prefix of the other, the shorter string
   compares as smaller. */
int
Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
{
    while (*s1 != 0 && *s2 != 0) {
        if (*s1 != *s2) {
            return (*s1 < *s2) ? -1 : +1;
        }
        s1++;
        s2++;
    }
    /* At least one string has ended. */
    if (*s1 != 0) {
        return 1;
    }
    if (*s2 != 0) {
        return -1;
    }
    return 0;
}
15682
/* Compare at most `n` elements of two null-terminated strings.
   Returns -1 or +1 at the first differing element, and 0 if the
   strings agree up to a shared terminator or over all `n` elements. */
int
Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
{
    for (size_t i = 0; i < n; i++) {
        Py_UNICODE c1 = s1[i];
        Py_UNICODE c2 = s2[i];

        if (c1 != c2) {
            return (c1 < c2) ? -1 : +1;
        }
        if (c1 == '\0') {
            return 0;
        }
    }
    return 0;
}
15699
/* Find the first occurrence of `c` in the null-terminated string `s`.
   Returns a pointer to it, or NULL if it does not occur.  The
   terminator itself is never matched. */
Py_UNICODE*
Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
{
    while (*s != 0) {
        if (*s == c) {
            return (Py_UNICODE*)s;
        }
        s++;
    }
    return NULL;
}
15709
/* Find the last occurrence of `c` in the null-terminated string `s`.
   Returns a pointer to it, or NULL if it does not occur.  The
   terminator itself is never matched. */
Py_UNICODE*
Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
{
    /* Scan backwards from the element just before the terminator. */
    for (size_t i = wcslen(s); i > 0; i--) {
        if (s[i - 1] == c) {
            return (Py_UNICODE*)(s + (i - 1));
        }
    }
    return NULL;
}
Victor Stinner331ea922010-08-10 16:37:20 +000015722
Victor Stinner71133ff2010-09-01 23:43:53 +000015723Py_UNICODE*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015724PyUnicode_AsUnicodeCopy(PyObject *unicode)
Victor Stinner71133ff2010-09-01 23:43:53 +000015725{
Victor Stinner577db2c2011-10-11 22:12:48 +020015726 Py_UNICODE *u, *copy;
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015727 Py_ssize_t len, size;
Victor Stinner71133ff2010-09-01 23:43:53 +000015728
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015729 if (!PyUnicode_Check(unicode)) {
15730 PyErr_BadArgument();
15731 return NULL;
15732 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015733 u = PyUnicode_AsUnicodeAndSize(unicode, &len);
Victor Stinner577db2c2011-10-11 22:12:48 +020015734 if (u == NULL)
15735 return NULL;
Victor Stinner71133ff2010-09-01 23:43:53 +000015736 /* Ensure we won't overflow the size. */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -070015737 if (len > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
Victor Stinner71133ff2010-09-01 23:43:53 +000015738 PyErr_NoMemory();
15739 return NULL;
15740 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015741 size = len + 1; /* copy the null character */
Victor Stinner71133ff2010-09-01 23:43:53 +000015742 size *= sizeof(Py_UNICODE);
15743 copy = PyMem_Malloc(size);
15744 if (copy == NULL) {
15745 PyErr_NoMemory();
15746 return NULL;
15747 }
Victor Stinner577db2c2011-10-11 22:12:48 +020015748 memcpy(copy, u, size);
Victor Stinner71133ff2010-09-01 23:43:53 +000015749 return copy;
15750}
Martin v. Löwis5b222132007-06-10 09:51:05 +000015751
Victor Stinnerfecc4f22019-03-19 14:20:29 +010015752
Victor Stinner709d23d2019-05-02 14:56:30 -040015753static int
15754encode_wstr_utf8(wchar_t *wstr, char **str, const char *name)
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015755{
Victor Stinner709d23d2019-05-02 14:56:30 -040015756 int res;
15757 res = _Py_EncodeUTF8Ex(wstr, str, NULL, NULL, 1, _Py_ERROR_STRICT);
15758 if (res == -2) {
15759 PyErr_Format(PyExc_RuntimeWarning, "cannot decode %s", name);
15760 return -1;
15761 }
15762 if (res < 0) {
15763 PyErr_NoMemory();
15764 return -1;
15765 }
15766 return 0;
15767}
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015768
Victor Stinner709d23d2019-05-02 14:56:30 -040015769
15770static int
15771config_get_codec_name(wchar_t **config_encoding)
15772{
15773 char *encoding;
15774 if (encode_wstr_utf8(*config_encoding, &encoding, "stdio_encoding") < 0) {
15775 return -1;
15776 }
15777
15778 PyObject *name_obj = NULL;
15779 PyObject *codec = _PyCodec_Lookup(encoding);
15780 PyMem_RawFree(encoding);
15781
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015782 if (!codec)
15783 goto error;
15784
15785 name_obj = PyObject_GetAttrString(codec, "name");
15786 Py_CLEAR(codec);
15787 if (!name_obj) {
15788 goto error;
15789 }
15790
Victor Stinner709d23d2019-05-02 14:56:30 -040015791 wchar_t *wname = PyUnicode_AsWideCharString(name_obj, NULL);
15792 Py_DECREF(name_obj);
15793 if (wname == NULL) {
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015794 goto error;
15795 }
15796
Victor Stinner709d23d2019-05-02 14:56:30 -040015797 wchar_t *raw_wname = _PyMem_RawWcsdup(wname);
15798 if (raw_wname == NULL) {
15799 PyMem_Free(wname);
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015800 PyErr_NoMemory();
Victor Stinner709d23d2019-05-02 14:56:30 -040015801 goto error;
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015802 }
Victor Stinner709d23d2019-05-02 14:56:30 -040015803
15804 PyMem_RawFree(*config_encoding);
15805 *config_encoding = raw_wname;
15806
15807 PyMem_Free(wname);
15808 return 0;
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015809
15810error:
15811 Py_XDECREF(codec);
15812 Py_XDECREF(name_obj);
Victor Stinner709d23d2019-05-02 14:56:30 -040015813 return -1;
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015814}
15815
15816
Victor Stinner331a6a52019-05-27 16:39:22 +020015817static PyStatus
Victor Stinnerfcdb0272019-09-23 14:45:47 +020015818init_stdio_encoding(PyThreadState *tstate)
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015819{
Victor Stinner709d23d2019-05-02 14:56:30 -040015820 /* Update the stdio encoding to the normalized Python codec name. */
Victor Stinnerfcdb0272019-09-23 14:45:47 +020015821 PyConfig *config = &tstate->interp->config;
Victor Stinner709d23d2019-05-02 14:56:30 -040015822 if (config_get_codec_name(&config->stdio_encoding) < 0) {
Victor Stinner331a6a52019-05-27 16:39:22 +020015823 return _PyStatus_ERR("failed to get the Python codec name "
Victor Stinnerfcdb0272019-09-23 14:45:47 +020015824 "of the stdio encoding");
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015825 }
Victor Stinner331a6a52019-05-27 16:39:22 +020015826 return _PyStatus_OK();
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015827}
15828
15829
Victor Stinner709d23d2019-05-02 14:56:30 -040015830static int
15831init_fs_codec(PyInterpreterState *interp)
15832{
Victor Stinner331a6a52019-05-27 16:39:22 +020015833 PyConfig *config = &interp->config;
Victor Stinner709d23d2019-05-02 14:56:30 -040015834
15835 _Py_error_handler error_handler;
15836 error_handler = get_error_handler_wide(config->filesystem_errors);
15837 if (error_handler == _Py_ERROR_UNKNOWN) {
15838 PyErr_SetString(PyExc_RuntimeError, "unknow filesystem error handler");
15839 return -1;
15840 }
15841
15842 char *encoding, *errors;
15843 if (encode_wstr_utf8(config->filesystem_encoding,
15844 &encoding,
15845 "filesystem_encoding") < 0) {
15846 return -1;
15847 }
15848
15849 if (encode_wstr_utf8(config->filesystem_errors,
15850 &errors,
15851 "filesystem_errors") < 0) {
15852 PyMem_RawFree(encoding);
15853 return -1;
15854 }
15855
15856 PyMem_RawFree(interp->fs_codec.encoding);
15857 interp->fs_codec.encoding = encoding;
Victor Stinnerbf305cc2020-02-05 17:39:57 +010015858 /* encoding has been normalized by init_fs_encoding() */
15859 interp->fs_codec.utf8 = (strcmp(encoding, "utf-8") == 0);
Victor Stinner709d23d2019-05-02 14:56:30 -040015860 PyMem_RawFree(interp->fs_codec.errors);
15861 interp->fs_codec.errors = errors;
15862 interp->fs_codec.error_handler = error_handler;
15863
Victor Stinnerbf305cc2020-02-05 17:39:57 +010015864#ifdef _Py_FORCE_UTF8_FS_ENCODING
15865 assert(interp->fs_codec.utf8 == 1);
15866#endif
15867
Victor Stinner709d23d2019-05-02 14:56:30 -040015868 /* At this point, PyUnicode_EncodeFSDefault() and
15869 PyUnicode_DecodeFSDefault() can now use the Python codec rather than
15870 the C implementation of the filesystem encoding. */
15871
15872 /* Set Py_FileSystemDefaultEncoding and Py_FileSystemDefaultEncodeErrors
15873 global configuration variables. */
15874 if (_Py_SetFileSystemEncoding(interp->fs_codec.encoding,
15875 interp->fs_codec.errors) < 0) {
15876 PyErr_NoMemory();
15877 return -1;
15878 }
15879 return 0;
15880}
15881
15882
Victor Stinner331a6a52019-05-27 16:39:22 +020015883static PyStatus
Victor Stinnerfcdb0272019-09-23 14:45:47 +020015884init_fs_encoding(PyThreadState *tstate)
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015885{
Victor Stinnerfcdb0272019-09-23 14:45:47 +020015886 PyInterpreterState *interp = tstate->interp;
15887
Victor Stinner709d23d2019-05-02 14:56:30 -040015888 /* Update the filesystem encoding to the normalized Python codec name.
15889 For example, replace "ANSI_X3.4-1968" (locale encoding) with "ascii"
15890 (Python codec name). */
Victor Stinner331a6a52019-05-27 16:39:22 +020015891 PyConfig *config = &interp->config;
Victor Stinner709d23d2019-05-02 14:56:30 -040015892 if (config_get_codec_name(&config->filesystem_encoding) < 0) {
Victor Stinnerfcdb0272019-09-23 14:45:47 +020015893 _Py_DumpPathConfig(tstate);
Victor Stinner331a6a52019-05-27 16:39:22 +020015894 return _PyStatus_ERR("failed to get the Python codec "
Victor Stinnerfcdb0272019-09-23 14:45:47 +020015895 "of the filesystem encoding");
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015896 }
15897
Victor Stinner709d23d2019-05-02 14:56:30 -040015898 if (init_fs_codec(interp) < 0) {
Victor Stinner331a6a52019-05-27 16:39:22 +020015899 return _PyStatus_ERR("cannot initialize filesystem codec");
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015900 }
Victor Stinner331a6a52019-05-27 16:39:22 +020015901 return _PyStatus_OK();
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015902}
15903
15904
Victor Stinner331a6a52019-05-27 16:39:22 +020015905PyStatus
Victor Stinnerb45d2592019-06-20 00:05:23 +020015906_PyUnicode_InitEncodings(PyThreadState *tstate)
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015907{
Victor Stinnerfcdb0272019-09-23 14:45:47 +020015908 PyStatus status = init_fs_encoding(tstate);
Victor Stinner331a6a52019-05-27 16:39:22 +020015909 if (_PyStatus_EXCEPTION(status)) {
15910 return status;
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015911 }
15912
Victor Stinnerfcdb0272019-09-23 14:45:47 +020015913 return init_stdio_encoding(tstate);
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015914}
15915
15916
Victor Stinnerbf305cc2020-02-05 17:39:57 +010015917static void
15918_PyUnicode_FiniEncodings(PyThreadState *tstate)
15919{
15920 PyInterpreterState *interp = tstate->interp;
15921 PyMem_RawFree(interp->fs_codec.encoding);
15922 interp->fs_codec.encoding = NULL;
15923 interp->fs_codec.utf8 = 0;
15924 PyMem_RawFree(interp->fs_codec.errors);
15925 interp->fs_codec.errors = NULL;
15926 interp->fs_codec.error_handler = _Py_ERROR_UNKNOWN;
15927}
15928
15929
#ifdef MS_WINDOWS
/* Switch the filesystem encoding of the current interpreter to
   mbcs/replace (PEP 529) and re-initialize the filesystem codec.

   Return the result of init_fs_codec() on success. Return -1 with
   MemoryError set if the new configuration strings cannot be allocated;
   in that case the existing configuration is left unchanged. */
int
_PyUnicode_EnableLegacyWindowsFSEncoding(void)
{
    PyInterpreterState *interp = _PyInterpreterState_GET_UNSAFE();
    PyConfig *cfg = &interp->config;

    /* Allocate both replacement strings up front so the configuration is
       only touched once both are available. */
    wchar_t *new_encoding = _PyMem_RawWcsdup(L"mbcs");
    wchar_t *new_errors = _PyMem_RawWcsdup(L"replace");

    if (new_encoding != NULL && new_errors != NULL) {
        PyMem_RawFree(cfg->filesystem_encoding);
        cfg->filesystem_encoding = new_encoding;
        PyMem_RawFree(cfg->filesystem_errors);
        cfg->filesystem_errors = new_errors;

        return init_fs_codec(interp);
    }

    /* At least one allocation failed: release whichever one succeeded. */
    PyMem_RawFree(new_encoding);
    PyMem_RawFree(new_errors);
    PyErr_NoMemory();
    return -1;
}
#endif
15955
15956
Victor Stinnerfecc4f22019-03-19 14:20:29 +010015957void
Victor Stinner3d483342019-11-22 12:27:50 +010015958_PyUnicode_Fini(PyThreadState *tstate)
Victor Stinnerfecc4f22019-03-19 14:20:29 +010015959{
Victor Stinner3d483342019-11-22 12:27:50 +010015960 if (_Py_IsMainInterpreter(tstate)) {
Victor Stinnerfecc4f22019-03-19 14:20:29 +010015961#if defined(WITH_VALGRIND) || defined(__INSURE__)
Victor Stinner3d483342019-11-22 12:27:50 +010015962 /* Insure++ is a memory analysis tool that aids in discovering
15963 * memory leaks and other memory problems. On Python exit, the
15964 * interned string dictionaries are flagged as being in use at exit
15965 * (which it is). Under normal circumstances, this is fine because
15966 * the memory will be automatically reclaimed by the system. Under
15967 * memory debugging, it's a huge source of useless noise, so we
15968 * trade off slower shutdown for less distraction in the memory
15969 * reports. -baw
15970 */
15971 unicode_release_interned();
Victor Stinnerfecc4f22019-03-19 14:20:29 +010015972#endif /* __INSURE__ */
15973
Victor Stinner3d483342019-11-22 12:27:50 +010015974 Py_CLEAR(unicode_empty);
Victor Stinnerfecc4f22019-03-19 14:20:29 +010015975
Victor Stinner3d483342019-11-22 12:27:50 +010015976 for (Py_ssize_t i = 0; i < 256; i++) {
15977 Py_CLEAR(unicode_latin1[i]);
15978 }
15979 _PyUnicode_ClearStaticStrings();
Victor Stinnerfecc4f22019-03-19 14:20:29 +010015980 }
Victor Stinner709d23d2019-05-02 14:56:30 -040015981
Victor Stinnerbf305cc2020-02-05 17:39:57 +010015982 _PyUnicode_FiniEncodings(tstate);
Victor Stinnerfecc4f22019-03-19 14:20:29 +010015983}
15984
15985
Georg Brandl66c221e2010-10-14 07:04:07 +000015986/* A _string module, to export formatter_parser and formatter_field_name_split
15987 to the string.Formatter class implemented in Python. */
15988
15989static PyMethodDef _string_methods[] = {
15990 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
15991 METH_O, PyDoc_STR("split the argument as a field name")},
15992 {"formatter_parser", (PyCFunction) formatter_parser,
15993 METH_O, PyDoc_STR("parse the argument as a format string")},
15994 {NULL, NULL}
15995};
15996
15997static struct PyModuleDef _string_module = {
15998 PyModuleDef_HEAD_INIT,
15999 "_string",
16000 PyDoc_STR("string helper module"),
16001 0,
16002 _string_methods,
16003 NULL,
16004 NULL,
16005 NULL,
16006 NULL
16007};
16008
16009PyMODINIT_FUNC
16010PyInit__string(void)
16011{
16012 return PyModule_Create(&_string_module);
16013}
16014
16015
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000016016#ifdef __cplusplus
16017}
16018#endif