blob: 2d60627b19193bceb6949af895a58ad78f569f50 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Benjamin Peterson31616ea2011-10-01 00:11:09 -04004modified by Marc-Andre Lemburg <mal@lemburg.com>.
Guido van Rossumd57fd912000-03-10 22:53:23 +00005
Thomas Wouters477c8d52006-05-27 19:21:47 +00006Major speed upgrades to the method implementations at the Reykjavik
7NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8
Guido van Rossum16b1ad92000-08-03 16:24:25 +00009Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000010
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000011--------------------------------------------------------------------
12The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000013
Benjamin Peterson29060642009-01-31 22:14:21 +000014 Copyright (c) 1999 by Secret Labs AB
15 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000016
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000017By obtaining, using, and/or copying this software and/or its
18associated documentation, you agree that you have read, understood,
19and will comply with the following terms and conditions:
20
21Permission to use, copy, modify, and distribute this software and its
22associated documentation for any purpose and without fee is hereby
23granted, provided that the above copyright notice appears in all
24copies, and that both that copyright notice and this permission notice
25appear in supporting documentation, and that the name of Secret Labs
26AB or the author not be used in advertising or publicity pertaining to
27distribution of the software without specific, written prior
28permission.
29
30SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37--------------------------------------------------------------------
38
39*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000040
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Victor Stinner9fc57a32018-11-07 00:44:03 +010043#include "pycore_fileutils.h"
Victor Stinner61691d82019-10-02 23:51:20 +020044#include "pycore_initconfig.h"
Victor Stinnerbcda8f12018-11-21 22:27:47 +010045#include "pycore_object.h"
Victor Stinner61691d82019-10-02 23:51:20 +020046#include "pycore_pathconfig.h"
Victor Stinner43fc3bb2019-05-02 11:54:20 -040047#include "pycore_pylifecycle.h"
Victor Stinner621cebe2018-11-12 16:53:38 +010048#include "pycore_pystate.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000049#include "ucnhash.h"
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050050#include "bytes_methods.h"
Raymond Hettingerac2ef652015-07-04 16:04:44 -070051#include "stringlib/eq.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000052
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000053#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000054#include <windows.h>
55#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000056
/* Uncomment to display statistics on interned strings at exit when
   using Valgrind or Insure++. */
/* #define INTERNED_STATS 1 */
60
61
Larry Hastings61272b72014-01-07 12:41:53 -080062/*[clinic input]
INADA Naoki15f94592017-01-16 21:49:13 +090063class str "PyObject *" "&PyUnicode_Type"
Larry Hastings61272b72014-01-07 12:41:53 -080064[clinic start generated code]*/
INADA Naoki3ae20562017-01-16 20:41:20 +090065/*[clinic end generated code: output=da39a3ee5e6b4b0d input=4884c934de622cf6]*/
66
67/*[python input]
68class Py_UCS4_converter(CConverter):
69 type = 'Py_UCS4'
70 converter = 'convert_uc'
71
72 def converter_init(self):
73 if self.default is not unspecified:
74 self.c_default = ascii(self.default)
75 if len(self.c_default) > 4 or self.c_default[0] != "'":
76 self.c_default = hex(ord(self.default))
77
78[python start generated code]*/
79/*[python end generated code: output=da39a3ee5e6b4b0d input=88f5dd06cd8e7a61]*/
Larry Hastings44e2eaa2013-11-23 15:37:55 -080080
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000081/* --- Globals ------------------------------------------------------------
82
Serhiy Storchaka05997252013-01-26 12:14:02 +020083NOTE: In the interpreter's initialization phase, some globals are currently
84 initialized dynamically as needed. In the process Unicode objects may
85 be created before the Unicode type is ready.
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000086
87*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000088
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000089
90#ifdef __cplusplus
91extern "C" {
92#endif
93
/* Maximum code point of Unicode 6.0: 0x10ffff (1,114,111) */
#define MAX_UNICODE 0x10ffff

/* Sanity check used inside the assertions of the accessor macros below.
   Debug builds run the structural consistency checker (check_content == 0,
   i.e. without the O(n) content scan); other builds only test the type. */
#ifdef Py_DEBUG
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
#else
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#endif
Victor Stinnerfb5f5f22011-09-28 21:39:49 +0200102
/* Field accessors for the string structs.

   The underscore-prefixed macros read a struct member directly and can be
   used as lvalues.  PyUnicode_UTF8() and PyUnicode_UTF8_LENGTH() assert that
   the object is a consistent, ready string: for a compact ASCII string they
   yield the character data stored right after the PyASCIIObject header and
   the string length; for any other string they yield the utf8 and
   utf8_length members. */
#define _PyUnicode_UTF8(op)                             \
    (((PyCompactUnicodeObject*)(op))->utf8)
#define PyUnicode_UTF8(op)                              \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((char*)((PyASCIIObject*)(op) + 1)) :          \
         _PyUnicode_UTF8(op))
#define _PyUnicode_UTF8_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->utf8_length)
#define PyUnicode_UTF8_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((PyASCIIObject*)(op))->length :               \
         _PyUnicode_UTF8_LENGTH(op))
#define _PyUnicode_WSTR(op)                             \
    (((PyASCIIObject*)(op))->wstr)
#define _PyUnicode_WSTR_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->wstr_length)
#define _PyUnicode_LENGTH(op)                           \
    (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op)                            \
    (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op)                             \
    (((PyASCIIObject *)(op))->hash)
/* Checked read of the kind field (asserts the object first). */
#define _PyUnicode_KIND(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->state.kind)
/* Checked read of the length field (asserts the object first). */
#define _PyUnicode_GET_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->length)
/* Data pointer member of a PyUnicodeObject, accessed without any check. */
#define _PyUnicode_DATA_ANY(op)                         \
    (((PyUnicodeObject*)(op))->data.any)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200137
/* File-local replacement for PyUnicode_READY(): asserts the object with
   _PyUnicode_CHECK(), then evaluates to 0 when the string is already ready
   and to the result of _PyUnicode_Ready(op) otherwise. */
#undef PyUnicode_READY
#define PyUnicode_READY(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     (PyUnicode_IS_READY(op) ?                          \
      0 :                                               \
      _PyUnicode_Ready(op)))
Victor Stinner910337b2011-10-03 03:20:16 +0200144
/* True if the utf8 pointer of a non compact-ASCII string is the very same
   pointer as the character data (i.e. no separate UTF-8 buffer). */
#define _PyUnicode_SHARE_UTF8(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)),           \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))

/* True if the wstr pointer is the very same pointer as the character data.
   The macro argument is used throughout: the expansion must not reference
   a variable called "unicode" from the caller's scope. */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
152
/* true if the Unicode object has an allocated UTF-8 memory block
   (not shared with other data) */
#define _PyUnicode_HAS_UTF8_MEMORY(op)                  \
    (!PyUnicode_IS_COMPACT_ASCII(op)                    \
     && _PyUnicode_UTF8(op) != NULL                     \
     && _PyUnicode_UTF8(op) != PyUnicode_DATA(op))

/* true if the Unicode object has an allocated wstr memory block
   (not shared with other data) */
#define _PyUnicode_HAS_WSTR_MEMORY(op)                  \
    (_PyUnicode_WSTR(op) != NULL                        \
     && (!PyUnicode_IS_READY(op)                        \
         || _PyUnicode_WSTR(op) != PyUnicode_DATA(op)))
166
/* Generic helper macro to convert characters of different types.
   from_type and to_type have to be valid type names, begin and end
   are pointers to the source characters which should be of type
   "from_type *".  to is a pointer of type "to_type *" and points to the
   buffer where the result characters are written to.

   Each source character is converted with a plain cast to to_type.  The
   main loop handles four characters per iteration; the trailing loop copies
   the remaining 0-3 characters.  The source is only read, so it is accessed
   through pointers to const, and all macro-local names carry a leading
   underscore so they cannot shadow variables of the caller. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                \
        to_type *_to = (to_type *)(to);                 \
        const from_type *_iter = (const from_type *)(begin); \
        const from_type *_end = (const from_type *)(end);    \
        Py_ssize_t _n = _end - _iter;                   \
        const from_type *_unrolled_end =                \
            _iter + _Py_SIZE_ROUND_DOWN(_n, 4);         \
        while (_iter < _unrolled_end) {                 \
            _to[0] = (to_type) _iter[0];                \
            _to[1] = (to_type) _iter[1];                \
            _to[2] = (to_type) _iter[2];                \
            _to[3] = (to_type) _iter[3];                \
            _iter += 4; _to += 4;                       \
        }                                               \
        while (_iter < _end) {                          \
            *_to++ = (to_type) *_iter++;                \
        }                                               \
    } while (0)
Victor Stinner829c0ad2011-10-03 01:08:02 +0200190
/* Over-allocation factor.  Going by the per-platform notes below, a factor
   of 2 corresponds to 50% and a factor of 4 to 25% of extra space. */
#ifdef MS_WINDOWS
   /* On Windows, overallocating by 50% is the best factor */
#  define OVERALLOCATE_FACTOR 2
#else
   /* On Linux, overallocating by 25% is the best factor */
#  define OVERALLOCATE_FACTOR 4
#endif
198
Walter Dörwald16807132007-05-25 13:52:07 +0000199/* This dictionary holds all interned unicode strings. Note that references
200 to strings in this dictionary are *not* counted in the string's ob_refcnt.
201 When the interned string reaches a refcnt of 0 the string deallocation
202 function will delete the reference from this dictionary.
203
204 Another way to look at this is that to say that the actual reference
Guido van Rossum98297ee2007-11-06 21:34:58 +0000205 count of a string is: s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000206*/
Serhiy Storchaka05997252013-01-26 12:14:02 +0200207static PyObject *interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +0000208
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000209/* The empty Unicode object is shared to improve performance. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200210static PyObject *unicode_empty = NULL;
Serhiy Storchaka05997252013-01-26 12:14:02 +0200211
Serhiy Storchaka678db842013-01-26 12:16:36 +0200212#define _Py_INCREF_UNICODE_EMPTY() \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200213 do { \
214 if (unicode_empty != NULL) \
215 Py_INCREF(unicode_empty); \
216 else { \
Serhiy Storchaka678db842013-01-26 12:16:36 +0200217 unicode_empty = PyUnicode_New(0, 0); \
218 if (unicode_empty != NULL) { \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200219 Py_INCREF(unicode_empty); \
Serhiy Storchaka678db842013-01-26 12:16:36 +0200220 assert(_PyUnicode_CheckConsistency(unicode_empty, 1)); \
221 } \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200222 } \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200223 } while (0)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000224
Serhiy Storchaka678db842013-01-26 12:16:36 +0200225#define _Py_RETURN_UNICODE_EMPTY() \
226 do { \
227 _Py_INCREF_UNICODE_EMPTY(); \
228 return unicode_empty; \
229 } while (0)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000230
Victor Stinner59423e32018-11-26 13:40:01 +0100231static inline void
232unicode_fill(enum PyUnicode_Kind kind, void *data, Py_UCS4 value,
233 Py_ssize_t start, Py_ssize_t length)
234{
235 assert(0 <= start);
236 assert(kind != PyUnicode_WCHAR_KIND);
237 switch (kind) {
238 case PyUnicode_1BYTE_KIND: {
Victor Stinner163403a2018-11-27 12:41:17 +0100239 assert(value <= 0xff);
Victor Stinner59423e32018-11-26 13:40:01 +0100240 Py_UCS1 ch = (unsigned char)value;
241 Py_UCS1 *to = (Py_UCS1 *)data + start;
242 memset(to, ch, length);
243 break;
244 }
245 case PyUnicode_2BYTE_KIND: {
Victor Stinner163403a2018-11-27 12:41:17 +0100246 assert(value <= 0xffff);
Victor Stinner59423e32018-11-26 13:40:01 +0100247 Py_UCS2 ch = (Py_UCS2)value;
248 Py_UCS2 *to = (Py_UCS2 *)data + start;
249 const Py_UCS2 *end = to + length;
250 for (; to < end; ++to) *to = ch;
251 break;
252 }
253 case PyUnicode_4BYTE_KIND: {
Victor Stinner163403a2018-11-27 12:41:17 +0100254 assert(value <= MAX_UNICODE);
Victor Stinner59423e32018-11-26 13:40:01 +0100255 Py_UCS4 ch = value;
256 Py_UCS4 * to = (Py_UCS4 *)data + start;
257 const Py_UCS4 *end = to + length;
258 for (; to < end; ++to) *to = ch;
259 break;
260 }
261 default: Py_UNREACHABLE();
262 }
263}
264
265
Victor Stinner8a1a6cf2013-04-14 02:35:33 +0200266/* Forward declaration */
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700267static inline int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +0200268_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch);
Inada Naoki770847a2019-06-24 12:30:24 +0900269static inline void
270_PyUnicodeWriter_InitWithBuffer(_PyUnicodeWriter *writer, PyObject *buffer);
Victor Stinner709d23d2019-05-02 14:56:30 -0400271static PyObject *
272unicode_encode_utf8(PyObject *unicode, _Py_error_handler error_handler,
273 const char *errors);
274static PyObject *
275unicode_decode_utf8(const char *s, Py_ssize_t size,
276 _Py_error_handler error_handler, const char *errors,
277 Py_ssize_t *consumed);
Victor Stinner8a1a6cf2013-04-14 02:35:33 +0200278
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200279/* List of static strings. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200280static _Py_Identifier *static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200281
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000282/* Single character Unicode strings in the Latin-1 range are being
283 shared as well. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200284static PyObject *unicode_latin1[256] = {NULL};
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000285
/* Fast detection of the most frequent whitespace characters.
   128 entries indexed by ASCII code; an entry is 1 for a whitespace
   character and 0 otherwise. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00 - 0x07 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x08 - 0x0F:
       0x09 CHARACTER TABULATION, 0x0A LINE FEED, 0x0B LINE TABULATION,
       0x0C FORM FEED, 0x0D CARRIAGE RETURN */
    0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x17 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x18 - 0x1F:
       0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
       0x1E RECORD SEPARATOR, 0x1F UNIT SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20 - 0x27: 0x20 SPACE */
    1, 0, 0, 0, 0, 0, 0, 0,
    /* 0x28 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
316
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200317/* forward */
Victor Stinnerfe226c02011-10-03 03:52:20 +0200318static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200319static PyObject* get_latin1_char(unsigned char ch);
Victor Stinner488fa492011-12-12 00:01:39 +0100320static int unicode_modifiable(PyObject *unicode);
321
Victor Stinnerfe226c02011-10-03 03:52:20 +0200322
Alexander Belopolsky40018472011-02-26 01:02:56 +0000323static PyObject *
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100324_PyUnicode_FromUCS1(const Py_UCS1 *s, Py_ssize_t size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200325static PyObject *
326_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
327static PyObject *
328_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
329
330static PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +0000331unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000332 PyObject **errorHandler,const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +0100333 PyObject *unicode, PyObject **exceptionObject,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000334 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
335
Alexander Belopolsky40018472011-02-26 01:02:56 +0000336static void
337raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300338 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +0100339 PyObject *unicode,
340 Py_ssize_t startpos, Py_ssize_t endpos,
341 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000342
/* Same for linebreaks.
   128 entries indexed by ASCII code; an entry is 1 for a line break
   character and 0 otherwise. */
static const unsigned char ascii_linebreak[] = {
    /* 0x00 - 0x07 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x08 - 0x0F:
       0x0A LINE FEED, 0x0B LINE TABULATION,
       0x0C FORM FEED, 0x0D CARRIAGE RETURN */
    0, 0, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x17 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x18 - 0x1F:
       0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR, 0x1E RECORD SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
370
INADA Naoki3ae20562017-01-16 20:41:20 +0900371static int convert_uc(PyObject *obj, void *addr);
372
Serhiy Storchaka1009bf12015-04-03 23:53:51 +0300373#include "clinic/unicodeobject.c.h"
374
Victor Stinner3d4226a2018-08-29 22:21:32 +0200375_Py_error_handler
376_Py_GetErrorHandler(const char *errors)
Victor Stinner50149202015-09-22 00:26:54 +0200377{
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200378 if (errors == NULL || strcmp(errors, "strict") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200379 return _Py_ERROR_STRICT;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200380 }
381 if (strcmp(errors, "surrogateescape") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200382 return _Py_ERROR_SURROGATEESCAPE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200383 }
384 if (strcmp(errors, "replace") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200385 return _Py_ERROR_REPLACE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200386 }
387 if (strcmp(errors, "ignore") == 0) {
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200388 return _Py_ERROR_IGNORE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200389 }
390 if (strcmp(errors, "backslashreplace") == 0) {
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200391 return _Py_ERROR_BACKSLASHREPLACE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200392 }
393 if (strcmp(errors, "surrogatepass") == 0) {
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200394 return _Py_ERROR_SURROGATEPASS;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200395 }
396 if (strcmp(errors, "xmlcharrefreplace") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200397 return _Py_ERROR_XMLCHARREFREPLACE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200398 }
Victor Stinner50149202015-09-22 00:26:54 +0200399 return _Py_ERROR_OTHER;
400}
401
Victor Stinner709d23d2019-05-02 14:56:30 -0400402
403static _Py_error_handler
404get_error_handler_wide(const wchar_t *errors)
405{
406 if (errors == NULL || wcscmp(errors, L"strict") == 0) {
407 return _Py_ERROR_STRICT;
408 }
409 if (wcscmp(errors, L"surrogateescape") == 0) {
410 return _Py_ERROR_SURROGATEESCAPE;
411 }
412 if (wcscmp(errors, L"replace") == 0) {
413 return _Py_ERROR_REPLACE;
414 }
415 if (wcscmp(errors, L"ignore") == 0) {
416 return _Py_ERROR_IGNORE;
417 }
418 if (wcscmp(errors, L"backslashreplace") == 0) {
419 return _Py_ERROR_BACKSLASHREPLACE;
420 }
421 if (wcscmp(errors, L"surrogatepass") == 0) {
422 return _Py_ERROR_SURROGATEPASS;
423 }
424 if (wcscmp(errors, L"xmlcharrefreplace") == 0) {
425 return _Py_ERROR_XMLCHARREFREPLACE;
426 }
427 return _Py_ERROR_OTHER;
428}
429
430
Victor Stinner22eb6892019-06-26 00:51:05 +0200431static inline int
432unicode_check_encoding_errors(const char *encoding, const char *errors)
433{
434 if (encoding == NULL && errors == NULL) {
435 return 0;
436 }
437
438 PyInterpreterState *interp = _PyInterpreterState_GET_UNSAFE();
439#ifndef Py_DEBUG
440 /* In release mode, only check in development mode (-X dev) */
441 if (!interp->config.dev_mode) {
442 return 0;
443 }
444#else
445 /* Always check in debug mode */
446#endif
447
448 /* Avoid calling _PyCodec_Lookup() and PyCodec_LookupError() before the
449 codec registry is ready: before_PyUnicode_InitEncodings() is called. */
450 if (!interp->fs_codec.encoding) {
451 return 0;
452 }
453
454 if (encoding != NULL) {
455 PyObject *handler = _PyCodec_Lookup(encoding);
456 if (handler == NULL) {
457 return -1;
458 }
459 Py_DECREF(handler);
460 }
461
462 if (errors != NULL) {
463 PyObject *handler = PyCodec_LookupError(errors);
464 if (handler == NULL) {
465 return -1;
466 }
467 Py_DECREF(handler);
468 }
469 return 0;
470}
471
472
Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300473/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
474 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000475Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000476PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000477{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000478#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000479 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000480#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000481 /* This is actually an illegal character, so it should
482 not be passed to unichr. */
483 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000484#endif
485}
486
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +0200487int
Victor Stinner7931d9a2011-11-04 00:22:48 +0100488_PyUnicode_CheckConsistency(PyObject *op, int check_content)
Victor Stinner910337b2011-10-03 03:20:16 +0200489{
Victor Stinner68762572019-10-07 18:42:01 +0200490#define CHECK(expr) \
491 do { if (!(expr)) { _PyObject_ASSERT_FAILED_MSG(op, Py_STRINGIFY(expr)); } } while (0)
492
Victor Stinner910337b2011-10-03 03:20:16 +0200493 PyASCIIObject *ascii;
494 unsigned int kind;
495
Victor Stinner68762572019-10-07 18:42:01 +0200496 assert(op != NULL);
497 CHECK(PyUnicode_Check(op));
Victor Stinner910337b2011-10-03 03:20:16 +0200498
499 ascii = (PyASCIIObject *)op;
500 kind = ascii->state.kind;
501
Victor Stinnera3b334d2011-10-03 13:53:37 +0200502 if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
Victor Stinner68762572019-10-07 18:42:01 +0200503 CHECK(kind == PyUnicode_1BYTE_KIND);
504 CHECK(ascii->state.ready == 1);
Victor Stinner910337b2011-10-03 03:20:16 +0200505 }
Victor Stinnera41463c2011-10-04 01:05:08 +0200506 else {
Victor Stinner85041a52011-10-03 14:42:39 +0200507 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
Victor Stinner7f11ad42011-10-04 00:00:20 +0200508 void *data;
Victor Stinner910337b2011-10-03 03:20:16 +0200509
Victor Stinnera41463c2011-10-04 01:05:08 +0200510 if (ascii->state.compact == 1) {
511 data = compact + 1;
Victor Stinner68762572019-10-07 18:42:01 +0200512 CHECK(kind == PyUnicode_1BYTE_KIND
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200513 || kind == PyUnicode_2BYTE_KIND
514 || kind == PyUnicode_4BYTE_KIND);
Victor Stinner68762572019-10-07 18:42:01 +0200515 CHECK(ascii->state.ascii == 0);
516 CHECK(ascii->state.ready == 1);
517 CHECK(compact->utf8 != data);
Victor Stinnere30c0a12011-11-04 20:54:05 +0100518 }
519 else {
Victor Stinnera41463c2011-10-04 01:05:08 +0200520 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
521
522 data = unicode->data.any;
523 if (kind == PyUnicode_WCHAR_KIND) {
Victor Stinner68762572019-10-07 18:42:01 +0200524 CHECK(ascii->length == 0);
525 CHECK(ascii->hash == -1);
526 CHECK(ascii->state.compact == 0);
527 CHECK(ascii->state.ascii == 0);
528 CHECK(ascii->state.ready == 0);
529 CHECK(ascii->state.interned == SSTATE_NOT_INTERNED);
530 CHECK(ascii->wstr != NULL);
531 CHECK(data == NULL);
532 CHECK(compact->utf8 == NULL);
Victor Stinnera41463c2011-10-04 01:05:08 +0200533 }
534 else {
Victor Stinner68762572019-10-07 18:42:01 +0200535 CHECK(kind == PyUnicode_1BYTE_KIND
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200536 || kind == PyUnicode_2BYTE_KIND
537 || kind == PyUnicode_4BYTE_KIND);
Victor Stinner68762572019-10-07 18:42:01 +0200538 CHECK(ascii->state.compact == 0);
539 CHECK(ascii->state.ready == 1);
540 CHECK(data != NULL);
Victor Stinnera41463c2011-10-04 01:05:08 +0200541 if (ascii->state.ascii) {
Victor Stinner68762572019-10-07 18:42:01 +0200542 CHECK(compact->utf8 == data);
543 CHECK(compact->utf8_length == ascii->length);
Victor Stinnera41463c2011-10-04 01:05:08 +0200544 }
545 else
Victor Stinner68762572019-10-07 18:42:01 +0200546 CHECK(compact->utf8 != data);
Victor Stinnera41463c2011-10-04 01:05:08 +0200547 }
548 }
549 if (kind != PyUnicode_WCHAR_KIND) {
Victor Stinner7f11ad42011-10-04 00:00:20 +0200550 if (
551#if SIZEOF_WCHAR_T == 2
552 kind == PyUnicode_2BYTE_KIND
553#else
554 kind == PyUnicode_4BYTE_KIND
555#endif
556 )
Victor Stinnera41463c2011-10-04 01:05:08 +0200557 {
Victor Stinner68762572019-10-07 18:42:01 +0200558 CHECK(ascii->wstr == data);
559 CHECK(compact->wstr_length == ascii->length);
Victor Stinnera41463c2011-10-04 01:05:08 +0200560 } else
Victor Stinner68762572019-10-07 18:42:01 +0200561 CHECK(ascii->wstr != data);
Victor Stinner910337b2011-10-03 03:20:16 +0200562 }
Victor Stinnera41463c2011-10-04 01:05:08 +0200563
564 if (compact->utf8 == NULL)
Victor Stinner68762572019-10-07 18:42:01 +0200565 CHECK(compact->utf8_length == 0);
Victor Stinnera41463c2011-10-04 01:05:08 +0200566 if (ascii->wstr == NULL)
Victor Stinner68762572019-10-07 18:42:01 +0200567 CHECK(compact->wstr_length == 0);
Victor Stinner910337b2011-10-03 03:20:16 +0200568 }
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200569
570 /* check that the best kind is used: O(n) operation */
571 if (check_content && kind != PyUnicode_WCHAR_KIND) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200572 Py_ssize_t i;
573 Py_UCS4 maxchar = 0;
Victor Stinner718fbf02012-04-26 00:39:37 +0200574 void *data;
575 Py_UCS4 ch;
576
577 data = PyUnicode_DATA(ascii);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200578 for (i=0; i < ascii->length; i++)
579 {
Victor Stinner718fbf02012-04-26 00:39:37 +0200580 ch = PyUnicode_READ(kind, data, i);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200581 if (ch > maxchar)
582 maxchar = ch;
583 }
584 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinner77faf692011-11-20 18:56:05 +0100585 if (ascii->state.ascii == 0) {
Victor Stinner68762572019-10-07 18:42:01 +0200586 CHECK(maxchar >= 128);
587 CHECK(maxchar <= 255);
Victor Stinner77faf692011-11-20 18:56:05 +0100588 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200589 else
Victor Stinner68762572019-10-07 18:42:01 +0200590 CHECK(maxchar < 128);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200591 }
Victor Stinner77faf692011-11-20 18:56:05 +0100592 else if (kind == PyUnicode_2BYTE_KIND) {
Victor Stinner68762572019-10-07 18:42:01 +0200593 CHECK(maxchar >= 0x100);
594 CHECK(maxchar <= 0xFFFF);
Victor Stinner77faf692011-11-20 18:56:05 +0100595 }
596 else {
Victor Stinner68762572019-10-07 18:42:01 +0200597 CHECK(maxchar >= 0x10000);
598 CHECK(maxchar <= MAX_UNICODE);
Victor Stinner77faf692011-11-20 18:56:05 +0100599 }
Victor Stinner68762572019-10-07 18:42:01 +0200600 CHECK(PyUnicode_READ(kind, data, ascii->length) == 0);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200601 }
Benjamin Petersonccc51c12011-10-03 19:34:12 -0400602 return 1;
Victor Stinner68762572019-10-07 18:42:01 +0200603
604#undef CHECK
Benjamin Petersonccc51c12011-10-03 19:34:12 -0400605}
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200606
Victor Stinner910337b2011-10-03 03:20:16 +0200607
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100608static PyObject*
609unicode_result_wchar(PyObject *unicode)
610{
611#ifndef Py_DEBUG
612 Py_ssize_t len;
613
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100614 len = _PyUnicode_WSTR_LENGTH(unicode);
615 if (len == 0) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100616 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200617 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100618 }
619
620 if (len == 1) {
621 wchar_t ch = _PyUnicode_WSTR(unicode)[0];
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100622 if ((Py_UCS4)ch < 256) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100623 PyObject *latin1_char = get_latin1_char((unsigned char)ch);
624 Py_DECREF(unicode);
625 return latin1_char;
626 }
627 }
628
629 if (_PyUnicode_Ready(unicode) < 0) {
Victor Stinneraa771272012-10-04 02:32:58 +0200630 Py_DECREF(unicode);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100631 return NULL;
632 }
633#else
Victor Stinneraa771272012-10-04 02:32:58 +0200634 assert(Py_REFCNT(unicode) == 1);
635
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100636 /* don't make the result ready in debug mode to ensure that the caller
637 makes the string ready before using it */
638 assert(_PyUnicode_CheckConsistency(unicode, 1));
639#endif
640 return unicode;
641}
642
643static PyObject*
644unicode_result_ready(PyObject *unicode)
645{
646 Py_ssize_t length;
647
648 length = PyUnicode_GET_LENGTH(unicode);
649 if (length == 0) {
650 if (unicode != unicode_empty) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100651 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200652 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100653 }
654 return unicode_empty;
655 }
656
657 if (length == 1) {
Victor Stinner69ed0f42013-04-09 21:48:24 +0200658 void *data = PyUnicode_DATA(unicode);
659 int kind = PyUnicode_KIND(unicode);
660 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100661 if (ch < 256) {
662 PyObject *latin1_char = unicode_latin1[ch];
663 if (latin1_char != NULL) {
664 if (unicode != latin1_char) {
665 Py_INCREF(latin1_char);
666 Py_DECREF(unicode);
667 }
668 return latin1_char;
669 }
670 else {
671 assert(_PyUnicode_CheckConsistency(unicode, 1));
672 Py_INCREF(unicode);
673 unicode_latin1[ch] = unicode;
674 return unicode;
675 }
676 }
677 }
678
679 assert(_PyUnicode_CheckConsistency(unicode, 1));
680 return unicode;
681}
682
683static PyObject*
684unicode_result(PyObject *unicode)
685{
686 assert(_PyUnicode_CHECK(unicode));
687 if (PyUnicode_IS_READY(unicode))
688 return unicode_result_ready(unicode);
689 else
690 return unicode_result_wchar(unicode);
691}
692
Victor Stinnerc4b49542011-12-11 22:44:26 +0100693static PyObject*
694unicode_result_unchanged(PyObject *unicode)
695{
696 if (PyUnicode_CheckExact(unicode)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -0500697 if (PyUnicode_READY(unicode) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +0100698 return NULL;
699 Py_INCREF(unicode);
700 return unicode;
701 }
702 else
703 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +0100704 return _PyUnicode_Copy(unicode);
Victor Stinnerc4b49542011-12-11 22:44:26 +0100705}
706
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200707/* Implementation of the "backslashreplace" error handler for 8-bit encodings:
708 ASCII, Latin1, UTF-8, etc. */
709static char*
Victor Stinnerad771582015-10-09 12:38:53 +0200710backslashreplace(_PyBytesWriter *writer, char *str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200711 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
712{
Victor Stinnerad771582015-10-09 12:38:53 +0200713 Py_ssize_t size, i;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200714 Py_UCS4 ch;
715 enum PyUnicode_Kind kind;
716 void *data;
717
718 assert(PyUnicode_IS_READY(unicode));
719 kind = PyUnicode_KIND(unicode);
720 data = PyUnicode_DATA(unicode);
721
722 size = 0;
723 /* determine replacement size */
724 for (i = collstart; i < collend; ++i) {
725 Py_ssize_t incr;
726
727 ch = PyUnicode_READ(kind, data, i);
728 if (ch < 0x100)
729 incr = 2+2;
730 else if (ch < 0x10000)
731 incr = 2+4;
732 else {
733 assert(ch <= MAX_UNICODE);
Victor Stinner3fa36ff2015-10-09 03:37:11 +0200734 incr = 2+8;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200735 }
736 if (size > PY_SSIZE_T_MAX - incr) {
737 PyErr_SetString(PyExc_OverflowError,
738 "encoded result is too long for a Python string");
739 return NULL;
740 }
741 size += incr;
742 }
743
Victor Stinnerad771582015-10-09 12:38:53 +0200744 str = _PyBytesWriter_Prepare(writer, str, size);
745 if (str == NULL)
746 return NULL;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200747
748 /* generate replacement */
749 for (i = collstart; i < collend; ++i) {
750 ch = PyUnicode_READ(kind, data, i);
Victor Stinner797485e2015-10-09 03:17:30 +0200751 *str++ = '\\';
752 if (ch >= 0x00010000) {
753 *str++ = 'U';
754 *str++ = Py_hexdigits[(ch>>28)&0xf];
755 *str++ = Py_hexdigits[(ch>>24)&0xf];
756 *str++ = Py_hexdigits[(ch>>20)&0xf];
757 *str++ = Py_hexdigits[(ch>>16)&0xf];
758 *str++ = Py_hexdigits[(ch>>12)&0xf];
759 *str++ = Py_hexdigits[(ch>>8)&0xf];
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200760 }
Victor Stinner797485e2015-10-09 03:17:30 +0200761 else if (ch >= 0x100) {
762 *str++ = 'u';
763 *str++ = Py_hexdigits[(ch>>12)&0xf];
764 *str++ = Py_hexdigits[(ch>>8)&0xf];
765 }
766 else
767 *str++ = 'x';
768 *str++ = Py_hexdigits[(ch>>4)&0xf];
769 *str++ = Py_hexdigits[ch&0xf];
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200770 }
771 return str;
772}
773
774/* Implementation of the "xmlcharrefreplace" error handler for 8-bit encodings:
775 ASCII, Latin1, UTF-8, etc. */
776static char*
Victor Stinnerad771582015-10-09 12:38:53 +0200777xmlcharrefreplace(_PyBytesWriter *writer, char *str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200778 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
779{
Victor Stinnerad771582015-10-09 12:38:53 +0200780 Py_ssize_t size, i;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200781 Py_UCS4 ch;
782 enum PyUnicode_Kind kind;
783 void *data;
784
785 assert(PyUnicode_IS_READY(unicode));
786 kind = PyUnicode_KIND(unicode);
787 data = PyUnicode_DATA(unicode);
788
789 size = 0;
790 /* determine replacement size */
791 for (i = collstart; i < collend; ++i) {
792 Py_ssize_t incr;
793
794 ch = PyUnicode_READ(kind, data, i);
795 if (ch < 10)
796 incr = 2+1+1;
797 else if (ch < 100)
798 incr = 2+2+1;
799 else if (ch < 1000)
800 incr = 2+3+1;
801 else if (ch < 10000)
802 incr = 2+4+1;
803 else if (ch < 100000)
804 incr = 2+5+1;
805 else if (ch < 1000000)
806 incr = 2+6+1;
807 else {
808 assert(ch <= MAX_UNICODE);
809 incr = 2+7+1;
810 }
811 if (size > PY_SSIZE_T_MAX - incr) {
812 PyErr_SetString(PyExc_OverflowError,
813 "encoded result is too long for a Python string");
814 return NULL;
815 }
816 size += incr;
817 }
818
Victor Stinnerad771582015-10-09 12:38:53 +0200819 str = _PyBytesWriter_Prepare(writer, str, size);
820 if (str == NULL)
821 return NULL;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200822
823 /* generate replacement */
824 for (i = collstart; i < collend; ++i) {
825 str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
826 }
827 return str;
828}
829
/* --- Bloom Filters ----------------------------------------------------- */

/* stuff to implement simple "bloom filters" for Unicode characters.
   to keep things simple, we use a single bitmask, using the low bits of
   each unicode character (ch & (BLOOM_WIDTH - 1)) as the bit index. */

/* the linebreak mask is set up by Unicode_Init below */

/* BLOOM_WIDTH: number of bits of an unsigned long used by the filter */
#if LONG_BIT >= 128
#define BLOOM_WIDTH 128
#elif LONG_BIT >= 64
#define BLOOM_WIDTH 64
#elif LONG_BIT >= 32
#define BLOOM_WIDTH 32
#else
#error "LONG_BIT is smaller than 32"
#endif

#define BLOOM_MASK unsigned long

/* starts with every bit set, so BLOOM(bloom_linebreak, ch) is non-zero for
   every character until the mask is replaced */
static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0;

/* Non-zero if the bit selected by `ch` is set in `mask`.  Both arguments
   are parenthesized so that expressions such as `a | b` can be passed. */
#define BLOOM(mask, ch)     ((mask) & (1UL << ((ch) & (BLOOM_WIDTH - 1))))

/* Line break test: table lookup below 128, otherwise the bloom mask is
   checked first and Py_UNICODE_ISLINEBREAK() only on a hit. */
#define BLOOM_LINEBREAK(ch)                                             \
    ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000857
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700858static inline BLOOM_MASK
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200859make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000860{
Victor Stinnera85af502013-04-09 21:53:54 +0200861#define BLOOM_UPDATE(TYPE, MASK, PTR, LEN) \
862 do { \
863 TYPE *data = (TYPE *)PTR; \
864 TYPE *end = data + LEN; \
865 Py_UCS4 ch; \
866 for (; data != end; data++) { \
867 ch = *data; \
868 MASK |= (1UL << (ch & (BLOOM_WIDTH - 1))); \
869 } \
870 break; \
871 } while (0)
872
Thomas Wouters477c8d52006-05-27 19:21:47 +0000873 /* calculate simple bloom-style bitmask for a given unicode string */
874
Antoine Pitrouf068f942010-01-13 14:19:12 +0000875 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000876
877 mask = 0;
Victor Stinnera85af502013-04-09 21:53:54 +0200878 switch (kind) {
879 case PyUnicode_1BYTE_KIND:
880 BLOOM_UPDATE(Py_UCS1, mask, ptr, len);
881 break;
882 case PyUnicode_2BYTE_KIND:
883 BLOOM_UPDATE(Py_UCS2, mask, ptr, len);
884 break;
885 case PyUnicode_4BYTE_KIND:
886 BLOOM_UPDATE(Py_UCS4, mask, ptr, len);
887 break;
888 default:
Barry Warsawb2e57942017-09-14 18:13:16 -0700889 Py_UNREACHABLE();
Victor Stinnera85af502013-04-09 21:53:54 +0200890 }
Thomas Wouters477c8d52006-05-27 19:21:47 +0000891 return mask;
Victor Stinnera85af502013-04-09 21:53:54 +0200892
893#undef BLOOM_UPDATE
Thomas Wouters477c8d52006-05-27 19:21:47 +0000894}
895
Serhiy Storchaka21a663e2016-04-13 15:37:23 +0300896static int
897ensure_unicode(PyObject *obj)
898{
899 if (!PyUnicode_Check(obj)) {
900 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +0200901 "must be str, not %.100s",
902 Py_TYPE(obj)->tp_name);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +0300903 return -1;
904 }
905 return PyUnicode_READY(obj);
906}
907
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200908/* Compilation of templated routines */
909
910#include "stringlib/asciilib.h"
911#include "stringlib/fastsearch.h"
912#include "stringlib/partition.h"
913#include "stringlib/split.h"
914#include "stringlib/count.h"
915#include "stringlib/find.h"
916#include "stringlib/find_max_char.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200917#include "stringlib/undef.h"
918
919#include "stringlib/ucs1lib.h"
920#include "stringlib/fastsearch.h"
921#include "stringlib/partition.h"
922#include "stringlib/split.h"
923#include "stringlib/count.h"
924#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300925#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200926#include "stringlib/find_max_char.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200927#include "stringlib/undef.h"
928
929#include "stringlib/ucs2lib.h"
930#include "stringlib/fastsearch.h"
931#include "stringlib/partition.h"
932#include "stringlib/split.h"
933#include "stringlib/count.h"
934#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300935#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200936#include "stringlib/find_max_char.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200937#include "stringlib/undef.h"
938
939#include "stringlib/ucs4lib.h"
940#include "stringlib/fastsearch.h"
941#include "stringlib/partition.h"
942#include "stringlib/split.h"
943#include "stringlib/count.h"
944#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300945#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200946#include "stringlib/find_max_char.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200947#include "stringlib/undef.h"
948
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200949#include "stringlib/unicodedefs.h"
950#include "stringlib/fastsearch.h"
951#include "stringlib/count.h"
952#include "stringlib/find.h"
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100953#include "stringlib/undef.h"
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200954
Guido van Rossumd57fd912000-03-10 22:53:23 +0000955/* --- Unicode Object ----------------------------------------------------- */
956
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700957static inline Py_ssize_t
958findchar(const void *s, int kind,
959 Py_ssize_t size, Py_UCS4 ch,
960 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200961{
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200962 switch (kind) {
963 case PyUnicode_1BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200964 if ((Py_UCS1) ch != ch)
965 return -1;
966 if (direction > 0)
967 return ucs1lib_find_char((Py_UCS1 *) s, size, (Py_UCS1) ch);
968 else
969 return ucs1lib_rfind_char((Py_UCS1 *) s, size, (Py_UCS1) ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200970 case PyUnicode_2BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200971 if ((Py_UCS2) ch != ch)
972 return -1;
973 if (direction > 0)
974 return ucs2lib_find_char((Py_UCS2 *) s, size, (Py_UCS2) ch);
975 else
976 return ucs2lib_rfind_char((Py_UCS2 *) s, size, (Py_UCS2) ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200977 case PyUnicode_4BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200978 if (direction > 0)
979 return ucs4lib_find_char((Py_UCS4 *) s, size, ch);
980 else
981 return ucs4lib_rfind_char((Py_UCS4 *) s, size, ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200982 default:
Barry Warsawb2e57942017-09-14 18:13:16 -0700983 Py_UNREACHABLE();
Victor Stinner9e7a1bc2011-10-13 00:18:12 +0200984 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200985}
986
#ifdef Py_DEBUG
/* Fill the data of a Unicode string with invalid characters to detect bugs
   earlier.

   Only the characters from index `old_length` up to the current length are
   overwritten, every byte with 0xff; nothing happens when the string did
   not grow.

   _PyUnicode_CheckConsistency(str, 1) detects invalid characters, at least for
   ASCII and UCS-4 strings. U+00FF is invalid in ASCII and U+FFFFFFFF is an
   invalid character in Unicode 6.0. */
static void
unicode_fill_invalid(PyObject *unicode, Py_ssize_t old_length)
{
    int char_size = PyUnicode_KIND(unicode);
    Py_UCS1 *base = PyUnicode_1BYTE_DATA(unicode);
    Py_ssize_t new_length = _PyUnicode_LENGTH(unicode);

    if (new_length > old_length) {
        /* offsets are in bytes: character index times character width */
        memset(base + old_length * char_size, 0xff,
               (new_length - old_length) * char_size);
    }
}
#endif
1005
Victor Stinnerfe226c02011-10-03 03:52:20 +02001006static PyObject*
1007resize_compact(PyObject *unicode, Py_ssize_t length)
1008{
1009 Py_ssize_t char_size;
1010 Py_ssize_t struct_size;
1011 Py_ssize_t new_size;
1012 int share_wstr;
Victor Stinner84def372011-12-11 20:04:56 +01001013 PyObject *new_unicode;
Victor Stinnerafffce42012-10-03 23:03:17 +02001014#ifdef Py_DEBUG
1015 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
1016#endif
1017
Victor Stinner79891572012-05-03 13:43:07 +02001018 assert(unicode_modifiable(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001019 assert(PyUnicode_IS_READY(unicode));
Victor Stinner488fa492011-12-12 00:01:39 +01001020 assert(PyUnicode_IS_COMPACT(unicode));
1021
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001022 char_size = PyUnicode_KIND(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +01001023 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerfe226c02011-10-03 03:52:20 +02001024 struct_size = sizeof(PyASCIIObject);
1025 else
1026 struct_size = sizeof(PyCompactUnicodeObject);
Victor Stinnerc379ead2011-10-03 12:52:27 +02001027 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001028
Victor Stinnerfe226c02011-10-03 03:52:20 +02001029 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
1030 PyErr_NoMemory();
1031 return NULL;
1032 }
1033 new_size = (struct_size + (length + 1) * char_size);
1034
Serhiy Storchaka7aa69082015-12-03 01:02:03 +02001035 if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) {
1036 PyObject_DEL(_PyUnicode_UTF8(unicode));
1037 _PyUnicode_UTF8(unicode) = NULL;
1038 _PyUnicode_UTF8_LENGTH(unicode) = 0;
1039 }
Victor Stinner84def372011-12-11 20:04:56 +01001040 _Py_DEC_REFTOTAL;
1041 _Py_ForgetReference(unicode);
1042
Serhiy Storchaka20b39b22014-09-28 11:27:24 +03001043 new_unicode = (PyObject *)PyObject_REALLOC(unicode, new_size);
Victor Stinner84def372011-12-11 20:04:56 +01001044 if (new_unicode == NULL) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001045 _Py_NewReference(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001046 PyErr_NoMemory();
1047 return NULL;
1048 }
Victor Stinner84def372011-12-11 20:04:56 +01001049 unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001050 _Py_NewReference(unicode);
Victor Stinner84def372011-12-11 20:04:56 +01001051
Victor Stinnerfe226c02011-10-03 03:52:20 +02001052 _PyUnicode_LENGTH(unicode) = length;
Victor Stinnerc379ead2011-10-03 12:52:27 +02001053 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001054 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +01001055 if (!PyUnicode_IS_ASCII(unicode))
Victor Stinnerc379ead2011-10-03 12:52:27 +02001056 _PyUnicode_WSTR_LENGTH(unicode) = length;
1057 }
Victor Stinnerbbbac2e2013-02-07 23:12:46 +01001058 else if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) {
1059 PyObject_DEL(_PyUnicode_WSTR(unicode));
1060 _PyUnicode_WSTR(unicode) = NULL;
Victor Stinner5bc03a62016-01-27 16:56:53 +01001061 if (!PyUnicode_IS_ASCII(unicode))
1062 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Victor Stinnerbbbac2e2013-02-07 23:12:46 +01001063 }
Victor Stinnerafffce42012-10-03 23:03:17 +02001064#ifdef Py_DEBUG
1065 unicode_fill_invalid(unicode, old_length);
1066#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001067 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
1068 length, 0);
Victor Stinner79891572012-05-03 13:43:07 +02001069 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001070 return unicode;
1071}
1072
Alexander Belopolsky40018472011-02-26 01:02:56 +00001073static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001074resize_inplace(PyObject *unicode, Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001075{
Victor Stinner95663112011-10-04 01:03:50 +02001076 wchar_t *wstr;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001077 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001078 assert(!PyUnicode_IS_COMPACT(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001079 assert(Py_REFCNT(unicode) == 1);
Tim Petersced69f82003-09-16 20:30:58 +00001080
Victor Stinnerfe226c02011-10-03 03:52:20 +02001081 if (PyUnicode_IS_READY(unicode)) {
1082 Py_ssize_t char_size;
Victor Stinner1c8d0c72011-10-03 12:11:00 +02001083 int share_wstr, share_utf8;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001084 void *data;
Victor Stinnerafffce42012-10-03 23:03:17 +02001085#ifdef Py_DEBUG
1086 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
1087#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001088
1089 data = _PyUnicode_DATA_ANY(unicode);
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001090 char_size = PyUnicode_KIND(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +02001091 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
1092 share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001093
1094 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
1095 PyErr_NoMemory();
1096 return -1;
1097 }
1098 new_size = (length + 1) * char_size;
1099
Victor Stinner7a9105a2011-12-12 00:13:42 +01001100 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
1101 {
1102 PyObject_DEL(_PyUnicode_UTF8(unicode));
1103 _PyUnicode_UTF8(unicode) = NULL;
1104 _PyUnicode_UTF8_LENGTH(unicode) = 0;
1105 }
1106
Victor Stinnerfe226c02011-10-03 03:52:20 +02001107 data = (PyObject *)PyObject_REALLOC(data, new_size);
1108 if (data == NULL) {
1109 PyErr_NoMemory();
1110 return -1;
1111 }
1112 _PyUnicode_DATA_ANY(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +02001113 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001114 _PyUnicode_WSTR(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +02001115 _PyUnicode_WSTR_LENGTH(unicode) = length;
1116 }
1117 if (share_utf8) {
Victor Stinner1c8d0c72011-10-03 12:11:00 +02001118 _PyUnicode_UTF8(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +02001119 _PyUnicode_UTF8_LENGTH(unicode) = length;
1120 }
Victor Stinnerfe226c02011-10-03 03:52:20 +02001121 _PyUnicode_LENGTH(unicode) = length;
1122 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
Victor Stinnerafffce42012-10-03 23:03:17 +02001123#ifdef Py_DEBUG
1124 unicode_fill_invalid(unicode, old_length);
1125#endif
Victor Stinner95663112011-10-04 01:03:50 +02001126 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001127 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001128 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001129 }
Victor Stinnerfe226c02011-10-03 03:52:20 +02001130 }
Victor Stinner95663112011-10-04 01:03:50 +02001131 assert(_PyUnicode_WSTR(unicode) != NULL);
1132
1133 /* check for integer overflow */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -07001134 if (length > PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) - 1) {
Victor Stinner95663112011-10-04 01:03:50 +02001135 PyErr_NoMemory();
1136 return -1;
1137 }
Victor Stinner7a9105a2011-12-12 00:13:42 +01001138 new_size = sizeof(wchar_t) * (length + 1);
Victor Stinner95663112011-10-04 01:03:50 +02001139 wstr = _PyUnicode_WSTR(unicode);
Victor Stinner7a9105a2011-12-12 00:13:42 +01001140 wstr = PyObject_REALLOC(wstr, new_size);
Victor Stinner95663112011-10-04 01:03:50 +02001141 if (!wstr) {
1142 PyErr_NoMemory();
1143 return -1;
1144 }
1145 _PyUnicode_WSTR(unicode) = wstr;
1146 _PyUnicode_WSTR(unicode)[length] = 0;
1147 _PyUnicode_WSTR_LENGTH(unicode) = length;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001148 assert(_PyUnicode_CheckConsistency(unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001149 return 0;
1150}
1151
Victor Stinnerfe226c02011-10-03 03:52:20 +02001152static PyObject*
1153resize_copy(PyObject *unicode, Py_ssize_t length)
1154{
1155 Py_ssize_t copy_length;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001156 if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001157 PyObject *copy;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001158
Serhiy Storchaka2e58f1a2016-10-09 23:44:48 +03001159 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001160
1161 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
1162 if (copy == NULL)
1163 return NULL;
1164
1165 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
Victor Stinnerd3f08822012-05-29 12:57:52 +02001166 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, copy_length);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001167 return copy;
Victor Stinner8cfcbed2011-10-03 23:19:21 +02001168 }
1169 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001170 PyObject *w;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001171
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001172 w = (PyObject*)_PyUnicode_New(length);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001173 if (w == NULL)
1174 return NULL;
1175 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
1176 copy_length = Py_MIN(copy_length, length);
Christian Heimesf051e432016-09-13 20:22:02 +02001177 memcpy(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
Victor Stinnerc6cf1ba2012-10-23 02:54:47 +02001178 copy_length * sizeof(wchar_t));
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001179 return w;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001180 }
1181}
1182
Guido van Rossumd57fd912000-03-10 22:53:23 +00001183/* We allocate one more byte to make sure the string is
Martin v. Löwis47383402007-08-15 07:32:56 +00001184 Ux0000 terminated; some code (e.g. new_identifier)
1185 relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +00001186
1187 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +00001188 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +00001189
1190*/
1191
Alexander Belopolsky40018472011-02-26 01:02:56 +00001192static PyUnicodeObject *
1193_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001194{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001195 PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001196 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001197
Thomas Wouters477c8d52006-05-27 19:21:47 +00001198 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001199 if (length == 0 && unicode_empty != NULL) {
1200 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001201 return (PyUnicodeObject*)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001202 }
1203
Neal Norwitz3ce5d922008-08-24 07:08:55 +00001204 /* Ensure we won't overflow the size. */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -07001205 if (length > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00001206 return (PyUnicodeObject *)PyErr_NoMemory();
1207 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001208 if (length < 0) {
1209 PyErr_SetString(PyExc_SystemError,
1210 "Negative size passed to _PyUnicode_New");
1211 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001212 }
1213
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001214 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
1215 if (unicode == NULL)
1216 return NULL;
1217 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
Victor Stinner68b674c2013-10-29 19:31:43 +01001218
1219 _PyUnicode_WSTR_LENGTH(unicode) = length;
1220 _PyUnicode_HASH(unicode) = -1;
1221 _PyUnicode_STATE(unicode).interned = 0;
1222 _PyUnicode_STATE(unicode).kind = 0;
1223 _PyUnicode_STATE(unicode).compact = 0;
1224 _PyUnicode_STATE(unicode).ready = 0;
1225 _PyUnicode_STATE(unicode).ascii = 0;
1226 _PyUnicode_DATA_ANY(unicode) = NULL;
1227 _PyUnicode_LENGTH(unicode) = 0;
1228 _PyUnicode_UTF8(unicode) = NULL;
1229 _PyUnicode_UTF8_LENGTH(unicode) = 0;
1230
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001231 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
1232 if (!_PyUnicode_WSTR(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001233 Py_DECREF(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00001234 PyErr_NoMemory();
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001235 return NULL;
Guido van Rossum3c1bb802000-04-27 20:13:50 +00001236 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001237
Jeremy Hyltond8082792003-09-16 19:41:39 +00001238 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +00001239 * the caller fails before initializing str -- unicode_resize()
1240 * reads str[0], and the Keep-Alive optimization can keep memory
1241 * allocated for str alive across a call to unicode_dealloc(unicode).
1242 * We don't want unicode_resize to read uninitialized memory in
1243 * that case.
1244 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001245 _PyUnicode_WSTR(unicode)[0] = 0;
1246 _PyUnicode_WSTR(unicode)[length] = 0;
Victor Stinner68b674c2013-10-29 19:31:43 +01001247
Victor Stinner7931d9a2011-11-04 00:22:48 +01001248 assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001249 return unicode;
1250}
1251
Victor Stinnerf42dc442011-10-02 23:33:16 +02001252static const char*
1253unicode_kind_name(PyObject *unicode)
1254{
Victor Stinner42dfd712011-10-03 14:41:45 +02001255 /* don't check consistency: unicode_kind_name() is called from
1256 _PyUnicode_Dump() */
Victor Stinnerf42dc442011-10-02 23:33:16 +02001257 if (!PyUnicode_IS_COMPACT(unicode))
1258 {
1259 if (!PyUnicode_IS_READY(unicode))
1260 return "wstr";
Benjamin Petersonead6b532011-12-20 17:23:42 -06001261 switch (PyUnicode_KIND(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001262 {
1263 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001264 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001265 return "legacy ascii";
1266 else
1267 return "legacy latin1";
1268 case PyUnicode_2BYTE_KIND:
1269 return "legacy UCS2";
1270 case PyUnicode_4BYTE_KIND:
1271 return "legacy UCS4";
1272 default:
1273 return "<legacy invalid kind>";
1274 }
1275 }
1276 assert(PyUnicode_IS_READY(unicode));
Benjamin Petersonead6b532011-12-20 17:23:42 -06001277 switch (PyUnicode_KIND(unicode)) {
Victor Stinnerf42dc442011-10-02 23:33:16 +02001278 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001279 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001280 return "ascii";
1281 else
Victor Stinnera3b334d2011-10-03 13:53:37 +02001282 return "latin1";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001283 case PyUnicode_2BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001284 return "UCS2";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001285 case PyUnicode_4BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001286 return "UCS4";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001287 default:
1288 return "<invalid compact kind>";
1289 }
1290}
1291
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001292#ifdef Py_DEBUG
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001293/* Functions wrapping macros for use in debugger */
Victor Stinnera42de742018-11-22 10:25:22 +01001294char *_PyUnicode_utf8(void *unicode_raw){
1295 PyObject *unicode = _PyObject_CAST(unicode_raw);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001296 return PyUnicode_UTF8(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001297}
1298
Victor Stinnera42de742018-11-22 10:25:22 +01001299void *_PyUnicode_compact_data(void *unicode_raw) {
1300 PyObject *unicode = _PyObject_CAST(unicode_raw);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001301 return _PyUnicode_COMPACT_DATA(unicode);
1302}
Victor Stinnera42de742018-11-22 10:25:22 +01001303void *_PyUnicode_data(void *unicode_raw) {
1304 PyObject *unicode = _PyObject_CAST(unicode_raw);
Zackery Spytz1a2252e2019-05-06 10:56:51 -06001305 printf("obj %p\n", (void*)unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001306 printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
1307 printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
1308 printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
1309 printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
1310 printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
1311 return PyUnicode_DATA(unicode);
1312}
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001313
1314void
1315_PyUnicode_Dump(PyObject *op)
1316{
1317 PyASCIIObject *ascii = (PyASCIIObject *)op;
Victor Stinnera849a4b2011-10-03 12:12:11 +02001318 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
1319 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
1320 void *data;
Victor Stinner0d60e872011-10-23 19:47:19 +02001321
Victor Stinnera849a4b2011-10-03 12:12:11 +02001322 if (ascii->state.compact)
Victor Stinner0d60e872011-10-23 19:47:19 +02001323 {
1324 if (ascii->state.ascii)
1325 data = (ascii + 1);
1326 else
1327 data = (compact + 1);
1328 }
Victor Stinnera849a4b2011-10-03 12:12:11 +02001329 else
1330 data = unicode->data.any;
Victor Stinner293f3f52014-07-01 08:57:10 +02001331 printf("%s: len=%" PY_FORMAT_SIZE_T "u, ",
1332 unicode_kind_name(op), ascii->length);
Victor Stinner0d60e872011-10-23 19:47:19 +02001333
Victor Stinnera849a4b2011-10-03 12:12:11 +02001334 if (ascii->wstr == data)
1335 printf("shared ");
Zackery Spytz1a2252e2019-05-06 10:56:51 -06001336 printf("wstr=%p", (void *)ascii->wstr);
Victor Stinner0d60e872011-10-23 19:47:19 +02001337
Victor Stinnera3b334d2011-10-03 13:53:37 +02001338 if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
Victor Stinner293f3f52014-07-01 08:57:10 +02001339 printf(" (%" PY_FORMAT_SIZE_T "u), ", compact->wstr_length);
Victor Stinnera849a4b2011-10-03 12:12:11 +02001340 if (!ascii->state.compact && compact->utf8 == unicode->data.any)
1341 printf("shared ");
Victor Stinner293f3f52014-07-01 08:57:10 +02001342 printf("utf8=%p (%" PY_FORMAT_SIZE_T "u)",
Zackery Spytz1a2252e2019-05-06 10:56:51 -06001343 (void *)compact->utf8, compact->utf8_length);
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001344 }
Victor Stinnera849a4b2011-10-03 12:12:11 +02001345 printf(", data=%p\n", data);
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001346}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001347#endif
1348
1349PyObject *
1350PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
1351{
1352 PyObject *obj;
1353 PyCompactUnicodeObject *unicode;
1354 void *data;
Victor Stinner8f825062012-04-27 13:55:39 +02001355 enum PyUnicode_Kind kind;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001356 int is_sharing, is_ascii;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001357 Py_ssize_t char_size;
1358 Py_ssize_t struct_size;
1359
1360 /* Optimization for empty strings */
1361 if (size == 0 && unicode_empty != NULL) {
1362 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001363 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001364 }
1365
Victor Stinner9e9d6892011-10-04 01:02:02 +02001366 is_ascii = 0;
1367 is_sharing = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001368 struct_size = sizeof(PyCompactUnicodeObject);
1369 if (maxchar < 128) {
Victor Stinner8f825062012-04-27 13:55:39 +02001370 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001371 char_size = 1;
1372 is_ascii = 1;
1373 struct_size = sizeof(PyASCIIObject);
1374 }
1375 else if (maxchar < 256) {
Victor Stinner8f825062012-04-27 13:55:39 +02001376 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001377 char_size = 1;
1378 }
1379 else if (maxchar < 65536) {
Victor Stinner8f825062012-04-27 13:55:39 +02001380 kind = PyUnicode_2BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001381 char_size = 2;
1382 if (sizeof(wchar_t) == 2)
1383 is_sharing = 1;
1384 }
1385 else {
Victor Stinnerc9590ad2012-03-04 01:34:37 +01001386 if (maxchar > MAX_UNICODE) {
1387 PyErr_SetString(PyExc_SystemError,
1388 "invalid maximum character passed to PyUnicode_New");
1389 return NULL;
1390 }
Victor Stinner8f825062012-04-27 13:55:39 +02001391 kind = PyUnicode_4BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001392 char_size = 4;
1393 if (sizeof(wchar_t) == 4)
1394 is_sharing = 1;
1395 }
1396
1397 /* Ensure we won't overflow the size. */
1398 if (size < 0) {
1399 PyErr_SetString(PyExc_SystemError,
1400 "Negative size passed to PyUnicode_New");
1401 return NULL;
1402 }
1403 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1404 return PyErr_NoMemory();
1405
1406 /* Duplicated allocation code from _PyObject_New() instead of a call to
1407 * PyObject_New() so we are able to allocate space for the object and
1408 * it's data buffer.
1409 */
1410 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
1411 if (obj == NULL)
1412 return PyErr_NoMemory();
1413 obj = PyObject_INIT(obj, &PyUnicode_Type);
1414 if (obj == NULL)
1415 return NULL;
1416
1417 unicode = (PyCompactUnicodeObject *)obj;
1418 if (is_ascii)
1419 data = ((PyASCIIObject*)obj) + 1;
1420 else
1421 data = unicode + 1;
1422 _PyUnicode_LENGTH(unicode) = size;
1423 _PyUnicode_HASH(unicode) = -1;
1424 _PyUnicode_STATE(unicode).interned = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001425 _PyUnicode_STATE(unicode).kind = kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001426 _PyUnicode_STATE(unicode).compact = 1;
1427 _PyUnicode_STATE(unicode).ready = 1;
1428 _PyUnicode_STATE(unicode).ascii = is_ascii;
1429 if (is_ascii) {
1430 ((char*)data)[size] = 0;
1431 _PyUnicode_WSTR(unicode) = NULL;
1432 }
Victor Stinner8f825062012-04-27 13:55:39 +02001433 else if (kind == PyUnicode_1BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001434 ((char*)data)[size] = 0;
1435 _PyUnicode_WSTR(unicode) = NULL;
1436 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001437 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001438 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001439 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001440 else {
1441 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001442 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001443 if (kind == PyUnicode_2BYTE_KIND)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001444 ((Py_UCS2*)data)[size] = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001445 else /* kind == PyUnicode_4BYTE_KIND */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001446 ((Py_UCS4*)data)[size] = 0;
1447 if (is_sharing) {
1448 _PyUnicode_WSTR_LENGTH(unicode) = size;
1449 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
1450 }
1451 else {
1452 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1453 _PyUnicode_WSTR(unicode) = NULL;
1454 }
1455 }
Victor Stinner8f825062012-04-27 13:55:39 +02001456#ifdef Py_DEBUG
Victor Stinnerafffce42012-10-03 23:03:17 +02001457 unicode_fill_invalid((PyObject*)unicode, 0);
Victor Stinner8f825062012-04-27 13:55:39 +02001458#endif
Victor Stinner7931d9a2011-11-04 00:22:48 +01001459 assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001460 return obj;
1461}
1462
#if SIZEOF_WCHAR_T == 2
/* Expand the 16-bit wchar_t range [begin, end) into the UCS4 buffer of
   `unicode`.  A high surrogate immediately followed by a low surrogate is
   folded into one code point; any other unit (including a lone surrogate) is
   copied through as-is.  The remaining width conversions are implemented as
   macros for efficiency.

   `unicode` must already be a 4-byte-kind string whose length equals the
   number of code points that will be produced; the asserts below check this.
   The terminating null character is not written here. */
static void
unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
                              PyObject *unicode)
{
    const wchar_t *src = begin;
    Py_UCS4 *dst;

    assert(unicode != NULL);
    assert(_PyUnicode_CHECK(unicode));
    assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
    dst = PyUnicode_4BYTE_DATA(unicode);

    while (src < end) {
        int is_pair;

        /* There must still be room for the code point about to be written. */
        assert(dst < (PyUnicode_4BYTE_DATA(unicode) +
                      _PyUnicode_GET_LENGTH(unicode)));

        is_pair = (Py_UNICODE_IS_HIGH_SURROGATE(src[0])
                   && (src + 1) < end
                   && Py_UNICODE_IS_LOW_SURROGATE(src[1]));
        if (!is_pair) {
            *dst++ = *src++;
            continue;
        }
        *dst++ = Py_UNICODE_JOIN_SURROGATES(src[0], src[1]);
        src += 2;
    }

    /* Exactly _PyUnicode_GET_LENGTH(unicode) code points were produced. */
    assert(dst == (PyUnicode_4BYTE_DATA(unicode) +
                   _PyUnicode_GET_LENGTH(unicode)));
}
#endif
1502
Victor Stinnercd9950f2011-10-02 00:34:53 +02001503static int
Victor Stinner488fa492011-12-12 00:01:39 +01001504unicode_check_modifiable(PyObject *unicode)
Victor Stinnercd9950f2011-10-02 00:34:53 +02001505{
Victor Stinner488fa492011-12-12 00:01:39 +01001506 if (!unicode_modifiable(unicode)) {
Victor Stinner01698042011-10-04 00:04:26 +02001507 PyErr_SetString(PyExc_SystemError,
Victor Stinner488fa492011-12-12 00:01:39 +01001508 "Cannot modify a string currently used");
Victor Stinnercd9950f2011-10-02 00:34:53 +02001509 return -1;
1510 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02001511 return 0;
1512}
1513
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001514static int
1515_copy_characters(PyObject *to, Py_ssize_t to_start,
1516 PyObject *from, Py_ssize_t from_start,
1517 Py_ssize_t how_many, int check_maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001518{
Victor Stinnera0702ab2011-09-29 14:14:38 +02001519 unsigned int from_kind, to_kind;
1520 void *from_data, *to_data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001521
Victor Stinneree4544c2012-05-09 22:24:08 +02001522 assert(0 <= how_many);
1523 assert(0 <= from_start);
1524 assert(0 <= to_start);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001525 assert(PyUnicode_Check(from));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001526 assert(PyUnicode_IS_READY(from));
Victor Stinneree4544c2012-05-09 22:24:08 +02001527 assert(from_start + how_many <= PyUnicode_GET_LENGTH(from));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001528
Victor Stinnerd3f08822012-05-29 12:57:52 +02001529 assert(PyUnicode_Check(to));
1530 assert(PyUnicode_IS_READY(to));
1531 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1532
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001533 if (how_many == 0)
1534 return 0;
1535
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001536 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001537 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001538 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001539 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001540
Victor Stinnerf1852262012-06-16 16:38:26 +02001541#ifdef Py_DEBUG
1542 if (!check_maxchar
1543 && PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to))
1544 {
1545 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1546 Py_UCS4 ch;
1547 Py_ssize_t i;
1548 for (i=0; i < how_many; i++) {
1549 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1550 assert(ch <= to_maxchar);
1551 }
1552 }
1553#endif
1554
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001555 if (from_kind == to_kind) {
Victor Stinnerf1852262012-06-16 16:38:26 +02001556 if (check_maxchar
1557 && !PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))
1558 {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001559 /* Writing Latin-1 characters into an ASCII string requires to
1560 check that all written characters are pure ASCII */
Victor Stinnerf1852262012-06-16 16:38:26 +02001561 Py_UCS4 max_char;
1562 max_char = ucs1lib_find_max_char(from_data,
1563 (Py_UCS1*)from_data + how_many);
1564 if (max_char >= 128)
1565 return -1;
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001566 }
Christian Heimesf051e432016-09-13 20:22:02 +02001567 memcpy((char*)to_data + to_kind * to_start,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001568 (char*)from_data + from_kind * from_start,
1569 to_kind * how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001570 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001571 else if (from_kind == PyUnicode_1BYTE_KIND
1572 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001573 {
1574 _PyUnicode_CONVERT_BYTES(
1575 Py_UCS1, Py_UCS2,
1576 PyUnicode_1BYTE_DATA(from) + from_start,
1577 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1578 PyUnicode_2BYTE_DATA(to) + to_start
1579 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001580 }
Victor Stinner157f83f2011-09-28 21:41:31 +02001581 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001582 && to_kind == PyUnicode_4BYTE_KIND)
1583 {
1584 _PyUnicode_CONVERT_BYTES(
1585 Py_UCS1, Py_UCS4,
1586 PyUnicode_1BYTE_DATA(from) + from_start,
1587 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1588 PyUnicode_4BYTE_DATA(to) + to_start
1589 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001590 }
1591 else if (from_kind == PyUnicode_2BYTE_KIND
1592 && to_kind == PyUnicode_4BYTE_KIND)
1593 {
1594 _PyUnicode_CONVERT_BYTES(
1595 Py_UCS2, Py_UCS4,
1596 PyUnicode_2BYTE_DATA(from) + from_start,
1597 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1598 PyUnicode_4BYTE_DATA(to) + to_start
1599 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001600 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001601 else {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001602 assert (PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to));
1603
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001604 if (!check_maxchar) {
1605 if (from_kind == PyUnicode_2BYTE_KIND
1606 && to_kind == PyUnicode_1BYTE_KIND)
1607 {
1608 _PyUnicode_CONVERT_BYTES(
1609 Py_UCS2, Py_UCS1,
1610 PyUnicode_2BYTE_DATA(from) + from_start,
1611 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1612 PyUnicode_1BYTE_DATA(to) + to_start
1613 );
1614 }
1615 else if (from_kind == PyUnicode_4BYTE_KIND
1616 && to_kind == PyUnicode_1BYTE_KIND)
1617 {
1618 _PyUnicode_CONVERT_BYTES(
1619 Py_UCS4, Py_UCS1,
1620 PyUnicode_4BYTE_DATA(from) + from_start,
1621 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1622 PyUnicode_1BYTE_DATA(to) + to_start
1623 );
1624 }
1625 else if (from_kind == PyUnicode_4BYTE_KIND
1626 && to_kind == PyUnicode_2BYTE_KIND)
1627 {
1628 _PyUnicode_CONVERT_BYTES(
1629 Py_UCS4, Py_UCS2,
1630 PyUnicode_4BYTE_DATA(from) + from_start,
1631 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1632 PyUnicode_2BYTE_DATA(to) + to_start
1633 );
1634 }
1635 else {
Barry Warsawb2e57942017-09-14 18:13:16 -07001636 Py_UNREACHABLE();
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001637 }
1638 }
Victor Stinnerf1852262012-06-16 16:38:26 +02001639 else {
Victor Stinnera0702ab2011-09-29 14:14:38 +02001640 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001641 Py_UCS4 ch;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001642 Py_ssize_t i;
1643
Victor Stinnera0702ab2011-09-29 14:14:38 +02001644 for (i=0; i < how_many; i++) {
1645 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001646 if (ch > to_maxchar)
1647 return -1;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001648 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1649 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001650 }
1651 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001652 return 0;
1653}
1654
Victor Stinnerd3f08822012-05-29 12:57:52 +02001655void
1656_PyUnicode_FastCopyCharacters(
1657 PyObject *to, Py_ssize_t to_start,
1658 PyObject *from, Py_ssize_t from_start, Py_ssize_t how_many)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001659{
1660 (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1661}
1662
1663Py_ssize_t
1664PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1665 PyObject *from, Py_ssize_t from_start,
1666 Py_ssize_t how_many)
1667{
1668 int err;
1669
1670 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1671 PyErr_BadInternalCall();
1672 return -1;
1673 }
1674
Benjamin Petersonbac79492012-01-14 13:34:47 -05001675 if (PyUnicode_READY(from) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001676 return -1;
Benjamin Petersonbac79492012-01-14 13:34:47 -05001677 if (PyUnicode_READY(to) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001678 return -1;
1679
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001680 if ((size_t)from_start > (size_t)PyUnicode_GET_LENGTH(from)) {
Victor Stinnerd3f08822012-05-29 12:57:52 +02001681 PyErr_SetString(PyExc_IndexError, "string index out of range");
1682 return -1;
1683 }
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001684 if ((size_t)to_start > (size_t)PyUnicode_GET_LENGTH(to)) {
Victor Stinnerd3f08822012-05-29 12:57:52 +02001685 PyErr_SetString(PyExc_IndexError, "string index out of range");
1686 return -1;
1687 }
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001688 if (how_many < 0) {
1689 PyErr_SetString(PyExc_SystemError, "how_many cannot be negative");
1690 return -1;
1691 }
1692 how_many = Py_MIN(PyUnicode_GET_LENGTH(from)-from_start, how_many);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001693 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1694 PyErr_Format(PyExc_SystemError,
Victor Stinnera33bce02014-07-04 22:47:46 +02001695 "Cannot write %zi characters at %zi "
1696 "in a string of %zi characters",
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001697 how_many, to_start, PyUnicode_GET_LENGTH(to));
1698 return -1;
1699 }
1700
1701 if (how_many == 0)
1702 return 0;
1703
Victor Stinner488fa492011-12-12 00:01:39 +01001704 if (unicode_check_modifiable(to))
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001705 return -1;
1706
1707 err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1708 if (err) {
1709 PyErr_Format(PyExc_SystemError,
1710 "Cannot copy %s characters "
1711 "into a string of %s characters",
1712 unicode_kind_name(from),
1713 unicode_kind_name(to));
1714 return -1;
1715 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001716 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001717}
1718
Victor Stinner17222162011-09-28 22:15:37 +02001719/* Find the maximum code point and count the number of surrogate pairs so a
1720 correct string length can be computed before converting a string to UCS4.
1721 This function counts single surrogates as a character and not as a pair.
1722
1723 Return 0 on success, or -1 on error. */
1724static int
1725find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1726 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001727{
1728 const wchar_t *iter;
Victor Stinner8faf8212011-12-08 22:14:11 +01001729 Py_UCS4 ch;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001730
Victor Stinnerc53be962011-10-02 21:33:54 +02001731 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001732 *num_surrogates = 0;
1733 *maxchar = 0;
1734
1735 for (iter = begin; iter < end; ) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001736#if SIZEOF_WCHAR_T == 2
Victor Stinnercf77da92013-03-06 01:09:24 +01001737 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1738 && (iter+1) < end
1739 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
1740 {
1741 ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
1742 ++(*num_surrogates);
1743 iter += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001744 }
1745 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001746#endif
Victor Stinner8faf8212011-12-08 22:14:11 +01001747 {
1748 ch = *iter;
1749 iter++;
1750 }
1751 if (ch > *maxchar) {
1752 *maxchar = ch;
1753 if (*maxchar > MAX_UNICODE) {
1754 PyErr_Format(PyExc_ValueError,
1755 "character U+%x is not in range [U+0000; U+10ffff]",
1756 ch);
1757 return -1;
1758 }
1759 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001760 }
1761 return 0;
1762}
1763
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001764int
1765_PyUnicode_Ready(PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001766{
1767 wchar_t *end;
1768 Py_UCS4 maxchar = 0;
1769 Py_ssize_t num_surrogates;
1770#if SIZEOF_WCHAR_T == 2
1771 Py_ssize_t length_wo_surrogates;
1772#endif
1773
Georg Brandl7597add2011-10-05 16:36:47 +02001774 /* _PyUnicode_Ready() is only intended for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +02001775 strings were created using _PyObject_New() and where no canonical
1776 representation (the str field) has been set yet aka strings
1777 which are not yet ready. */
Victor Stinner910337b2011-10-03 03:20:16 +02001778 assert(_PyUnicode_CHECK(unicode));
1779 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001780 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerc3c74152011-10-02 20:39:55 +02001781 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001782 assert(_PyUnicode_UTF8(unicode) == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +02001783 /* Actually, it should neither be interned nor be anything else: */
1784 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001785
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001786 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +02001787 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +02001788 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001789 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001790
1791 if (maxchar < 256) {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001792 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1793 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001794 PyErr_NoMemory();
1795 return -1;
1796 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001797 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001798 _PyUnicode_WSTR(unicode), end,
1799 PyUnicode_1BYTE_DATA(unicode));
1800 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1801 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1802 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1803 if (maxchar < 128) {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001804 _PyUnicode_STATE(unicode).ascii = 1;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001805 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001806 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001807 }
1808 else {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001809 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001810 _PyUnicode_UTF8(unicode) = NULL;
1811 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001812 }
1813 PyObject_FREE(_PyUnicode_WSTR(unicode));
1814 _PyUnicode_WSTR(unicode) = NULL;
1815 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1816 }
1817 /* In this case we might have to convert down from 4-byte native
1818 wchar_t to 2-byte unicode. */
1819 else if (maxchar < 65536) {
1820 assert(num_surrogates == 0 &&
1821 "FindMaxCharAndNumSurrogatePairs() messed up");
1822
Victor Stinner506f5922011-09-28 22:34:18 +02001823#if SIZEOF_WCHAR_T == 2
1824 /* We can share representations and are done. */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001825 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Victor Stinner506f5922011-09-28 22:34:18 +02001826 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1827 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1828 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001829 _PyUnicode_UTF8(unicode) = NULL;
1830 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001831#else
1832 /* sizeof(wchar_t) == 4 */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001833 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
Victor Stinner506f5922011-09-28 22:34:18 +02001834 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
Victor Stinnerc3c74152011-10-02 20:39:55 +02001835 if (!_PyUnicode_DATA_ANY(unicode)) {
Victor Stinner506f5922011-09-28 22:34:18 +02001836 PyErr_NoMemory();
1837 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001838 }
Victor Stinner506f5922011-09-28 22:34:18 +02001839 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1840 _PyUnicode_WSTR(unicode), end,
1841 PyUnicode_2BYTE_DATA(unicode));
1842 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1843 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1844 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001845 _PyUnicode_UTF8(unicode) = NULL;
1846 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001847 PyObject_FREE(_PyUnicode_WSTR(unicode));
1848 _PyUnicode_WSTR(unicode) = NULL;
1849 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1850#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001851 }
1852 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
1853 else {
1854#if SIZEOF_WCHAR_T == 2
1855 /* in case the native representation is 2-bytes, we need to allocate a
1856 new normalized 4-byte version. */
1857 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
Serhiy Storchakae55181f2015-02-20 21:34:06 +02001858 if (length_wo_surrogates > PY_SSIZE_T_MAX / 4 - 1) {
1859 PyErr_NoMemory();
1860 return -1;
1861 }
Victor Stinnerc3c74152011-10-02 20:39:55 +02001862 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1863 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001864 PyErr_NoMemory();
1865 return -1;
1866 }
1867 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1868 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001869 _PyUnicode_UTF8(unicode) = NULL;
1870 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner126c5592011-10-03 04:17:10 +02001871 /* unicode_convert_wchar_to_ucs4() requires a ready string */
1872 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerc53be962011-10-02 21:33:54 +02001873 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001874 PyObject_FREE(_PyUnicode_WSTR(unicode));
1875 _PyUnicode_WSTR(unicode) = NULL;
1876 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1877#else
1878 assert(num_surrogates == 0);
1879
Victor Stinnerc3c74152011-10-02 20:39:55 +02001880 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001881 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001882 _PyUnicode_UTF8(unicode) = NULL;
1883 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001884 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1885#endif
1886 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1887 }
1888 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001889 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001890 return 0;
1891}
1892
Alexander Belopolsky40018472011-02-26 01:02:56 +00001893static void
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001894unicode_dealloc(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001895{
Walter Dörwald16807132007-05-25 13:52:07 +00001896 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001897 case SSTATE_NOT_INTERNED:
1898 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001899
Benjamin Peterson29060642009-01-31 22:14:21 +00001900 case SSTATE_INTERNED_MORTAL:
1901 /* revive dead object temporarily for DelItem */
1902 Py_REFCNT(unicode) = 3;
Victor Stinner7931d9a2011-11-04 00:22:48 +01001903 if (PyDict_DelItem(interned, unicode) != 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00001904 Py_FatalError(
1905 "deletion of interned string failed");
1906 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001907
Benjamin Peterson29060642009-01-31 22:14:21 +00001908 case SSTATE_INTERNED_IMMORTAL:
1909 Py_FatalError("Immortal interned string died.");
Stefan Krahf432a322017-08-21 13:09:59 +02001910 /* fall through */
Walter Dörwald16807132007-05-25 13:52:07 +00001911
Benjamin Peterson29060642009-01-31 22:14:21 +00001912 default:
1913 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +00001914 }
1915
Victor Stinner03490912011-10-03 23:45:12 +02001916 if (_PyUnicode_HAS_WSTR_MEMORY(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001917 PyObject_DEL(_PyUnicode_WSTR(unicode));
Victor Stinner829c0ad2011-10-03 01:08:02 +02001918 if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001919 PyObject_DEL(_PyUnicode_UTF8(unicode));
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001920 if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode))
1921 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001922
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001923 Py_TYPE(unicode)->tp_free(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001924}
1925
#ifdef Py_DEBUG
/* Debug helper: return 1 if `unicode` is one of the shared singletons (the
   empty string, or the cached one-character string for a code point below
   256), and 0 otherwise. */
static int
unicode_is_singleton(PyObject *unicode)
{
    const PyASCIIObject *header = (const PyASCIIObject *)unicode;
    Py_UCS4 first;

    if (unicode == unicode_empty) {
        return 1;
    }
    /* Only a ready string of length 1 can be a cached character. */
    if (header->state.kind == PyUnicode_WCHAR_KIND || header->length != 1) {
        return 0;
    }
    first = PyUnicode_READ_CHAR(unicode, 0);
    return first < 256 && unicode_latin1[first] == unicode;
}
#endif
1942
Alexander Belopolsky40018472011-02-26 01:02:56 +00001943static int
Victor Stinner488fa492011-12-12 00:01:39 +01001944unicode_modifiable(PyObject *unicode)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001945{
Victor Stinner488fa492011-12-12 00:01:39 +01001946 assert(_PyUnicode_CHECK(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001947 if (Py_REFCNT(unicode) != 1)
1948 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001949 if (_PyUnicode_HASH(unicode) != -1)
1950 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001951 if (PyUnicode_CHECK_INTERNED(unicode))
1952 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001953 if (!PyUnicode_CheckExact(unicode))
1954 return 0;
Victor Stinner77bb47b2011-10-03 20:06:05 +02001955#ifdef Py_DEBUG
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001956 /* singleton refcount is greater than 1 */
1957 assert(!unicode_is_singleton(unicode));
Victor Stinner77bb47b2011-10-03 20:06:05 +02001958#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001959 return 1;
1960}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001961
Victor Stinnerfe226c02011-10-03 03:52:20 +02001962static int
1963unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1964{
1965 PyObject *unicode;
1966 Py_ssize_t old_length;
1967
1968 assert(p_unicode != NULL);
1969 unicode = *p_unicode;
1970
1971 assert(unicode != NULL);
1972 assert(PyUnicode_Check(unicode));
1973 assert(0 <= length);
1974
Victor Stinner910337b2011-10-03 03:20:16 +02001975 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001976 old_length = PyUnicode_WSTR_LENGTH(unicode);
1977 else
1978 old_length = PyUnicode_GET_LENGTH(unicode);
1979 if (old_length == length)
1980 return 0;
1981
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001982 if (length == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +02001983 _Py_INCREF_UNICODE_EMPTY();
1984 if (!unicode_empty)
Benjamin Peterson29060642009-01-31 22:14:21 +00001985 return -1;
Serhiy Storchaka57a01d32016-04-10 18:05:40 +03001986 Py_SETREF(*p_unicode, unicode_empty);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001987 return 0;
1988 }
1989
Victor Stinner488fa492011-12-12 00:01:39 +01001990 if (!unicode_modifiable(unicode)) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001991 PyObject *copy = resize_copy(unicode, length);
1992 if (copy == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001993 return -1;
Serhiy Storchaka57a01d32016-04-10 18:05:40 +03001994 Py_SETREF(*p_unicode, copy);
Benjamin Peterson29060642009-01-31 22:14:21 +00001995 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001996 }
1997
Victor Stinnerfe226c02011-10-03 03:52:20 +02001998 if (PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001999 PyObject *new_unicode = resize_compact(unicode, length);
2000 if (new_unicode == NULL)
Victor Stinnerfe226c02011-10-03 03:52:20 +02002001 return -1;
Victor Stinnerb0a82a62011-12-12 13:08:33 +01002002 *p_unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +02002003 return 0;
Benjamin Peterson4bfce8f2011-10-03 19:35:07 -04002004 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002005 return resize_inplace(unicode, length);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002006}
2007
Alexander Belopolsky40018472011-02-26 01:02:56 +00002008int
Victor Stinnerfe226c02011-10-03 03:52:20 +02002009PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00002010{
Victor Stinnerfe226c02011-10-03 03:52:20 +02002011 PyObject *unicode;
2012 if (p_unicode == NULL) {
2013 PyErr_BadInternalCall();
2014 return -1;
2015 }
2016 unicode = *p_unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01002017 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
Victor Stinnerfe226c02011-10-03 03:52:20 +02002018 {
2019 PyErr_BadInternalCall();
2020 return -1;
2021 }
2022 return unicode_resize(p_unicode, length);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00002023}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002024
Serhiy Storchakad65c9492015-11-02 14:10:23 +02002025/* Copy an ASCII or latin1 char* string into a Python Unicode string.
Victor Stinnerc5166102012-02-22 13:55:02 +01002026
Victor Stinnerb429d3b2012-02-22 21:22:20 +01002027 WARNING: The function doesn't copy the terminating null character and
2028 doesn't check the maximum character (may write a latin1 character in an
2029 ASCII string). */
Victor Stinner184252a2012-06-16 02:57:41 +02002030static void
2031unicode_write_cstr(PyObject *unicode, Py_ssize_t index,
2032 const char *str, Py_ssize_t len)
Victor Stinnerc5166102012-02-22 13:55:02 +01002033{
2034 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
2035 void *data = PyUnicode_DATA(unicode);
Victor Stinner184252a2012-06-16 02:57:41 +02002036 const char *end = str + len;
Victor Stinnerc5166102012-02-22 13:55:02 +01002037
2038 switch (kind) {
2039 case PyUnicode_1BYTE_KIND: {
Victor Stinnerc5166102012-02-22 13:55:02 +01002040 assert(index + len <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner8c6db452012-10-06 00:40:45 +02002041#ifdef Py_DEBUG
2042 if (PyUnicode_IS_ASCII(unicode)) {
2043 Py_UCS4 maxchar = ucs1lib_find_max_char(
2044 (const Py_UCS1*)str,
2045 (const Py_UCS1*)str + len);
2046 assert(maxchar < 128);
2047 }
2048#endif
Antoine Pitrouba6bafc2012-02-22 16:41:50 +01002049 memcpy((char *) data + index, str, len);
Victor Stinner184252a2012-06-16 02:57:41 +02002050 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01002051 }
2052 case PyUnicode_2BYTE_KIND: {
2053 Py_UCS2 *start = (Py_UCS2 *)data + index;
2054 Py_UCS2 *ucs2 = start;
2055 assert(index <= PyUnicode_GET_LENGTH(unicode));
2056
Victor Stinner184252a2012-06-16 02:57:41 +02002057 for (; str < end; ++ucs2, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01002058 *ucs2 = (Py_UCS2)*str;
2059
2060 assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner184252a2012-06-16 02:57:41 +02002061 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01002062 }
2063 default: {
2064 Py_UCS4 *start = (Py_UCS4 *)data + index;
2065 Py_UCS4 *ucs4 = start;
2066 assert(kind == PyUnicode_4BYTE_KIND);
2067 assert(index <= PyUnicode_GET_LENGTH(unicode));
2068
Victor Stinner184252a2012-06-16 02:57:41 +02002069 for (; str < end; ++ucs4, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01002070 *ucs4 = (Py_UCS4)*str;
2071
2072 assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinnerc5166102012-02-22 13:55:02 +01002073 }
2074 }
2075}
2076
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002077static PyObject*
2078get_latin1_char(unsigned char ch)
2079{
Victor Stinnera464fc12011-10-02 20:39:30 +02002080 PyObject *unicode = unicode_latin1[ch];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002081 if (!unicode) {
Victor Stinnera464fc12011-10-02 20:39:30 +02002082 unicode = PyUnicode_New(1, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002083 if (!unicode)
2084 return NULL;
2085 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002086 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002087 unicode_latin1[ch] = unicode;
2088 }
2089 Py_INCREF(unicode);
Victor Stinnera464fc12011-10-02 20:39:30 +02002090 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002091}
2092
Victor Stinner985a82a2014-01-03 12:53:47 +01002093static PyObject*
2094unicode_char(Py_UCS4 ch)
2095{
2096 PyObject *unicode;
2097
2098 assert(ch <= MAX_UNICODE);
2099
Victor Stinnerf3b46b42014-01-03 13:16:00 +01002100 if (ch < 256)
2101 return get_latin1_char(ch);
2102
Victor Stinner985a82a2014-01-03 12:53:47 +01002103 unicode = PyUnicode_New(1, ch);
2104 if (unicode == NULL)
2105 return NULL;
Serhiy Storchaka2e58f1a2016-10-09 23:44:48 +03002106
2107 assert(PyUnicode_KIND(unicode) != PyUnicode_1BYTE_KIND);
2108 if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
Victor Stinner985a82a2014-01-03 12:53:47 +01002109 PyUnicode_2BYTE_DATA(unicode)[0] = (Py_UCS2)ch;
Serhiy Storchaka2e58f1a2016-10-09 23:44:48 +03002110 } else {
Victor Stinner985a82a2014-01-03 12:53:47 +01002111 assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
2112 PyUnicode_4BYTE_DATA(unicode)[0] = ch;
2113 }
2114 assert(_PyUnicode_CheckConsistency(unicode, 1));
2115 return unicode;
2116}
2117
Alexander Belopolsky40018472011-02-26 01:02:56 +00002118PyObject *
2119PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002120{
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02002121 if (u == NULL)
2122 return (PyObject*)_PyUnicode_New(size);
2123
2124 if (size < 0) {
2125 PyErr_BadInternalCall();
2126 return NULL;
2127 }
2128
2129 return PyUnicode_FromWideChar(u, size);
2130}
2131
2132PyObject *
2133PyUnicode_FromWideChar(const wchar_t *u, Py_ssize_t size)
2134{
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002135 PyObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002136 Py_UCS4 maxchar = 0;
2137 Py_ssize_t num_surrogates;
2138
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02002139 if (u == NULL && size != 0) {
2140 PyErr_BadInternalCall();
2141 return NULL;
2142 }
2143
2144 if (size == -1) {
2145 size = wcslen(u);
2146 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002147
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002148 /* If the Unicode data is known at construction time, we can apply
2149 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002150
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002151 /* Optimization for empty strings */
Serhiy Storchaka678db842013-01-26 12:16:36 +02002152 if (size == 0)
2153 _Py_RETURN_UNICODE_EMPTY();
Tim Petersced69f82003-09-16 20:30:58 +00002154
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002155 /* Single character Unicode objects in the Latin-1 range are
2156 shared when using this constructor */
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002157 if (size == 1 && (Py_UCS4)*u < 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002158 return get_latin1_char((unsigned char)*u);
2159
2160 /* If not empty and not single character, copy the Unicode data
2161 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02002162 if (find_maxchar_surrogates(u, u + size,
2163 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002164 return NULL;
2165
Victor Stinner8faf8212011-12-08 22:14:11 +01002166 unicode = PyUnicode_New(size - num_surrogates, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002167 if (!unicode)
2168 return NULL;
2169
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002170 switch (PyUnicode_KIND(unicode)) {
2171 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02002172 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002173 u, u + size, PyUnicode_1BYTE_DATA(unicode));
2174 break;
2175 case PyUnicode_2BYTE_KIND:
2176#if Py_UNICODE_SIZE == 2
Christian Heimesf051e432016-09-13 20:22:02 +02002177 memcpy(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002178#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02002179 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002180 u, u + size, PyUnicode_2BYTE_DATA(unicode));
2181#endif
2182 break;
2183 case PyUnicode_4BYTE_KIND:
2184#if SIZEOF_WCHAR_T == 2
2185 /* This is the only case which has to process surrogates, thus
2186 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02002187 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002188#else
2189 assert(num_surrogates == 0);
Christian Heimesf051e432016-09-13 20:22:02 +02002190 memcpy(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002191#endif
2192 break;
2193 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07002194 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002195 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002196
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002197 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002198}
2199
Alexander Belopolsky40018472011-02-26 01:02:56 +00002200PyObject *
2201PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00002202{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002203 if (size < 0) {
2204 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00002205 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00002206 return NULL;
2207 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002208 if (u != NULL)
2209 return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
2210 else
2211 return (PyObject *)_PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00002212}
2213
Alexander Belopolsky40018472011-02-26 01:02:56 +00002214PyObject *
2215PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00002216{
2217 size_t size = strlen(u);
2218 if (size > PY_SSIZE_T_MAX) {
2219 PyErr_SetString(PyExc_OverflowError, "input too long");
2220 return NULL;
2221 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002222 return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002223}
2224
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002225PyObject *
2226_PyUnicode_FromId(_Py_Identifier *id)
2227{
2228 if (!id->object) {
Victor Stinnerd1cd99b2012-02-07 23:05:55 +01002229 id->object = PyUnicode_DecodeUTF8Stateful(id->string,
2230 strlen(id->string),
2231 NULL, NULL);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002232 if (!id->object)
2233 return NULL;
2234 PyUnicode_InternInPlace(&id->object);
2235 assert(!id->next);
2236 id->next = static_strings;
2237 static_strings = id;
2238 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002239 return id->object;
2240}
2241
2242void
2243_PyUnicode_ClearStaticStrings()
2244{
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002245 _Py_Identifier *tmp, *s = static_strings;
2246 while (s) {
Serhiy Storchaka505ff752014-02-09 13:33:53 +02002247 Py_CLEAR(s->object);
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002248 tmp = s->next;
2249 s->next = NULL;
2250 s = tmp;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002251 }
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002252 static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002253}
2254
Benjamin Peterson0df54292012-03-26 14:50:32 -04002255/* Internal function, doesn't check maximum character */
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002256
Victor Stinnerd3f08822012-05-29 12:57:52 +02002257PyObject*
2258_PyUnicode_FromASCII(const char *buffer, Py_ssize_t size)
Victor Stinner702c7342011-10-05 13:50:52 +02002259{
Victor Stinnerd3f08822012-05-29 12:57:52 +02002260 const unsigned char *s = (const unsigned char *)buffer;
Victor Stinner785938e2011-12-11 20:09:03 +01002261 PyObject *unicode;
Victor Stinnere6b2d442011-12-11 21:54:30 +01002262 if (size == 1) {
Victor Stinner0617b6e2011-10-05 23:26:01 +02002263#ifdef Py_DEBUG
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002264 assert((unsigned char)s[0] < 128);
Victor Stinner0617b6e2011-10-05 23:26:01 +02002265#endif
Antoine Pitrou7c46da72011-10-06 22:07:51 +02002266 return get_latin1_char(s[0]);
Victor Stinnere6b2d442011-12-11 21:54:30 +01002267 }
Victor Stinner785938e2011-12-11 20:09:03 +01002268 unicode = PyUnicode_New(size, 127);
2269 if (!unicode)
Victor Stinner702c7342011-10-05 13:50:52 +02002270 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01002271 memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
2272 assert(_PyUnicode_CheckConsistency(unicode, 1));
2273 return unicode;
Victor Stinner702c7342011-10-05 13:50:52 +02002274}
2275
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002276static Py_UCS4
2277kind_maxchar_limit(unsigned int kind)
2278{
Benjamin Petersonead6b532011-12-20 17:23:42 -06002279 switch (kind) {
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002280 case PyUnicode_1BYTE_KIND:
2281 return 0x80;
2282 case PyUnicode_2BYTE_KIND:
2283 return 0x100;
2284 case PyUnicode_4BYTE_KIND:
2285 return 0x10000;
2286 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07002287 Py_UNREACHABLE();
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002288 }
2289}
2290
Victor Stinner702c7342011-10-05 13:50:52 +02002291static PyObject*
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002292_PyUnicode_FromUCS1(const Py_UCS1* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00002293{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002294 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002295 unsigned char max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002296
Serhiy Storchaka678db842013-01-26 12:16:36 +02002297 if (size == 0)
2298 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002299 assert(size > 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02002300 if (size == 1)
2301 return get_latin1_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002302
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002303 max_char = ucs1lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002304 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002305 if (!res)
2306 return NULL;
2307 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002308 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002309 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00002310}
2311
Victor Stinnere57b1c02011-09-28 22:20:48 +02002312static PyObject*
2313_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002314{
2315 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002316 Py_UCS2 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002317
Serhiy Storchaka678db842013-01-26 12:16:36 +02002318 if (size == 0)
2319 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002320 assert(size > 0);
Victor Stinner985a82a2014-01-03 12:53:47 +01002321 if (size == 1)
2322 return unicode_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002323
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002324 max_char = ucs2lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002325 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002326 if (!res)
2327 return NULL;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002328 if (max_char >= 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002329 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002330 else {
2331 _PyUnicode_CONVERT_BYTES(
2332 Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
2333 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002334 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002335 return res;
2336}
2337
Victor Stinnere57b1c02011-09-28 22:20:48 +02002338static PyObject*
2339_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002340{
2341 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002342 Py_UCS4 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002343
Serhiy Storchaka678db842013-01-26 12:16:36 +02002344 if (size == 0)
2345 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002346 assert(size > 0);
Victor Stinner985a82a2014-01-03 12:53:47 +01002347 if (size == 1)
2348 return unicode_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002349
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002350 max_char = ucs4lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002351 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002352 if (!res)
2353 return NULL;
Antoine Pitrou950468e2011-10-11 22:45:48 +02002354 if (max_char < 256)
2355 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
2356 PyUnicode_1BYTE_DATA(res));
2357 else if (max_char < 0x10000)
2358 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
2359 PyUnicode_2BYTE_DATA(res));
2360 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002361 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002362 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002363 return res;
2364}
2365
2366PyObject*
2367PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
2368{
Victor Stinnercfed46e2011-11-22 01:29:14 +01002369 if (size < 0) {
2370 PyErr_SetString(PyExc_ValueError, "size must be positive");
2371 return NULL;
2372 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002373 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002374 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002375 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002376 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002377 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002378 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002379 return _PyUnicode_FromUCS4(buffer, size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002380 default:
Victor Stinnerb9275c12011-10-05 14:01:42 +02002381 PyErr_SetString(PyExc_SystemError, "invalid kind");
2382 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002383 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002384}
2385
Victor Stinnerece58de2012-04-23 23:36:38 +02002386Py_UCS4
2387_PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end)
2388{
2389 enum PyUnicode_Kind kind;
2390 void *startptr, *endptr;
2391
2392 assert(PyUnicode_IS_READY(unicode));
2393 assert(0 <= start);
2394 assert(end <= PyUnicode_GET_LENGTH(unicode));
2395 assert(start <= end);
2396
2397 if (start == 0 && end == PyUnicode_GET_LENGTH(unicode))
2398 return PyUnicode_MAX_CHAR_VALUE(unicode);
2399
2400 if (start == end)
2401 return 127;
2402
Victor Stinner94d558b2012-04-27 22:26:58 +02002403 if (PyUnicode_IS_ASCII(unicode))
2404 return 127;
2405
Victor Stinnerece58de2012-04-23 23:36:38 +02002406 kind = PyUnicode_KIND(unicode);
Benjamin Petersonf3b7d862012-04-23 18:07:01 -04002407 startptr = PyUnicode_DATA(unicode);
Benjamin Petersonb9f4c9d2012-04-23 21:45:40 -04002408 endptr = (char *)startptr + end * kind;
2409 startptr = (char *)startptr + start * kind;
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002410 switch(kind) {
2411 case PyUnicode_1BYTE_KIND:
2412 return ucs1lib_find_max_char(startptr, endptr);
2413 case PyUnicode_2BYTE_KIND:
2414 return ucs2lib_find_max_char(startptr, endptr);
2415 case PyUnicode_4BYTE_KIND:
2416 return ucs4lib_find_max_char(startptr, endptr);
Victor Stinnerece58de2012-04-23 23:36:38 +02002417 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07002418 Py_UNREACHABLE();
Victor Stinnerece58de2012-04-23 23:36:38 +02002419 }
2420}
2421
Victor Stinner25a4b292011-10-06 12:31:55 +02002422/* Ensure that a string uses the most efficient storage, if it is not the
2423 case: create a new string with of the right kind. Write NULL into *p_unicode
2424 on error. */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02002425static void
Victor Stinner25a4b292011-10-06 12:31:55 +02002426unicode_adjust_maxchar(PyObject **p_unicode)
2427{
2428 PyObject *unicode, *copy;
2429 Py_UCS4 max_char;
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002430 Py_ssize_t len;
Victor Stinner25a4b292011-10-06 12:31:55 +02002431 unsigned int kind;
2432
2433 assert(p_unicode != NULL);
2434 unicode = *p_unicode;
2435 assert(PyUnicode_IS_READY(unicode));
2436 if (PyUnicode_IS_ASCII(unicode))
2437 return;
2438
2439 len = PyUnicode_GET_LENGTH(unicode);
2440 kind = PyUnicode_KIND(unicode);
2441 if (kind == PyUnicode_1BYTE_KIND) {
2442 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002443 max_char = ucs1lib_find_max_char(u, u + len);
2444 if (max_char >= 128)
2445 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002446 }
2447 else if (kind == PyUnicode_2BYTE_KIND) {
2448 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002449 max_char = ucs2lib_find_max_char(u, u + len);
2450 if (max_char >= 256)
2451 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002452 }
2453 else {
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002454 const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
Victor Stinner25a4b292011-10-06 12:31:55 +02002455 assert(kind == PyUnicode_4BYTE_KIND);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002456 max_char = ucs4lib_find_max_char(u, u + len);
2457 if (max_char >= 0x10000)
2458 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002459 }
Victor Stinner25a4b292011-10-06 12:31:55 +02002460 copy = PyUnicode_New(len, max_char);
Victor Stinnerca439ee2012-06-16 03:17:34 +02002461 if (copy != NULL)
2462 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, len);
Victor Stinner25a4b292011-10-06 12:31:55 +02002463 Py_DECREF(unicode);
2464 *p_unicode = copy;
2465}
2466
Victor Stinner034f6cf2011-09-30 02:26:44 +02002467PyObject*
Victor Stinnerbf6e5602011-12-12 01:53:47 +01002468_PyUnicode_Copy(PyObject *unicode)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002469{
Victor Stinner87af4f22011-11-21 23:03:47 +01002470 Py_ssize_t length;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002471 PyObject *copy;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002472
Victor Stinner034f6cf2011-09-30 02:26:44 +02002473 if (!PyUnicode_Check(unicode)) {
2474 PyErr_BadInternalCall();
2475 return NULL;
2476 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05002477 if (PyUnicode_READY(unicode) == -1)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002478 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002479
Victor Stinner87af4f22011-11-21 23:03:47 +01002480 length = PyUnicode_GET_LENGTH(unicode);
2481 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002482 if (!copy)
2483 return NULL;
2484 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
2485
Christian Heimesf051e432016-09-13 20:22:02 +02002486 memcpy(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
Victor Stinner87af4f22011-11-21 23:03:47 +01002487 length * PyUnicode_KIND(unicode));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002488 assert(_PyUnicode_CheckConsistency(copy, 1));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002489 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02002490}
2491
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002492
Victor Stinnerbc603d12011-10-02 01:00:40 +02002493/* Widen Unicode objects to larger buffers. Don't write terminating null
2494 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002495
2496void*
2497_PyUnicode_AsKind(PyObject *s, unsigned int kind)
2498{
Victor Stinnerbc603d12011-10-02 01:00:40 +02002499 Py_ssize_t len;
2500 void *result;
2501 unsigned int skind;
2502
Benjamin Petersonbac79492012-01-14 13:34:47 -05002503 if (PyUnicode_READY(s) == -1)
Victor Stinnerbc603d12011-10-02 01:00:40 +02002504 return NULL;
2505
2506 len = PyUnicode_GET_LENGTH(s);
2507 skind = PyUnicode_KIND(s);
2508 if (skind >= kind) {
Victor Stinner01698042011-10-04 00:04:26 +02002509 PyErr_SetString(PyExc_SystemError, "invalid widening attempt");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002510 return NULL;
2511 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002512 switch (kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02002513 case PyUnicode_2BYTE_KIND:
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002514 result = PyMem_New(Py_UCS2, len);
Victor Stinnerbc603d12011-10-02 01:00:40 +02002515 if (!result)
2516 return PyErr_NoMemory();
2517 assert(skind == PyUnicode_1BYTE_KIND);
2518 _PyUnicode_CONVERT_BYTES(
2519 Py_UCS1, Py_UCS2,
2520 PyUnicode_1BYTE_DATA(s),
2521 PyUnicode_1BYTE_DATA(s) + len,
2522 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002523 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002524 case PyUnicode_4BYTE_KIND:
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002525 result = PyMem_New(Py_UCS4, len);
Victor Stinnerbc603d12011-10-02 01:00:40 +02002526 if (!result)
2527 return PyErr_NoMemory();
2528 if (skind == PyUnicode_2BYTE_KIND) {
2529 _PyUnicode_CONVERT_BYTES(
2530 Py_UCS2, Py_UCS4,
2531 PyUnicode_2BYTE_DATA(s),
2532 PyUnicode_2BYTE_DATA(s) + len,
2533 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002534 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02002535 else {
2536 assert(skind == PyUnicode_1BYTE_KIND);
2537 _PyUnicode_CONVERT_BYTES(
2538 Py_UCS1, Py_UCS4,
2539 PyUnicode_1BYTE_DATA(s),
2540 PyUnicode_1BYTE_DATA(s) + len,
2541 result);
2542 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002543 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002544 default:
2545 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002546 }
Victor Stinner01698042011-10-04 00:04:26 +02002547 PyErr_SetString(PyExc_SystemError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002548 return NULL;
2549}
2550
2551static Py_UCS4*
2552as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2553 int copy_null)
2554{
2555 int kind;
2556 void *data;
2557 Py_ssize_t len, targetlen;
2558 if (PyUnicode_READY(string) == -1)
2559 return NULL;
2560 kind = PyUnicode_KIND(string);
2561 data = PyUnicode_DATA(string);
2562 len = PyUnicode_GET_LENGTH(string);
2563 targetlen = len;
2564 if (copy_null)
2565 targetlen++;
2566 if (!target) {
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002567 target = PyMem_New(Py_UCS4, targetlen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002568 if (!target) {
2569 PyErr_NoMemory();
2570 return NULL;
2571 }
2572 }
2573 else {
2574 if (targetsize < targetlen) {
2575 PyErr_Format(PyExc_SystemError,
2576 "string is longer than the buffer");
2577 if (copy_null && 0 < targetsize)
2578 target[0] = 0;
2579 return NULL;
2580 }
2581 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002582 if (kind == PyUnicode_1BYTE_KIND) {
2583 Py_UCS1 *start = (Py_UCS1 *) data;
2584 _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002585 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002586 else if (kind == PyUnicode_2BYTE_KIND) {
2587 Py_UCS2 *start = (Py_UCS2 *) data;
2588 _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2589 }
2590 else {
2591 assert(kind == PyUnicode_4BYTE_KIND);
Christian Heimesf051e432016-09-13 20:22:02 +02002592 memcpy(target, data, len * sizeof(Py_UCS4));
Antoine Pitrou950468e2011-10-11 22:45:48 +02002593 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002594 if (copy_null)
2595 target[len] = 0;
2596 return target;
2597}
2598
2599Py_UCS4*
2600PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2601 int copy_null)
2602{
Antoine Pitroude20b0b2011-11-10 21:47:38 +01002603 if (target == NULL || targetsize < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002604 PyErr_BadInternalCall();
2605 return NULL;
2606 }
2607 return as_ucs4(string, target, targetsize, copy_null);
2608}
2609
2610Py_UCS4*
2611PyUnicode_AsUCS4Copy(PyObject *string)
2612{
2613 return as_ucs4(string, NULL, 0, 1);
2614}
2615
/* Maximum number of characters required for output of %lld or %p.
   At most ceil(log10(256) * SIZEOF_LONG_LONG) digits are needed, plus 1
   for the sign. 53/22 is an upper bound for log10(256); the
   "1 + (x*53 - 1) / 22" part is the integer form of ceil(x*53/22). */
#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG * 53 - 1) / 22)
Victor Stinner96865452011-03-01 23:44:09 +00002620
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002621static int
2622unicode_fromformat_write_str(_PyUnicodeWriter *writer, PyObject *str,
2623 Py_ssize_t width, Py_ssize_t precision)
2624{
2625 Py_ssize_t length, fill, arglen;
2626 Py_UCS4 maxchar;
2627
2628 if (PyUnicode_READY(str) == -1)
2629 return -1;
2630
2631 length = PyUnicode_GET_LENGTH(str);
2632 if ((precision == -1 || precision >= length)
2633 && width <= length)
2634 return _PyUnicodeWriter_WriteStr(writer, str);
2635
2636 if (precision != -1)
2637 length = Py_MIN(precision, length);
2638
2639 arglen = Py_MAX(length, width);
2640 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
2641 maxchar = _PyUnicode_FindMaxChar(str, 0, length);
2642 else
2643 maxchar = writer->maxchar;
2644
2645 if (_PyUnicodeWriter_Prepare(writer, arglen, maxchar) == -1)
2646 return -1;
2647
2648 if (width > length) {
2649 fill = width - length;
2650 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, ' ') == -1)
2651 return -1;
2652 writer->pos += fill;
2653 }
2654
2655 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
2656 str, 0, length);
2657 writer->pos += length;
2658 return 0;
2659}
2660
2661static int
Victor Stinner998b8062018-09-12 00:23:25 +02002662unicode_fromformat_write_cstr(_PyUnicodeWriter *writer, const char *str,
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002663 Py_ssize_t width, Py_ssize_t precision)
2664{
2665 /* UTF-8 */
2666 Py_ssize_t length;
2667 PyObject *unicode;
2668 int res;
2669
Serhiy Storchakad586ccb2019-01-12 10:30:35 +02002670 if (precision == -1) {
2671 length = strlen(str);
2672 }
2673 else {
2674 length = 0;
2675 while (length < precision && str[length]) {
2676 length++;
2677 }
2678 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002679 unicode = PyUnicode_DecodeUTF8Stateful(str, length, "replace", NULL);
2680 if (unicode == NULL)
2681 return -1;
2682
2683 res = unicode_fromformat_write_str(writer, unicode, width, -1);
2684 Py_DECREF(unicode);
2685 return res;
2686}
2687
Victor Stinner96865452011-03-01 23:44:09 +00002688static const char*
Victor Stinnere215d962012-10-06 23:03:36 +02002689unicode_fromformat_arg(_PyUnicodeWriter *writer,
2690 const char *f, va_list *vargs)
Victor Stinner96865452011-03-01 23:44:09 +00002691{
Victor Stinnere215d962012-10-06 23:03:36 +02002692 const char *p;
2693 Py_ssize_t len;
2694 int zeropad;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002695 Py_ssize_t width;
2696 Py_ssize_t precision;
Victor Stinnere215d962012-10-06 23:03:36 +02002697 int longflag;
2698 int longlongflag;
2699 int size_tflag;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002700 Py_ssize_t fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002701
2702 p = f;
2703 f++;
Victor Stinner4c63a972012-10-06 23:55:33 +02002704 zeropad = 0;
2705 if (*f == '0') {
2706 zeropad = 1;
2707 f++;
2708 }
Victor Stinner96865452011-03-01 23:44:09 +00002709
2710 /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002711 width = -1;
2712 if (Py_ISDIGIT((unsigned)*f)) {
2713 width = *f - '0';
Victor Stinner96865452011-03-01 23:44:09 +00002714 f++;
Victor Stinnere215d962012-10-06 23:03:36 +02002715 while (Py_ISDIGIT((unsigned)*f)) {
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002716 if (width > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
Victor Stinner3921e902012-10-06 23:05:00 +02002717 PyErr_SetString(PyExc_ValueError,
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002718 "width too big");
Victor Stinner3921e902012-10-06 23:05:00 +02002719 return NULL;
2720 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002721 width = (width * 10) + (*f - '0');
Victor Stinnere215d962012-10-06 23:03:36 +02002722 f++;
2723 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002724 }
2725 precision = -1;
2726 if (*f == '.') {
2727 f++;
2728 if (Py_ISDIGIT((unsigned)*f)) {
2729 precision = (*f - '0');
2730 f++;
2731 while (Py_ISDIGIT((unsigned)*f)) {
2732 if (precision > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
2733 PyErr_SetString(PyExc_ValueError,
2734 "precision too big");
2735 return NULL;
2736 }
2737 precision = (precision * 10) + (*f - '0');
2738 f++;
2739 }
2740 }
Victor Stinner96865452011-03-01 23:44:09 +00002741 if (*f == '%') {
2742 /* "%.3%s" => f points to "3" */
2743 f--;
2744 }
2745 }
2746 if (*f == '\0') {
Victor Stinnere215d962012-10-06 23:03:36 +02002747 /* bogus format "%.123" => go backward, f points to "3" */
Victor Stinner96865452011-03-01 23:44:09 +00002748 f--;
2749 }
Victor Stinner96865452011-03-01 23:44:09 +00002750
2751 /* Handle %ld, %lu, %lld and %llu. */
2752 longflag = 0;
2753 longlongflag = 0;
Victor Stinnere7faec12011-03-02 00:01:53 +00002754 size_tflag = 0;
Victor Stinner96865452011-03-01 23:44:09 +00002755 if (*f == 'l') {
Victor Stinner6d970f42011-03-02 00:04:25 +00002756 if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
Victor Stinner96865452011-03-01 23:44:09 +00002757 longflag = 1;
2758 ++f;
2759 }
Victor Stinner96865452011-03-01 23:44:09 +00002760 else if (f[1] == 'l' &&
Victor Stinner6d970f42011-03-02 00:04:25 +00002761 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002762 longlongflag = 1;
2763 f += 2;
2764 }
Victor Stinner96865452011-03-01 23:44:09 +00002765 }
2766 /* handle the size_t flag. */
Victor Stinner6d970f42011-03-02 00:04:25 +00002767 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002768 size_tflag = 1;
2769 ++f;
2770 }
Victor Stinnere215d962012-10-06 23:03:36 +02002771
2772 if (f[1] == '\0')
2773 writer->overallocate = 0;
2774
2775 switch (*f) {
2776 case 'c':
2777 {
2778 int ordinal = va_arg(*vargs, int);
Victor Stinnerff5a8482012-10-06 23:05:45 +02002779 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Serhiy Storchakac89533f2013-06-23 20:21:16 +03002780 PyErr_SetString(PyExc_OverflowError,
Victor Stinnerff5a8482012-10-06 23:05:45 +02002781 "character argument not in range(0x110000)");
2782 return NULL;
2783 }
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02002784 if (_PyUnicodeWriter_WriteCharInline(writer, ordinal) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002785 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002786 break;
2787 }
2788
2789 case 'i':
2790 case 'd':
2791 case 'u':
2792 case 'x':
2793 {
2794 /* used by sprintf */
Victor Stinner15a11362012-10-06 23:48:20 +02002795 char buffer[MAX_LONG_LONG_CHARS];
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002796 Py_ssize_t arglen;
Victor Stinnere215d962012-10-06 23:03:36 +02002797
2798 if (*f == 'u') {
Victor Stinnere215d962012-10-06 23:03:36 +02002799 if (longflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002800 len = sprintf(buffer, "%lu",
Victor Stinnere215d962012-10-06 23:03:36 +02002801 va_arg(*vargs, unsigned long));
Victor Stinnere215d962012-10-06 23:03:36 +02002802 else if (longlongflag)
Benjamin Peterson47ff0732016-09-08 09:15:54 -07002803 len = sprintf(buffer, "%llu",
Benjamin Petersonaf580df2016-09-06 10:46:49 -07002804 va_arg(*vargs, unsigned long long));
Victor Stinnere215d962012-10-06 23:03:36 +02002805 else if (size_tflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002806 len = sprintf(buffer, "%" PY_FORMAT_SIZE_T "u",
Victor Stinnere215d962012-10-06 23:03:36 +02002807 va_arg(*vargs, size_t));
2808 else
Victor Stinner3aa979e2014-11-18 21:40:51 +01002809 len = sprintf(buffer, "%u",
Victor Stinnere215d962012-10-06 23:03:36 +02002810 va_arg(*vargs, unsigned int));
2811 }
2812 else if (*f == 'x') {
Victor Stinner3aa979e2014-11-18 21:40:51 +01002813 len = sprintf(buffer, "%x", va_arg(*vargs, int));
Victor Stinnere215d962012-10-06 23:03:36 +02002814 }
2815 else {
Victor Stinnere215d962012-10-06 23:03:36 +02002816 if (longflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002817 len = sprintf(buffer, "%li",
Victor Stinnere215d962012-10-06 23:03:36 +02002818 va_arg(*vargs, long));
Victor Stinnere215d962012-10-06 23:03:36 +02002819 else if (longlongflag)
Benjamin Peterson47ff0732016-09-08 09:15:54 -07002820 len = sprintf(buffer, "%lli",
Benjamin Petersonaf580df2016-09-06 10:46:49 -07002821 va_arg(*vargs, long long));
Victor Stinnere215d962012-10-06 23:03:36 +02002822 else if (size_tflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002823 len = sprintf(buffer, "%" PY_FORMAT_SIZE_T "i",
Victor Stinnere215d962012-10-06 23:03:36 +02002824 va_arg(*vargs, Py_ssize_t));
2825 else
Victor Stinner3aa979e2014-11-18 21:40:51 +01002826 len = sprintf(buffer, "%i",
Victor Stinnere215d962012-10-06 23:03:36 +02002827 va_arg(*vargs, int));
2828 }
2829 assert(len >= 0);
2830
Victor Stinnere215d962012-10-06 23:03:36 +02002831 if (precision < len)
2832 precision = len;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002833
2834 arglen = Py_MAX(precision, width);
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002835 if (_PyUnicodeWriter_Prepare(writer, arglen, 127) == -1)
2836 return NULL;
2837
Victor Stinnere215d962012-10-06 23:03:36 +02002838 if (width > precision) {
2839 Py_UCS4 fillchar;
2840 fill = width - precision;
2841 fillchar = zeropad?'0':' ';
Victor Stinner15a11362012-10-06 23:48:20 +02002842 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, fillchar) == -1)
2843 return NULL;
2844 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002845 }
Victor Stinner15a11362012-10-06 23:48:20 +02002846 if (precision > len) {
Victor Stinnere215d962012-10-06 23:03:36 +02002847 fill = precision - len;
Victor Stinner15a11362012-10-06 23:48:20 +02002848 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, '0') == -1)
2849 return NULL;
2850 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002851 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002852
Victor Stinner4a587072013-11-19 12:54:53 +01002853 if (_PyUnicodeWriter_WriteASCIIString(writer, buffer, len) < 0)
2854 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002855 break;
2856 }
2857
2858 case 'p':
2859 {
2860 char number[MAX_LONG_LONG_CHARS];
2861
2862 len = sprintf(number, "%p", va_arg(*vargs, void*));
2863 assert(len >= 0);
2864
2865 /* %p is ill-defined: ensure leading 0x. */
2866 if (number[1] == 'X')
2867 number[1] = 'x';
2868 else if (number[1] != 'x') {
2869 memmove(number + 2, number,
2870 strlen(number) + 1);
2871 number[0] = '0';
2872 number[1] = 'x';
2873 len += 2;
2874 }
2875
Victor Stinner4a587072013-11-19 12:54:53 +01002876 if (_PyUnicodeWriter_WriteASCIIString(writer, number, len) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002877 return NULL;
2878 break;
2879 }
2880
2881 case 's':
2882 {
2883 /* UTF-8 */
2884 const char *s = va_arg(*vargs, const char*);
Victor Stinner998b8062018-09-12 00:23:25 +02002885 if (unicode_fromformat_write_cstr(writer, s, width, precision) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002886 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002887 break;
2888 }
2889
2890 case 'U':
2891 {
2892 PyObject *obj = va_arg(*vargs, PyObject *);
2893 assert(obj && _PyUnicode_CHECK(obj));
2894
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002895 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002896 return NULL;
2897 break;
2898 }
2899
2900 case 'V':
2901 {
2902 PyObject *obj = va_arg(*vargs, PyObject *);
2903 const char *str = va_arg(*vargs, const char *);
Victor Stinnere215d962012-10-06 23:03:36 +02002904 if (obj) {
2905 assert(_PyUnicode_CHECK(obj));
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002906 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002907 return NULL;
2908 }
2909 else {
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002910 assert(str != NULL);
Victor Stinner998b8062018-09-12 00:23:25 +02002911 if (unicode_fromformat_write_cstr(writer, str, width, precision) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002912 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002913 }
2914 break;
2915 }
2916
2917 case 'S':
2918 {
2919 PyObject *obj = va_arg(*vargs, PyObject *);
2920 PyObject *str;
2921 assert(obj);
2922 str = PyObject_Str(obj);
2923 if (!str)
2924 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002925 if (unicode_fromformat_write_str(writer, str, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002926 Py_DECREF(str);
2927 return NULL;
2928 }
2929 Py_DECREF(str);
2930 break;
2931 }
2932
2933 case 'R':
2934 {
2935 PyObject *obj = va_arg(*vargs, PyObject *);
2936 PyObject *repr;
2937 assert(obj);
2938 repr = PyObject_Repr(obj);
2939 if (!repr)
2940 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002941 if (unicode_fromformat_write_str(writer, repr, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002942 Py_DECREF(repr);
2943 return NULL;
2944 }
2945 Py_DECREF(repr);
2946 break;
2947 }
2948
2949 case 'A':
2950 {
2951 PyObject *obj = va_arg(*vargs, PyObject *);
2952 PyObject *ascii;
2953 assert(obj);
2954 ascii = PyObject_ASCII(obj);
2955 if (!ascii)
2956 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002957 if (unicode_fromformat_write_str(writer, ascii, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002958 Py_DECREF(ascii);
2959 return NULL;
2960 }
2961 Py_DECREF(ascii);
2962 break;
2963 }
2964
2965 case '%':
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02002966 if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002967 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002968 break;
2969
2970 default:
2971 /* if we stumble upon an unknown formatting code, copy the rest
2972 of the format string to the output string. (we cannot just
2973 skip the code, since there's no way to know what's in the
2974 argument list) */
2975 len = strlen(p);
Victor Stinner4a587072013-11-19 12:54:53 +01002976 if (_PyUnicodeWriter_WriteLatin1String(writer, p, len) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002977 return NULL;
2978 f = p+len;
2979 return f;
2980 }
2981
2982 f++;
Victor Stinner96865452011-03-01 23:44:09 +00002983 return f;
2984}
2985
Walter Dörwaldd2034312007-05-18 16:29:38 +00002986PyObject *
2987PyUnicode_FromFormatV(const char *format, va_list vargs)
2988{
Victor Stinnere215d962012-10-06 23:03:36 +02002989 va_list vargs2;
2990 const char *f;
2991 _PyUnicodeWriter writer;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002992
Victor Stinner8f674cc2013-04-17 23:02:17 +02002993 _PyUnicodeWriter_Init(&writer);
2994 writer.min_length = strlen(format) + 100;
2995 writer.overallocate = 1;
Victor Stinnere215d962012-10-06 23:03:36 +02002996
Benjamin Peterson0c212142016-09-20 20:39:33 -07002997 // Copy varags to be able to pass a reference to a subfunction.
2998 va_copy(vargs2, vargs);
Victor Stinnere215d962012-10-06 23:03:36 +02002999
3000 for (f = format; *f; ) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00003001 if (*f == '%') {
Victor Stinnere215d962012-10-06 23:03:36 +02003002 f = unicode_fromformat_arg(&writer, f, &vargs2);
3003 if (f == NULL)
3004 goto fail;
Victor Stinner1205f272010-09-11 00:54:47 +00003005 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003006 else {
Victor Stinnere215d962012-10-06 23:03:36 +02003007 const char *p;
3008 Py_ssize_t len;
Walter Dörwaldd2034312007-05-18 16:29:38 +00003009
Victor Stinnere215d962012-10-06 23:03:36 +02003010 p = f;
3011 do
3012 {
3013 if ((unsigned char)*p > 127) {
3014 PyErr_Format(PyExc_ValueError,
3015 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
3016 "string, got a non-ASCII byte: 0x%02x",
3017 (unsigned char)*p);
Victor Stinner1ddf53d2016-09-21 14:13:14 +02003018 goto fail;
Victor Stinnere215d962012-10-06 23:03:36 +02003019 }
3020 p++;
3021 }
3022 while (*p != '\0' && *p != '%');
3023 len = p - f;
3024
3025 if (*p == '\0')
3026 writer.overallocate = 0;
Victor Stinner4a587072013-11-19 12:54:53 +01003027
3028 if (_PyUnicodeWriter_WriteASCIIString(&writer, f, len) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02003029 goto fail;
Victor Stinnere215d962012-10-06 23:03:36 +02003030
3031 f = p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00003032 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00003033 }
Christian Heimes2f2fee12016-09-21 11:37:27 +02003034 va_end(vargs2);
Victor Stinnere215d962012-10-06 23:03:36 +02003035 return _PyUnicodeWriter_Finish(&writer);
3036
3037 fail:
Christian Heimes2f2fee12016-09-21 11:37:27 +02003038 va_end(vargs2);
Victor Stinnere215d962012-10-06 23:03:36 +02003039 _PyUnicodeWriter_Dealloc(&writer);
Benjamin Peterson14339b62009-01-31 16:36:08 +00003040 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00003041}
3042
Walter Dörwaldd2034312007-05-18 16:29:38 +00003043PyObject *
3044PyUnicode_FromFormat(const char *format, ...)
3045{
Benjamin Peterson14339b62009-01-31 16:36:08 +00003046 PyObject* ret;
3047 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00003048
3049#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00003050 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00003051#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00003052 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00003053#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00003054 ret = PyUnicode_FromFormatV(format, vargs);
3055 va_end(vargs);
3056 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00003057}
3058
Serhiy Storchakac46db922018-10-23 22:58:24 +03003059static Py_ssize_t
3060unicode_get_widechar_size(PyObject *unicode)
3061{
3062 Py_ssize_t res;
3063
3064 assert(unicode != NULL);
3065 assert(_PyUnicode_CHECK(unicode));
3066
3067 if (_PyUnicode_WSTR(unicode) != NULL) {
3068 return PyUnicode_WSTR_LENGTH(unicode);
3069 }
3070 assert(PyUnicode_IS_READY(unicode));
3071
3072 res = _PyUnicode_LENGTH(unicode);
3073#if SIZEOF_WCHAR_T == 2
3074 if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
3075 const Py_UCS4 *s = PyUnicode_4BYTE_DATA(unicode);
3076 const Py_UCS4 *end = s + res;
3077 for (; s < end; ++s) {
3078 if (*s > 0xFFFF) {
3079 ++res;
3080 }
3081 }
3082 }
3083#endif
3084 return res;
3085}
3086
3087static void
3088unicode_copy_as_widechar(PyObject *unicode, wchar_t *w, Py_ssize_t size)
3089{
3090 const wchar_t *wstr;
3091
3092 assert(unicode != NULL);
3093 assert(_PyUnicode_CHECK(unicode));
3094
3095 wstr = _PyUnicode_WSTR(unicode);
3096 if (wstr != NULL) {
3097 memcpy(w, wstr, size * sizeof(wchar_t));
3098 return;
3099 }
3100 assert(PyUnicode_IS_READY(unicode));
3101
3102 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
3103 const Py_UCS1 *s = PyUnicode_1BYTE_DATA(unicode);
3104 for (; size--; ++s, ++w) {
3105 *w = *s;
3106 }
3107 }
3108 else {
3109#if SIZEOF_WCHAR_T == 4
3110 assert(PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND);
3111 const Py_UCS2 *s = PyUnicode_2BYTE_DATA(unicode);
3112 for (; size--; ++s, ++w) {
3113 *w = *s;
3114 }
3115#else
3116 assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
3117 const Py_UCS4 *s = PyUnicode_4BYTE_DATA(unicode);
3118 for (; size--; ++s, ++w) {
3119 Py_UCS4 ch = *s;
3120 if (ch > 0xFFFF) {
3121 assert(ch <= MAX_UNICODE);
3122 /* encode surrogate pair in this case */
3123 *w++ = Py_UNICODE_HIGH_SURROGATE(ch);
3124 if (!size--)
3125 break;
3126 *w = Py_UNICODE_LOW_SURROGATE(ch);
3127 }
3128 else {
3129 *w = ch;
3130 }
3131 }
3132#endif
3133 }
3134}
3135
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003136#ifdef HAVE_WCHAR_H
3137
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003138/* Convert a Unicode object to a wide character string.
Victor Stinner5593d8a2010-10-02 11:11:27 +00003139
Victor Stinnerd88d9832011-09-06 02:00:05 +02003140 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00003141 character) required to convert the unicode object. Ignore size argument.
3142
Victor Stinnerd88d9832011-09-06 02:00:05 +02003143 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00003144 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02003145 the null character). */
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003146Py_ssize_t
3147PyUnicode_AsWideChar(PyObject *unicode,
3148 wchar_t *w,
3149 Py_ssize_t size)
Victor Stinner137c34c2010-09-29 10:25:54 +00003150{
Victor Stinner5593d8a2010-10-02 11:11:27 +00003151 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003152
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003153 if (unicode == NULL) {
3154 PyErr_BadInternalCall();
3155 return -1;
3156 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003157 if (!PyUnicode_Check(unicode)) {
3158 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003159 return -1;
Victor Stinner5593d8a2010-10-02 11:11:27 +00003160 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003161
3162 res = unicode_get_widechar_size(unicode);
3163 if (w == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003164 return res + 1;
Serhiy Storchakac46db922018-10-23 22:58:24 +03003165 }
3166
3167 if (size > res) {
3168 size = res + 1;
3169 }
3170 else {
3171 res = size;
3172 }
3173 unicode_copy_as_widechar(unicode, w, size);
3174 return res;
Victor Stinner137c34c2010-09-29 10:25:54 +00003175}
3176
Victor Stinner137c34c2010-09-29 10:25:54 +00003177wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00003178PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00003179 Py_ssize_t *size)
3180{
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003181 wchar_t *buffer;
Victor Stinner137c34c2010-09-29 10:25:54 +00003182 Py_ssize_t buflen;
3183
3184 if (unicode == NULL) {
3185 PyErr_BadInternalCall();
3186 return NULL;
3187 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003188 if (!PyUnicode_Check(unicode)) {
3189 PyErr_BadArgument();
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003190 return NULL;
3191 }
3192
Serhiy Storchakac46db922018-10-23 22:58:24 +03003193 buflen = unicode_get_widechar_size(unicode);
3194 buffer = (wchar_t *) PyMem_NEW(wchar_t, (buflen + 1));
Victor Stinner137c34c2010-09-29 10:25:54 +00003195 if (buffer == NULL) {
3196 PyErr_NoMemory();
3197 return NULL;
3198 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003199 unicode_copy_as_widechar(unicode, buffer, buflen + 1);
3200 if (size != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00003201 *size = buflen;
Serhiy Storchakac46db922018-10-23 22:58:24 +03003202 }
3203 else if (wcslen(buffer) != (size_t)buflen) {
3204 PyMem_FREE(buffer);
3205 PyErr_SetString(PyExc_ValueError,
3206 "embedded null character");
3207 return NULL;
3208 }
Victor Stinner137c34c2010-09-29 10:25:54 +00003209 return buffer;
3210}
3211
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003212#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003213
Alexander Belopolsky40018472011-02-26 01:02:56 +00003214PyObject *
3215PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003216{
Victor Stinner8faf8212011-12-08 22:14:11 +01003217 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003218 PyErr_SetString(PyExc_ValueError,
3219 "chr() arg not in range(0x110000)");
3220 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003221 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00003222
Victor Stinner985a82a2014-01-03 12:53:47 +01003223 return unicode_char((Py_UCS4)ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003224}
3225
Alexander Belopolsky40018472011-02-26 01:02:56 +00003226PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02003227PyUnicode_FromObject(PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003228{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003229 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00003230 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003231 if (PyUnicode_CheckExact(obj)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05003232 if (PyUnicode_READY(obj) == -1)
Victor Stinnerd3a83d52011-10-01 03:09:33 +02003233 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00003234 Py_INCREF(obj);
3235 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003236 }
3237 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003238 /* For a Unicode subtype that's not a Unicode object,
3239 return a true Unicode object with the same data. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +01003240 return _PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003241 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003242 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003243 "Can't convert '%.100s' object to str implicitly",
3244 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00003245 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003246}
3247
Alexander Belopolsky40018472011-02-26 01:02:56 +00003248PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02003249PyUnicode_FromEncodedObject(PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003250 const char *encoding,
3251 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003252{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003253 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003254 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00003255
Guido van Rossumd57fd912000-03-10 22:53:23 +00003256 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003257 PyErr_BadInternalCall();
3258 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003259 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003260
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003261 /* Decoding bytes objects is the most common case and should be fast */
3262 if (PyBytes_Check(obj)) {
Victor Stinner22eb6892019-06-26 00:51:05 +02003263 if (PyBytes_GET_SIZE(obj) == 0) {
3264 if (unicode_check_encoding_errors(encoding, errors) < 0) {
3265 return NULL;
3266 }
Serhiy Storchaka05997252013-01-26 12:14:02 +02003267 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner22eb6892019-06-26 00:51:05 +02003268 }
3269 return PyUnicode_Decode(
Serhiy Storchaka05997252013-01-26 12:14:02 +02003270 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
3271 encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003272 }
3273
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003274 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003275 PyErr_SetString(PyExc_TypeError,
3276 "decoding str is not supported");
3277 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00003278 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003279
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003280 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
3281 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
3282 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003283 "decoding to str: need a bytes-like object, %.80s found",
3284 Py_TYPE(obj)->tp_name);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003285 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00003286 }
Tim Petersced69f82003-09-16 20:30:58 +00003287
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003288 if (buffer.len == 0) {
Serhiy Storchaka05997252013-01-26 12:14:02 +02003289 PyBuffer_Release(&buffer);
Victor Stinner22eb6892019-06-26 00:51:05 +02003290 if (unicode_check_encoding_errors(encoding, errors) < 0) {
3291 return NULL;
3292 }
Serhiy Storchaka05997252013-01-26 12:14:02 +02003293 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +00003294 }
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00003295
Serhiy Storchaka05997252013-01-26 12:14:02 +02003296 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003297 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003298 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003299}
3300
/* Normalize an encoding name into `lower` (a buffer of `lower_len` bytes).

   Similar to encodings.normalize_encoding(), but the result is also
   lowercased: alphanumeric characters and '.' are kept (lowercased via
   Py_TOLOWER), every run of other characters between kept characters
   becomes a single '_', and leading/trailing runs are dropped.

   Return 1 on success, or 0 if the normalized name does not fit
   (it would be longer than lower_len-1 characters). */
int
_Py_normalize_encoding(const char *encoding,
                       char *lower,
                       size_t lower_len)
{
    const char *src;
    char *dst = lower;
    /* Last usable slot: it is reserved for the terminating NUL. */
    char *dst_limit = &lower[lower_len - 1];
    int pending_sep = 0;

    assert(encoding != NULL);

    for (src = encoding; *src != 0; src++) {
        char c = *src;

        if (!(Py_ISALNUM(c) || c == '.')) {
            /* Punctuation: remember it, emit lazily (and only once). */
            pending_sep = 1;
            continue;
        }

        /* A separator is only written between two kept characters. */
        if (pending_sep && dst != lower) {
            if (dst == dst_limit) {
                return 0;
            }
            *dst++ = '_';
        }
        pending_sep = 0;

        if (dst == dst_limit) {
            return 0;
        }
        *dst++ = Py_TOLOWER(c);
    }

    *dst = '\0';
    return 1;
}
3349
Alexander Belopolsky40018472011-02-26 01:02:56 +00003350PyObject *
3351PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003352 Py_ssize_t size,
3353 const char *encoding,
3354 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00003355{
3356 PyObject *buffer = NULL, *unicode;
3357 Py_buffer info;
Victor Stinner942889a2016-09-05 15:40:10 -07003358 char buflower[11]; /* strlen("iso-8859-1\0") == 11, longest shortcut */
3359
Victor Stinner22eb6892019-06-26 00:51:05 +02003360 if (unicode_check_encoding_errors(encoding, errors) < 0) {
3361 return NULL;
3362 }
3363
Victor Stinnered076ed2019-06-26 01:49:32 +02003364 if (size == 0) {
3365 _Py_RETURN_UNICODE_EMPTY();
3366 }
3367
Victor Stinner942889a2016-09-05 15:40:10 -07003368 if (encoding == NULL) {
3369 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3370 }
Victor Stinner600d3be2010-06-10 12:00:55 +00003371
Fred Drakee4315f52000-05-09 19:53:39 +00003372 /* Shortcuts for common default encodings */
Victor Stinner942889a2016-09-05 15:40:10 -07003373 if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3374 char *lower = buflower;
3375
3376 /* Fast paths */
3377 if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3378 lower += 3;
3379 if (*lower == '_') {
3380 /* Match "utf8" and "utf_8" */
3381 lower++;
3382 }
3383
3384 if (lower[0] == '8' && lower[1] == 0) {
3385 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3386 }
3387 else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3388 return PyUnicode_DecodeUTF16(s, size, errors, 0);
3389 }
3390 else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3391 return PyUnicode_DecodeUTF32(s, size, errors, 0);
3392 }
3393 }
3394 else {
3395 if (strcmp(lower, "ascii") == 0
3396 || strcmp(lower, "us_ascii") == 0) {
3397 return PyUnicode_DecodeASCII(s, size, errors);
3398 }
Steve Dowercc16be82016-09-08 10:35:16 -07003399 #ifdef MS_WINDOWS
Victor Stinner942889a2016-09-05 15:40:10 -07003400 else if (strcmp(lower, "mbcs") == 0) {
3401 return PyUnicode_DecodeMBCS(s, size, errors);
3402 }
3403 #endif
3404 else if (strcmp(lower, "latin1") == 0
3405 || strcmp(lower, "latin_1") == 0
3406 || strcmp(lower, "iso_8859_1") == 0
3407 || strcmp(lower, "iso8859_1") == 0) {
3408 return PyUnicode_DecodeLatin1(s, size, errors);
3409 }
3410 }
Victor Stinner37296e82010-06-10 13:36:23 +00003411 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003412
3413 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003414 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00003415 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003416 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00003417 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003418 if (buffer == NULL)
3419 goto onError;
Nick Coghlanc72e4e62013-11-22 22:39:36 +10003420 unicode = _PyCodec_DecodeText(buffer, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003421 if (unicode == NULL)
3422 goto onError;
3423 if (!PyUnicode_Check(unicode)) {
3424 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003425 "'%.400s' decoder returned '%.400s' instead of 'str'; "
Nick Coghlan8b097b42013-11-13 23:49:21 +10003426 "use codecs.decode() to decode to arbitrary types",
Victor Stinner998b8062018-09-12 00:23:25 +02003427 encoding,
3428 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003429 Py_DECREF(unicode);
3430 goto onError;
3431 }
3432 Py_DECREF(buffer);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003433 return unicode_result(unicode);
Tim Petersced69f82003-09-16 20:30:58 +00003434
Benjamin Peterson29060642009-01-31 22:14:21 +00003435 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003436 Py_XDECREF(buffer);
3437 return NULL;
3438}
3439
Alexander Belopolsky40018472011-02-26 01:02:56 +00003440PyObject *
3441PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003442 const char *encoding,
3443 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003444{
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003445 if (!PyUnicode_Check(unicode)) {
3446 PyErr_BadArgument();
Serhiy Storchaka77eede32016-10-25 10:07:51 +03003447 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003448 }
3449
Serhiy Storchaka00939072016-10-27 21:05:49 +03003450 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3451 "PyUnicode_AsDecodedObject() is deprecated; "
3452 "use PyCodec_Decode() to decode from str", 1) < 0)
3453 return NULL;
3454
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003455 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003456 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003457
3458 /* Decode via the codec registry */
Serhiy Storchaka77eede32016-10-25 10:07:51 +03003459 return PyCodec_Decode(unicode, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003460}
3461
Alexander Belopolsky40018472011-02-26 01:02:56 +00003462PyObject *
3463PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003464 const char *encoding,
3465 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003466{
3467 PyObject *v;
3468
3469 if (!PyUnicode_Check(unicode)) {
3470 PyErr_BadArgument();
3471 goto onError;
3472 }
3473
Serhiy Storchaka00939072016-10-27 21:05:49 +03003474 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3475 "PyUnicode_AsDecodedUnicode() is deprecated; "
3476 "use PyCodec_Decode() to decode from str to str", 1) < 0)
3477 return NULL;
3478
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003479 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003480 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003481
3482 /* Decode via the codec registry */
3483 v = PyCodec_Decode(unicode, encoding, errors);
3484 if (v == NULL)
3485 goto onError;
3486 if (!PyUnicode_Check(v)) {
3487 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003488 "'%.400s' decoder returned '%.400s' instead of 'str'; "
Nick Coghlan8b097b42013-11-13 23:49:21 +10003489 "use codecs.decode() to decode to arbitrary types",
Victor Stinner998b8062018-09-12 00:23:25 +02003490 encoding,
3491 Py_TYPE(unicode)->tp_name);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003492 Py_DECREF(v);
3493 goto onError;
3494 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003495 return unicode_result(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003496
Benjamin Peterson29060642009-01-31 22:14:21 +00003497 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003498 return NULL;
3499}
3500
Alexander Belopolsky40018472011-02-26 01:02:56 +00003501PyObject *
3502PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003503 Py_ssize_t size,
3504 const char *encoding,
3505 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003506{
3507 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00003508
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02003509 unicode = PyUnicode_FromWideChar(s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003510 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003511 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003512 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
3513 Py_DECREF(unicode);
3514 return v;
3515}
3516
Alexander Belopolsky40018472011-02-26 01:02:56 +00003517PyObject *
3518PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003519 const char *encoding,
3520 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003521{
3522 PyObject *v;
3523
3524 if (!PyUnicode_Check(unicode)) {
3525 PyErr_BadArgument();
3526 goto onError;
3527 }
3528
Serhiy Storchaka00939072016-10-27 21:05:49 +03003529 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3530 "PyUnicode_AsEncodedObject() is deprecated; "
3531 "use PyUnicode_AsEncodedString() to encode from str to bytes "
3532 "or PyCodec_Encode() for generic encoding", 1) < 0)
3533 return NULL;
3534
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003535 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003536 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003537
3538 /* Encode via the codec registry */
3539 v = PyCodec_Encode(unicode, encoding, errors);
3540 if (v == NULL)
3541 goto onError;
3542 return v;
3543
Benjamin Peterson29060642009-01-31 22:14:21 +00003544 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003545 return NULL;
3546}
3547
Victor Stinner1b579672011-12-17 05:47:23 +01003548
Victor Stinner2cba6b82018-01-10 22:46:15 +01003549static PyObject *
Victor Stinner709d23d2019-05-02 14:56:30 -04003550unicode_encode_locale(PyObject *unicode, _Py_error_handler error_handler,
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003551 int current_locale)
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003552{
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003553 Py_ssize_t wlen;
3554 wchar_t *wstr = PyUnicode_AsWideCharString(unicode, &wlen);
3555 if (wstr == NULL) {
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003556 return NULL;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003557 }
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003558
Victor Stinnerbde9d6b2018-11-28 10:26:20 +01003559 if ((size_t)wlen != wcslen(wstr)) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003560 PyErr_SetString(PyExc_ValueError, "embedded null character");
Victor Stinnerbde9d6b2018-11-28 10:26:20 +01003561 PyMem_Free(wstr);
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003562 return NULL;
3563 }
3564
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003565 char *str;
3566 size_t error_pos;
3567 const char *reason;
3568 int res = _Py_EncodeLocaleEx(wstr, &str, &error_pos, &reason,
Victor Stinner3d4226a2018-08-29 22:21:32 +02003569 current_locale, error_handler);
Victor Stinnerbde9d6b2018-11-28 10:26:20 +01003570 PyMem_Free(wstr);
3571
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003572 if (res != 0) {
3573 if (res == -2) {
3574 PyObject *exc;
3575 exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnns",
3576 "locale", unicode,
3577 (Py_ssize_t)error_pos,
3578 (Py_ssize_t)(error_pos+1),
3579 reason);
3580 if (exc != NULL) {
3581 PyCodec_StrictErrors(exc);
3582 Py_DECREF(exc);
3583 }
Victor Stinner2cba6b82018-01-10 22:46:15 +01003584 }
Victor Stinner3d4226a2018-08-29 22:21:32 +02003585 else if (res == -3) {
3586 PyErr_SetString(PyExc_ValueError, "unsupported error handler");
3587 }
Victor Stinner2cba6b82018-01-10 22:46:15 +01003588 else {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003589 PyErr_NoMemory();
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003590 }
Victor Stinnerbde9d6b2018-11-28 10:26:20 +01003591 return NULL;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003592 }
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003593
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003594 PyObject *bytes = PyBytes_FromString(str);
3595 PyMem_RawFree(str);
3596 return bytes;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003597}
3598
Victor Stinnerad158722010-10-27 00:25:46 +00003599PyObject *
Victor Stinner2cba6b82018-01-10 22:46:15 +01003600PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
3601{
Victor Stinner709d23d2019-05-02 14:56:30 -04003602 _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
3603 return unicode_encode_locale(unicode, error_handler, 1);
Victor Stinner2cba6b82018-01-10 22:46:15 +01003604}
3605
3606PyObject *
Victor Stinnerad158722010-10-27 00:25:46 +00003607PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00003608{
Victor Stinnercaba55b2018-08-03 15:33:52 +02003609 PyInterpreterState *interp = _PyInterpreterState_GET_UNSAFE();
Victor Stinnere2510952019-05-02 11:28:57 -04003610#ifdef _Py_FORCE_UTF8_FS_ENCODING
Victor Stinner709d23d2019-05-02 14:56:30 -04003611 if (interp->fs_codec.encoding) {
3612 return unicode_encode_utf8(unicode,
3613 interp->fs_codec.error_handler,
3614 interp->fs_codec.errors);
3615 }
3616 else {
Victor Stinner331a6a52019-05-27 16:39:22 +02003617 const wchar_t *filesystem_errors = interp->config.filesystem_errors;
Victor Stinner709d23d2019-05-02 14:56:30 -04003618 _Py_error_handler errors;
Victor Stinner331a6a52019-05-27 16:39:22 +02003619 errors = get_error_handler_wide(filesystem_errors);
Victor Stinner709d23d2019-05-02 14:56:30 -04003620 assert(errors != _Py_ERROR_UNKNOWN);
3621 return unicode_encode_utf8(unicode, errors, NULL);
3622 }
Victor Stinnerb2457ef2018-08-29 13:25:36 +02003623#else
Victor Stinner793b5312011-04-27 00:24:21 +02003624 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3625 cannot use it to encode and decode filenames before it is loaded. Load
3626 the Python codec requires to encode at least its own filename. Use the C
Victor Stinnerb2457ef2018-08-29 13:25:36 +02003627 implementation of the locale codec until the codec registry is
Victor Stinner22eb6892019-06-26 00:51:05 +02003628 initialized and the Python codec is loaded.
3629 See _PyUnicode_InitEncodings(). */
Victor Stinner709d23d2019-05-02 14:56:30 -04003630 if (interp->fs_codec.encoding) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00003631 return PyUnicode_AsEncodedString(unicode,
Victor Stinner709d23d2019-05-02 14:56:30 -04003632 interp->fs_codec.encoding,
3633 interp->fs_codec.errors);
Victor Stinnerc39211f2010-09-29 16:35:47 +00003634 }
3635 else {
Victor Stinner331a6a52019-05-27 16:39:22 +02003636 const wchar_t *filesystem_errors = interp->config.filesystem_errors;
Victor Stinner709d23d2019-05-02 14:56:30 -04003637 _Py_error_handler errors;
Victor Stinner331a6a52019-05-27 16:39:22 +02003638 errors = get_error_handler_wide(filesystem_errors);
Victor Stinner709d23d2019-05-02 14:56:30 -04003639 assert(errors != _Py_ERROR_UNKNOWN);
3640 return unicode_encode_locale(unicode, errors, 0);
Victor Stinnerc39211f2010-09-29 16:35:47 +00003641 }
Victor Stinnerad158722010-10-27 00:25:46 +00003642#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00003643}
3644
Alexander Belopolsky40018472011-02-26 01:02:56 +00003645PyObject *
3646PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003647 const char *encoding,
3648 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003649{
3650 PyObject *v;
Victor Stinner942889a2016-09-05 15:40:10 -07003651 char buflower[11]; /* strlen("iso_8859_1\0") == 11, longest shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00003652
Guido van Rossumd57fd912000-03-10 22:53:23 +00003653 if (!PyUnicode_Check(unicode)) {
3654 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003655 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003656 }
Fred Drakee4315f52000-05-09 19:53:39 +00003657
Victor Stinner22eb6892019-06-26 00:51:05 +02003658 if (unicode_check_encoding_errors(encoding, errors) < 0) {
3659 return NULL;
3660 }
3661
Victor Stinner942889a2016-09-05 15:40:10 -07003662 if (encoding == NULL) {
3663 return _PyUnicode_AsUTF8String(unicode, errors);
3664 }
3665
Fred Drakee4315f52000-05-09 19:53:39 +00003666 /* Shortcuts for common default encodings */
Victor Stinner942889a2016-09-05 15:40:10 -07003667 if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3668 char *lower = buflower;
3669
3670 /* Fast paths */
3671 if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3672 lower += 3;
3673 if (*lower == '_') {
3674 /* Match "utf8" and "utf_8" */
3675 lower++;
3676 }
3677
3678 if (lower[0] == '8' && lower[1] == 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003679 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinner942889a2016-09-05 15:40:10 -07003680 }
3681 else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3682 return _PyUnicode_EncodeUTF16(unicode, errors, 0);
3683 }
3684 else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3685 return _PyUnicode_EncodeUTF32(unicode, errors, 0);
3686 }
Victor Stinnera5c68c32011-03-02 01:03:14 +00003687 }
Victor Stinner942889a2016-09-05 15:40:10 -07003688 else {
3689 if (strcmp(lower, "ascii") == 0
3690 || strcmp(lower, "us_ascii") == 0) {
3691 return _PyUnicode_AsASCIIString(unicode, errors);
3692 }
Steve Dowercc16be82016-09-08 10:35:16 -07003693#ifdef MS_WINDOWS
Victor Stinner942889a2016-09-05 15:40:10 -07003694 else if (strcmp(lower, "mbcs") == 0) {
3695 return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
3696 }
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003697#endif
Victor Stinner942889a2016-09-05 15:40:10 -07003698 else if (strcmp(lower, "latin1") == 0 ||
3699 strcmp(lower, "latin_1") == 0 ||
3700 strcmp(lower, "iso_8859_1") == 0 ||
3701 strcmp(lower, "iso8859_1") == 0) {
3702 return _PyUnicode_AsLatin1String(unicode, errors);
3703 }
3704 }
Victor Stinner37296e82010-06-10 13:36:23 +00003705 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003706
3707 /* Encode via the codec registry */
Nick Coghlanc72e4e62013-11-22 22:39:36 +10003708 v = _PyCodec_EncodeText(unicode, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003709 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003710 return NULL;
3711
3712 /* The normal path */
3713 if (PyBytes_Check(v))
3714 return v;
3715
3716 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003717 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003718 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003719 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003720
3721 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003722 "encoder %s returned bytearray instead of bytes; "
3723 "use codecs.encode() to encode to arbitrary types",
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003724 encoding);
3725 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003726 Py_DECREF(v);
3727 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003728 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003729
Serhiy Storchakafff9a312017-03-21 08:53:25 +02003730 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v),
3731 PyByteArray_GET_SIZE(v));
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003732 Py_DECREF(v);
3733 return b;
3734 }
3735
3736 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003737 "'%.400s' encoder returned '%.400s' instead of 'bytes'; "
Nick Coghlan8b097b42013-11-13 23:49:21 +10003738 "use codecs.encode() to encode to arbitrary types",
Victor Stinner998b8062018-09-12 00:23:25 +02003739 encoding,
3740 Py_TYPE(v)->tp_name);
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003741 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003742 return NULL;
3743}
3744
Alexander Belopolsky40018472011-02-26 01:02:56 +00003745PyObject *
3746PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003747 const char *encoding,
3748 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003749{
3750 PyObject *v;
3751
3752 if (!PyUnicode_Check(unicode)) {
3753 PyErr_BadArgument();
3754 goto onError;
3755 }
3756
Serhiy Storchaka00939072016-10-27 21:05:49 +03003757 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3758 "PyUnicode_AsEncodedUnicode() is deprecated; "
3759 "use PyCodec_Encode() to encode from str to str", 1) < 0)
3760 return NULL;
3761
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003762 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003763 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003764
3765 /* Encode via the codec registry */
3766 v = PyCodec_Encode(unicode, encoding, errors);
3767 if (v == NULL)
3768 goto onError;
3769 if (!PyUnicode_Check(v)) {
3770 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003771 "'%.400s' encoder returned '%.400s' instead of 'str'; "
Nick Coghlan8b097b42013-11-13 23:49:21 +10003772 "use codecs.encode() to encode to arbitrary types",
Victor Stinner998b8062018-09-12 00:23:25 +02003773 encoding,
3774 Py_TYPE(v)->tp_name);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003775 Py_DECREF(v);
3776 goto onError;
3777 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003778 return v;
Tim Petersced69f82003-09-16 20:30:58 +00003779
Benjamin Peterson29060642009-01-31 22:14:21 +00003780 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003781 return NULL;
3782}
3783
Victor Stinner2cba6b82018-01-10 22:46:15 +01003784static PyObject*
Victor Stinner709d23d2019-05-02 14:56:30 -04003785unicode_decode_locale(const char *str, Py_ssize_t len,
3786 _Py_error_handler errors, int current_locale)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003787{
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003788 if (str[len] != '\0' || (size_t)len != strlen(str)) {
3789 PyErr_SetString(PyExc_ValueError, "embedded null byte");
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003790 return NULL;
3791 }
3792
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003793 wchar_t *wstr;
3794 size_t wlen;
3795 const char *reason;
3796 int res = _Py_DecodeLocaleEx(str, &wstr, &wlen, &reason,
Victor Stinner709d23d2019-05-02 14:56:30 -04003797 current_locale, errors);
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003798 if (res != 0) {
3799 if (res == -2) {
3800 PyObject *exc;
3801 exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nns",
3802 "locale", str, len,
3803 (Py_ssize_t)wlen,
3804 (Py_ssize_t)(wlen + 1),
3805 reason);
3806 if (exc != NULL) {
3807 PyCodec_StrictErrors(exc);
3808 Py_DECREF(exc);
3809 }
Victor Stinner2cba6b82018-01-10 22:46:15 +01003810 }
Victor Stinner3d4226a2018-08-29 22:21:32 +02003811 else if (res == -3) {
3812 PyErr_SetString(PyExc_ValueError, "unsupported error handler");
3813 }
Victor Stinner2cba6b82018-01-10 22:46:15 +01003814 else {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003815 PyErr_NoMemory();
Victor Stinner2cba6b82018-01-10 22:46:15 +01003816 }
Victor Stinner2f197072011-12-17 07:08:30 +01003817 return NULL;
Victor Stinner2f197072011-12-17 07:08:30 +01003818 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003819
3820 PyObject *unicode = PyUnicode_FromWideChar(wstr, wlen);
3821 PyMem_RawFree(wstr);
3822 return unicode;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003823}
3824
3825PyObject*
Victor Stinner2cba6b82018-01-10 22:46:15 +01003826PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
3827 const char *errors)
3828{
Victor Stinner709d23d2019-05-02 14:56:30 -04003829 _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
3830 return unicode_decode_locale(str, len, error_handler, 1);
Victor Stinner2cba6b82018-01-10 22:46:15 +01003831}
3832
3833PyObject*
Victor Stinner1b579672011-12-17 05:47:23 +01003834PyUnicode_DecodeLocale(const char *str, const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003835{
3836 Py_ssize_t size = (Py_ssize_t)strlen(str);
Victor Stinner709d23d2019-05-02 14:56:30 -04003837 _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
3838 return unicode_decode_locale(str, size, error_handler, 1);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003839}
3840
3841
3842PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00003843PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003844 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00003845 return PyUnicode_DecodeFSDefaultAndSize(s, size);
3846}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003847
Christian Heimes5894ba72007-11-04 11:43:14 +00003848PyObject*
3849PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3850{
Victor Stinnercaba55b2018-08-03 15:33:52 +02003851 PyInterpreterState *interp = _PyInterpreterState_GET_UNSAFE();
Victor Stinnere2510952019-05-02 11:28:57 -04003852#ifdef _Py_FORCE_UTF8_FS_ENCODING
Victor Stinner709d23d2019-05-02 14:56:30 -04003853 if (interp->fs_codec.encoding) {
3854 return unicode_decode_utf8(s, size,
3855 interp->fs_codec.error_handler,
3856 interp->fs_codec.errors,
3857 NULL);
3858 }
3859 else {
Victor Stinner331a6a52019-05-27 16:39:22 +02003860 const wchar_t *filesystem_errors = interp->config.filesystem_errors;
Victor Stinner709d23d2019-05-02 14:56:30 -04003861 _Py_error_handler errors;
Victor Stinner331a6a52019-05-27 16:39:22 +02003862 errors = get_error_handler_wide(filesystem_errors);
Victor Stinner709d23d2019-05-02 14:56:30 -04003863 assert(errors != _Py_ERROR_UNKNOWN);
3864 return unicode_decode_utf8(s, size, errors, NULL, NULL);
3865 }
Victor Stinnerb2457ef2018-08-29 13:25:36 +02003866#else
Victor Stinner793b5312011-04-27 00:24:21 +02003867 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3868 cannot use it to encode and decode filenames before it is loaded. Load
3869 the Python codec requires to encode at least its own filename. Use the C
Victor Stinnerb2457ef2018-08-29 13:25:36 +02003870 implementation of the locale codec until the codec registry is
Victor Stinner22eb6892019-06-26 00:51:05 +02003871 initialized and the Python codec is loaded.
3872 See _PyUnicode_InitEncodings(). */
Victor Stinner709d23d2019-05-02 14:56:30 -04003873 if (interp->fs_codec.encoding) {
Steve Dower78057b42016-11-06 19:35:08 -08003874 return PyUnicode_Decode(s, size,
Victor Stinner709d23d2019-05-02 14:56:30 -04003875 interp->fs_codec.encoding,
3876 interp->fs_codec.errors);
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003877 }
3878 else {
Victor Stinner331a6a52019-05-27 16:39:22 +02003879 const wchar_t *filesystem_errors = interp->config.filesystem_errors;
Victor Stinner709d23d2019-05-02 14:56:30 -04003880 _Py_error_handler errors;
Victor Stinner331a6a52019-05-27 16:39:22 +02003881 errors = get_error_handler_wide(filesystem_errors);
Victor Stinner709d23d2019-05-02 14:56:30 -04003882 return unicode_decode_locale(s, size, errors, 0);
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003883 }
Victor Stinnerad158722010-10-27 00:25:46 +00003884#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003885}
3886
Martin v. Löwis011e8422009-05-05 04:43:17 +00003887
3888int
3889PyUnicode_FSConverter(PyObject* arg, void* addr)
3890{
Brett Cannonec6ce872016-09-06 15:50:29 -07003891 PyObject *path = NULL;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003892 PyObject *output = NULL;
3893 Py_ssize_t size;
3894 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003895 if (arg == NULL) {
3896 Py_DECREF(*(PyObject**)addr);
Benjamin Petersona4d33b32015-11-15 21:57:39 -08003897 *(PyObject**)addr = NULL;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003898 return 1;
3899 }
Brett Cannonec6ce872016-09-06 15:50:29 -07003900 path = PyOS_FSPath(arg);
3901 if (path == NULL) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03003902 return 0;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003903 }
Brett Cannonec6ce872016-09-06 15:50:29 -07003904 if (PyBytes_Check(path)) {
3905 output = path;
3906 }
3907 else { // PyOS_FSPath() guarantees its returned value is bytes or str.
3908 output = PyUnicode_EncodeFSDefault(path);
3909 Py_DECREF(path);
3910 if (!output) {
3911 return 0;
3912 }
3913 assert(PyBytes_Check(output));
3914 }
3915
Victor Stinner0ea2a462010-04-30 00:22:08 +00003916 size = PyBytes_GET_SIZE(output);
3917 data = PyBytes_AS_STRING(output);
Victor Stinner12174a52014-08-15 23:17:38 +02003918 if ((size_t)size != strlen(data)) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003919 PyErr_SetString(PyExc_ValueError, "embedded null byte");
Martin v. Löwis011e8422009-05-05 04:43:17 +00003920 Py_DECREF(output);
3921 return 0;
3922 }
3923 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003924 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003925}
3926
3927
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003928int
3929PyUnicode_FSDecoder(PyObject* arg, void* addr)
3930{
Brett Cannona5711202016-09-06 19:36:01 -07003931 int is_buffer = 0;
3932 PyObject *path = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003933 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003934 if (arg == NULL) {
3935 Py_DECREF(*(PyObject**)addr);
Serhiy Storchaka40db90c2017-04-20 21:19:31 +03003936 *(PyObject**)addr = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003937 return 1;
3938 }
Brett Cannona5711202016-09-06 19:36:01 -07003939
3940 is_buffer = PyObject_CheckBuffer(arg);
3941 if (!is_buffer) {
3942 path = PyOS_FSPath(arg);
3943 if (path == NULL) {
Serhiy Storchakafebc3322016-08-06 23:29:29 +03003944 return 0;
3945 }
Brett Cannona5711202016-09-06 19:36:01 -07003946 }
3947 else {
3948 path = arg;
3949 Py_INCREF(arg);
3950 }
3951
3952 if (PyUnicode_Check(path)) {
Brett Cannona5711202016-09-06 19:36:01 -07003953 output = path;
3954 }
3955 else if (PyBytes_Check(path) || is_buffer) {
3956 PyObject *path_bytes = NULL;
3957
3958 if (!PyBytes_Check(path) &&
3959 PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
Victor Stinner998b8062018-09-12 00:23:25 +02003960 "path should be string, bytes, or os.PathLike, not %.200s",
3961 Py_TYPE(arg)->tp_name)) {
3962 Py_DECREF(path);
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003963 return 0;
Brett Cannona5711202016-09-06 19:36:01 -07003964 }
3965 path_bytes = PyBytes_FromObject(path);
3966 Py_DECREF(path);
3967 if (!path_bytes) {
3968 return 0;
3969 }
3970 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(path_bytes),
3971 PyBytes_GET_SIZE(path_bytes));
3972 Py_DECREF(path_bytes);
3973 if (!output) {
3974 return 0;
3975 }
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003976 }
Serhiy Storchaka9305d832016-06-18 13:53:36 +03003977 else {
3978 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003979 "path should be string, bytes, or os.PathLike, not %.200s",
3980 Py_TYPE(arg)->tp_name);
Brett Cannona5711202016-09-06 19:36:01 -07003981 Py_DECREF(path);
Serhiy Storchaka9305d832016-06-18 13:53:36 +03003982 return 0;
3983 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05003984 if (PyUnicode_READY(output) == -1) {
Victor Stinner065836e2011-10-27 01:56:33 +02003985 Py_DECREF(output);
3986 return 0;
3987 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003988 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02003989 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003990 PyErr_SetString(PyExc_ValueError, "embedded null character");
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003991 Py_DECREF(output);
3992 return 0;
3993 }
3994 *(PyObject**)addr = output;
3995 return Py_CLEANUP_SUPPORTED;
3996}
3997
3998
Serhiy Storchaka2a404b62017-01-22 23:07:07 +02003999const char *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004000PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00004001{
Christian Heimesf3863112007-11-22 07:46:41 +00004002 PyObject *bytes;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004003
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00004004 if (!PyUnicode_Check(unicode)) {
4005 PyErr_BadArgument();
4006 return NULL;
4007 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004008 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00004009 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004010
Victor Stinnere90fe6a2011-10-01 16:48:13 +02004011 if (PyUnicode_UTF8(unicode) == NULL) {
4012 assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03004013 bytes = _PyUnicode_AsUTF8String(unicode, NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004014 if (bytes == NULL)
4015 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004016 _PyUnicode_UTF8(unicode) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
4017 if (_PyUnicode_UTF8(unicode) == NULL) {
Victor Stinnera5afb582013-10-29 01:28:23 +01004018 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004019 Py_DECREF(bytes);
4020 return NULL;
4021 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004022 _PyUnicode_UTF8_LENGTH(unicode) = PyBytes_GET_SIZE(bytes);
Christian Heimesf051e432016-09-13 20:22:02 +02004023 memcpy(_PyUnicode_UTF8(unicode),
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004024 PyBytes_AS_STRING(bytes),
4025 _PyUnicode_UTF8_LENGTH(unicode) + 1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004026 Py_DECREF(bytes);
4027 }
4028
4029 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02004030 *psize = PyUnicode_UTF8_LENGTH(unicode);
4031 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00004032}
4033
Serhiy Storchaka2a404b62017-01-22 23:07:07 +02004034const char *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004035PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00004036{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004037 return PyUnicode_AsUTF8AndSize(unicode, NULL);
4038}
4039
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004040Py_UNICODE *
4041PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
4042{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004043 if (!PyUnicode_Check(unicode)) {
4044 PyErr_BadArgument();
4045 return NULL;
4046 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03004047 Py_UNICODE *w = _PyUnicode_WSTR(unicode);
4048 if (w == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004049 /* Non-ASCII compact unicode object */
Serhiy Storchakac46db922018-10-23 22:58:24 +03004050 assert(_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND);
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004051 assert(PyUnicode_IS_READY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004052
Serhiy Storchakac46db922018-10-23 22:58:24 +03004053 Py_ssize_t wlen = unicode_get_widechar_size(unicode);
4054 if ((size_t)wlen > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
4055 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004056 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004057 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03004058 w = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) * (wlen + 1));
4059 if (w == NULL) {
4060 PyErr_NoMemory();
4061 return NULL;
4062 }
4063 unicode_copy_as_widechar(unicode, w, wlen + 1);
4064 _PyUnicode_WSTR(unicode) = w;
4065 if (!PyUnicode_IS_COMPACT_ASCII(unicode)) {
4066 _PyUnicode_WSTR_LENGTH(unicode) = wlen;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004067 }
4068 }
4069 if (size != NULL)
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004070 *size = PyUnicode_WSTR_LENGTH(unicode);
Serhiy Storchakac46db922018-10-23 22:58:24 +03004071 return w;
Martin v. Löwis5b222132007-06-10 09:51:05 +00004072}
4073
Alexander Belopolsky40018472011-02-26 01:02:56 +00004074Py_UNICODE *
4075PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004076{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004077 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004078}
4079
Serhiy Storchakaf7eae0a2017-06-28 08:30:06 +03004080const Py_UNICODE *
4081_PyUnicode_AsUnicode(PyObject *unicode)
4082{
4083 Py_ssize_t size;
4084 const Py_UNICODE *wstr;
4085
4086 wstr = PyUnicode_AsUnicodeAndSize(unicode, &size);
4087 if (wstr && wcslen(wstr) != (size_t)size) {
4088 PyErr_SetString(PyExc_ValueError, "embedded null character");
4089 return NULL;
4090 }
4091 return wstr;
4092}
4093
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004094
Alexander Belopolsky40018472011-02-26 01:02:56 +00004095Py_ssize_t
4096PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004097{
4098 if (!PyUnicode_Check(unicode)) {
4099 PyErr_BadArgument();
4100 goto onError;
4101 }
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02004102 if (_PyUnicode_WSTR(unicode) == NULL) {
4103 if (PyUnicode_AsUnicode(unicode) == NULL)
4104 goto onError;
4105 }
4106 return PyUnicode_WSTR_LENGTH(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004107
Benjamin Peterson29060642009-01-31 22:14:21 +00004108 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004109 return -1;
4110}
4111
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004112Py_ssize_t
4113PyUnicode_GetLength(PyObject *unicode)
4114{
Victor Stinner07621332012-06-16 04:53:46 +02004115 if (!PyUnicode_Check(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004116 PyErr_BadArgument();
4117 return -1;
4118 }
Victor Stinner07621332012-06-16 04:53:46 +02004119 if (PyUnicode_READY(unicode) == -1)
4120 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004121 return PyUnicode_GET_LENGTH(unicode);
4122}
4123
4124Py_UCS4
4125PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
4126{
Victor Stinner69ed0f42013-04-09 21:48:24 +02004127 void *data;
4128 int kind;
4129
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +03004130 if (!PyUnicode_Check(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02004131 PyErr_BadArgument();
4132 return (Py_UCS4)-1;
4133 }
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +03004134 if (PyUnicode_READY(unicode) == -1) {
4135 return (Py_UCS4)-1;
4136 }
Victor Stinnerc4b49542011-12-11 22:44:26 +01004137 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02004138 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004139 return (Py_UCS4)-1;
4140 }
Victor Stinner69ed0f42013-04-09 21:48:24 +02004141 data = PyUnicode_DATA(unicode);
4142 kind = PyUnicode_KIND(unicode);
4143 return PyUnicode_READ(kind, data, index);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004144}
4145
4146int
4147PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
4148{
4149 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02004150 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004151 return -1;
4152 }
Victor Stinner488fa492011-12-12 00:01:39 +01004153 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerc4b49542011-12-11 22:44:26 +01004154 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02004155 PyErr_SetString(PyExc_IndexError, "string index out of range");
4156 return -1;
4157 }
Victor Stinner488fa492011-12-12 00:01:39 +01004158 if (unicode_check_modifiable(unicode))
Victor Stinnercd9950f2011-10-02 00:34:53 +02004159 return -1;
Victor Stinnerc9590ad2012-03-04 01:34:37 +01004160 if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
4161 PyErr_SetString(PyExc_ValueError, "character out of range");
4162 return -1;
4163 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004164 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
4165 index, ch);
4166 return 0;
4167}
4168
/* Return the name of the default encoding, which is always "utf-8".
   The returned pointer refers to static storage; callers must not free it. */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
4174
Victor Stinner554f3f02010-06-16 23:33:54 +00004175/* create or adjust a UnicodeDecodeError */
4176static void
4177make_decode_exception(PyObject **exceptionObject,
4178 const char *encoding,
4179 const char *input, Py_ssize_t length,
4180 Py_ssize_t startpos, Py_ssize_t endpos,
4181 const char *reason)
4182{
4183 if (*exceptionObject == NULL) {
4184 *exceptionObject = PyUnicodeDecodeError_Create(
4185 encoding, input, length, startpos, endpos, reason);
4186 }
4187 else {
4188 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
4189 goto onError;
4190 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
4191 goto onError;
4192 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
4193 goto onError;
4194 }
4195 return;
4196
4197onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02004198 Py_CLEAR(*exceptionObject);
Victor Stinner554f3f02010-06-16 23:33:54 +00004199}
4200
Steve Dowercc16be82016-09-08 10:35:16 -07004201#ifdef MS_WINDOWS
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004202static int
4203widechar_resize(wchar_t **buf, Py_ssize_t *size, Py_ssize_t newsize)
4204{
4205 if (newsize > *size) {
4206 wchar_t *newbuf = *buf;
4207 if (PyMem_Resize(newbuf, wchar_t, newsize) == NULL) {
4208 PyErr_NoMemory();
4209 return -1;
4210 }
4211 *buf = newbuf;
4212 }
4213 *size = newsize;
4214 return 0;
4215}
4216
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004217/* error handling callback helper:
4218 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00004219 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004220 and adjust various state variables.
4221 return 0 on success, -1 on error
4222*/
4223
Alexander Belopolsky40018472011-02-26 01:02:56 +00004224static int
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004225unicode_decode_call_errorhandler_wchar(
4226 const char *errors, PyObject **errorHandler,
4227 const char *encoding, const char *reason,
4228 const char **input, const char **inend, Py_ssize_t *startinpos,
4229 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004230 wchar_t **buf, Py_ssize_t *bufsize, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004231{
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004232 static const char *argparse = "Un;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004233
4234 PyObject *restuple = NULL;
4235 PyObject *repunicode = NULL;
Victor Stinner596a6c42011-11-09 00:02:18 +01004236 Py_ssize_t outsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004237 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004238 Py_ssize_t requiredsize;
4239 Py_ssize_t newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004240 PyObject *inputobj = NULL;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004241 wchar_t *repwstr;
4242 Py_ssize_t repwlen;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004243
4244 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004245 *errorHandler = PyCodec_LookupError(errors);
4246 if (*errorHandler == NULL)
4247 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004248 }
4249
Victor Stinner554f3f02010-06-16 23:33:54 +00004250 make_decode_exception(exceptionObject,
4251 encoding,
4252 *input, *inend - *input,
4253 *startinpos, *endinpos,
4254 reason);
4255 if (*exceptionObject == NULL)
4256 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004257
Jeroen Demeyer196a5302019-07-04 12:31:34 +02004258 restuple = _PyObject_CallOneArg(*errorHandler, *exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004259 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004260 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004261 if (!PyTuple_Check(restuple)) {
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004262 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00004263 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004264 }
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004265 if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00004266 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004267
4268 /* Copy back the bytes variables, which might have been modified by the
4269 callback */
4270 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4271 if (!inputobj)
4272 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004273 *input = PyBytes_AS_STRING(inputobj);
4274 insize = PyBytes_GET_SIZE(inputobj);
4275 *inend = *input + insize;
4276 /* we can DECREF safely, as the exception has another reference,
4277 so the object won't go away. */
4278 Py_DECREF(inputobj);
4279
4280 if (newpos<0)
4281 newpos = insize+newpos;
4282 if (newpos<0 || newpos>insize) {
Victor Stinnera33bce02014-07-04 22:47:46 +02004283 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004284 goto onError;
4285 }
4286
4287 repwstr = PyUnicode_AsUnicodeAndSize(repunicode, &repwlen);
4288 if (repwstr == NULL)
4289 goto onError;
4290 /* need more space? (at least enough for what we
4291 have+the replacement+the rest of the string (starting
4292 at the new input position), so we won't have to check space
4293 when there are no errors in the rest of the string) */
Benjamin Peterson2b76ce62014-09-29 18:50:06 -04004294 requiredsize = *outpos;
4295 if (requiredsize > PY_SSIZE_T_MAX - repwlen)
4296 goto overflow;
4297 requiredsize += repwlen;
4298 if (requiredsize > PY_SSIZE_T_MAX - (insize - newpos))
4299 goto overflow;
4300 requiredsize += insize - newpos;
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004301 outsize = *bufsize;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004302 if (requiredsize > outsize) {
Benjamin Peterson2b76ce62014-09-29 18:50:06 -04004303 if (outsize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*outsize)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004304 requiredsize = 2*outsize;
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004305 if (widechar_resize(buf, bufsize, requiredsize) < 0) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004306 goto onError;
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004307 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004308 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004309 wcsncpy(*buf + *outpos, repwstr, repwlen);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004310 *outpos += repwlen;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004311 *endinpos = newpos;
4312 *inptr = *input + newpos;
4313
4314 /* we made it! */
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004315 Py_DECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004316 return 0;
4317
Benjamin Peterson2b76ce62014-09-29 18:50:06 -04004318 overflow:
4319 PyErr_SetString(PyExc_OverflowError,
4320 "decoded result is too long for a Python string");
4321
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004322 onError:
4323 Py_XDECREF(restuple);
4324 return -1;
4325}
Steve Dowercc16be82016-09-08 10:35:16 -07004326#endif /* MS_WINDOWS */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004327
4328static int
4329unicode_decode_call_errorhandler_writer(
4330 const char *errors, PyObject **errorHandler,
4331 const char *encoding, const char *reason,
4332 const char **input, const char **inend, Py_ssize_t *startinpos,
4333 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
4334 _PyUnicodeWriter *writer /* PyObject **output, Py_ssize_t *outpos */)
4335{
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004336 static const char *argparse = "Un;decoding error handler must return (str, int) tuple";
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004337
4338 PyObject *restuple = NULL;
4339 PyObject *repunicode = NULL;
4340 Py_ssize_t insize;
4341 Py_ssize_t newpos;
Victor Stinner170ca6f2013-04-18 00:25:28 +02004342 Py_ssize_t replen;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004343 Py_ssize_t remain;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004344 PyObject *inputobj = NULL;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004345 int need_to_grow = 0;
4346 const char *new_inptr;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004347
4348 if (*errorHandler == NULL) {
4349 *errorHandler = PyCodec_LookupError(errors);
4350 if (*errorHandler == NULL)
4351 goto onError;
4352 }
4353
4354 make_decode_exception(exceptionObject,
4355 encoding,
4356 *input, *inend - *input,
4357 *startinpos, *endinpos,
4358 reason);
4359 if (*exceptionObject == NULL)
4360 goto onError;
4361
Jeroen Demeyer196a5302019-07-04 12:31:34 +02004362 restuple = _PyObject_CallOneArg(*errorHandler, *exceptionObject);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004363 if (restuple == NULL)
4364 goto onError;
4365 if (!PyTuple_Check(restuple)) {
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004366 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004367 goto onError;
4368 }
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004369 if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004370 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004371
4372 /* Copy back the bytes variables, which might have been modified by the
4373 callback */
4374 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4375 if (!inputobj)
4376 goto onError;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004377 remain = *inend - *input - *endinpos;
Christian Heimes72b710a2008-05-26 13:28:38 +00004378 *input = PyBytes_AS_STRING(inputobj);
4379 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004380 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00004381 /* we can DECREF safely, as the exception has another reference,
4382 so the object won't go away. */
4383 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004384
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004385 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004386 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004387 if (newpos<0 || newpos>insize) {
Victor Stinnera33bce02014-07-04 22:47:46 +02004388 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00004389 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004390 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004391
Victor Stinner170ca6f2013-04-18 00:25:28 +02004392 replen = PyUnicode_GET_LENGTH(repunicode);
Serhiy Storchaka7e4b9052015-01-26 01:22:54 +02004393 if (replen > 1) {
4394 writer->min_length += replen - 1;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004395 need_to_grow = 1;
4396 }
4397 new_inptr = *input + newpos;
4398 if (*inend - new_inptr > remain) {
4399 /* We don't know the decoding algorithm here so we make the worst
4400 assumption that one byte decodes to one unicode character.
4401 If unfortunately one byte could decode to more unicode characters,
4402 the decoder may write out-of-bound then. Is it possible for the
4403 algorithms using this function? */
4404 writer->min_length += *inend - new_inptr - remain;
4405 need_to_grow = 1;
4406 }
4407 if (need_to_grow) {
Victor Stinner8f674cc2013-04-17 23:02:17 +02004408 writer->overallocate = 1;
Serhiy Storchakab7e2d672018-02-13 08:27:33 +02004409 if (_PyUnicodeWriter_Prepare(writer, writer->min_length - writer->pos,
Serhiy Storchaka7e4b9052015-01-26 01:22:54 +02004410 PyUnicode_MAX_CHAR_VALUE(repunicode)) == -1)
4411 goto onError;
4412 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004413 if (_PyUnicodeWriter_WriteStr(writer, repunicode) == -1)
Victor Stinner376cfa12013-04-17 23:58:16 +02004414 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004415
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004416 *endinpos = newpos;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004417 *inptr = new_inptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004418
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004419 /* we made it! */
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004420 Py_DECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004421 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004422
Benjamin Peterson29060642009-01-31 22:14:21 +00004423 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004424 Py_XDECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004425 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004426}
4427
/* --- UTF-7 Codec -------------------------------------------------------- */

/* See RFC2152 for details.  We encode conservatively and decode liberally. */

/* Three simple macros defining base-64. */

/* Is c a base-64 character? */

#define IS_BASE64(c) \
    (((c) >= 'A' && (c) <= 'Z') ||     \
     ((c) >= 'a' && (c) <= 'z') ||     \
     ((c) >= '0' && (c) <= '9') ||     \
     (c) == '+' || (c) == '/')

/* given that c is a base-64 character, what is its base-64 value? */

#define FROM_BASE64(c)                                                  \
    (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :                           \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :                      \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :                      \
     (c) == '+' ? 62 : 63)

/* What is the base-64 character of the bottom 6 bits of n? */

#define TO_BASE64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
 * decoded as itself.  We are permissive on decoding; the only ASCII
 * byte not decoding to itself is the + which begins a base64
 * string. */

#define DECODE_DIRECT(c)                                \
    ((c) <= 127 && (c) != '+')

/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 */

/* The table is a read-only lookup, so it is declared const. */
static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * directO and directWS are parenthesized in the expansion so that a
 * compound argument (for example `a || b`) is evaluated as a unit. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004508
Alexander Belopolsky40018472011-02-26 01:02:56 +00004509PyObject *
4510PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004511 Py_ssize_t size,
4512 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004513{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004514 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
4515}
4516
Antoine Pitrou244651a2009-05-04 18:56:13 +00004517/* The decoder. The only state we preserve is our read position,
4518 * i.e. how many characters we have consumed. So if we end in the
4519 * middle of a shift sequence we have to back off the read position
4520 * and the output to the beginning of the sequence, otherwise we lose
4521 * all the shift state (seen bits, number of bits seen, high
4522 * surrogate). */
4523
Alexander Belopolsky40018472011-02-26 01:02:56 +00004524PyObject *
4525PyUnicode_DecodeUTF7Stateful(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004526 Py_ssize_t size,
4527 const char *errors,
4528 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004529{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004530 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004531 Py_ssize_t startinpos;
4532 Py_ssize_t endinpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004533 const char *e;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004534 _PyUnicodeWriter writer;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004535 const char *errmsg = "";
4536 int inShift = 0;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004537 Py_ssize_t shiftOutStart;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004538 unsigned int base64bits = 0;
4539 unsigned long base64buffer = 0;
Victor Stinner24729f32011-11-10 20:31:37 +01004540 Py_UCS4 surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004541 PyObject *errorHandler = NULL;
4542 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004543
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004544 if (size == 0) {
4545 if (consumed)
4546 *consumed = 0;
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02004547 _Py_RETURN_UNICODE_EMPTY();
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004548 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004549
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004550 /* Start off assuming it's all ASCII. Widen later as necessary. */
Victor Stinner8f674cc2013-04-17 23:02:17 +02004551 _PyUnicodeWriter_Init(&writer);
4552 writer.min_length = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004553
4554 shiftOutStart = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004555 e = s + size;
4556
4557 while (s < e) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004558 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00004559 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00004560 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004561
Antoine Pitrou244651a2009-05-04 18:56:13 +00004562 if (inShift) { /* in a base-64 section */
4563 if (IS_BASE64(ch)) { /* consume a base-64 character */
4564 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
4565 base64bits += 6;
4566 s++;
4567 if (base64bits >= 16) {
4568 /* we have enough bits for a UTF-16 value */
Victor Stinner24729f32011-11-10 20:31:37 +01004569 Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
Antoine Pitrou244651a2009-05-04 18:56:13 +00004570 base64bits -= 16;
4571 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
Serhiy Storchaka35804e42013-10-19 20:38:19 +03004572 assert(outCh <= 0xffff);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004573 if (surrogate) {
4574 /* expecting a second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004575 if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
4576 Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004577 if (_PyUnicodeWriter_WriteCharInline(&writer, ch2) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004578 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004579 surrogate = 0;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01004580 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004581 }
4582 else {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004583 if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
Antoine Pitrou78edf752011-11-15 01:44:16 +01004584 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004585 surrogate = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004586 }
4587 }
Victor Stinner551ac952011-11-29 22:58:13 +01004588 if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004589 /* first surrogate */
4590 surrogate = outCh;
4591 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004592 else {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004593 if (_PyUnicodeWriter_WriteCharInline(&writer, outCh) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004594 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004595 }
4596 }
4597 }
4598 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004599 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004600 if (base64bits > 0) { /* left-over bits */
4601 if (base64bits >= 6) {
4602 /* We've seen at least one base-64 character */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004603 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004604 errmsg = "partial character in shift sequence";
4605 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004606 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004607 else {
4608 /* Some bits remain; they should be zero */
4609 if (base64buffer != 0) {
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004610 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004611 errmsg = "non-zero padding bits in shift sequence";
4612 goto utf7Error;
4613 }
4614 }
4615 }
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004616 if (surrogate && DECODE_DIRECT(ch)) {
4617 if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
4618 goto onError;
4619 }
4620 surrogate = 0;
4621 if (ch == '-') {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004622 /* '-' is absorbed; other terminating
4623 characters are preserved */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004624 s++;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004625 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004626 }
4627 }
4628 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004629 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004630 s++; /* consume '+' */
4631 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004632 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004633 if (_PyUnicodeWriter_WriteCharInline(&writer, '+') < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004634 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004635 }
Zackery Spytze349bf22018-08-18 22:43:38 -06004636 else if (s < e && !IS_BASE64(*s)) {
4637 s++;
4638 errmsg = "ill-formed sequence";
4639 goto utf7Error;
4640 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004641 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004642 inShift = 1;
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004643 surrogate = 0;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004644 shiftOutStart = writer.pos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004645 base64bits = 0;
Serhiy Storchaka35804e42013-10-19 20:38:19 +03004646 base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004647 }
4648 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004649 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004650 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004651 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004652 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004653 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004654 else {
4655 startinpos = s-starts;
4656 s++;
4657 errmsg = "unexpected special character";
4658 goto utf7Error;
4659 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004660 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004661utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004662 endinpos = s-starts;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004663 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00004664 errors, &errorHandler,
4665 "utf7", errmsg,
4666 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004667 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00004668 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004669 }
4670
Antoine Pitrou244651a2009-05-04 18:56:13 +00004671 /* end of string */
4672
4673 if (inShift && !consumed) { /* in shift sequence, no more to follow */
4674 /* if we're in an inconsistent state, that's an error */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004675 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004676 if (surrogate ||
4677 (base64bits >= 6) ||
4678 (base64bits > 0 && base64buffer != 0)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004679 endinpos = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004680 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrou244651a2009-05-04 18:56:13 +00004681 errors, &errorHandler,
4682 "utf7", "unterminated shift sequence",
4683 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004684 &writer))
Antoine Pitrou244651a2009-05-04 18:56:13 +00004685 goto onError;
4686 if (s < e)
4687 goto restart;
4688 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004689 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004690
4691 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004692 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004693 if (inShift) {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004694 *consumed = startinpos;
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004695 if (writer.pos != shiftOutStart && writer.maxchar > 127) {
Serhiy Storchaka016a3f32014-02-08 14:01:29 +02004696 PyObject *result = PyUnicode_FromKindAndData(
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004697 writer.kind, writer.data, shiftOutStart);
4698 Py_XDECREF(errorHandler);
4699 Py_XDECREF(exc);
4700 _PyUnicodeWriter_Dealloc(&writer);
4701 return result;
Serhiy Storchaka016a3f32014-02-08 14:01:29 +02004702 }
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004703 writer.pos = shiftOutStart; /* back off output */
Antoine Pitrou244651a2009-05-04 18:56:13 +00004704 }
4705 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004706 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004707 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004708 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004709
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004710 Py_XDECREF(errorHandler);
4711 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004712 return _PyUnicodeWriter_Finish(&writer);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004713
Benjamin Peterson29060642009-01-31 22:14:21 +00004714 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004715 Py_XDECREF(errorHandler);
4716 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004717 _PyUnicodeWriter_Dealloc(&writer);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004718 return NULL;
4719}
4720
4721
Alexander Belopolsky40018472011-02-26 01:02:56 +00004722PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004723_PyUnicode_EncodeUTF7(PyObject *str,
4724 int base64SetO,
4725 int base64WhiteSpace,
4726 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004727{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004728 int kind;
4729 void *data;
4730 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004731 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004732 int inShift = 0;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004733 Py_ssize_t i;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004734 unsigned int base64bits = 0;
4735 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004736 char * out;
4737 char * start;
4738
Benjamin Petersonbac79492012-01-14 13:34:47 -05004739 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004740 return NULL;
4741 kind = PyUnicode_KIND(str);
4742 data = PyUnicode_DATA(str);
4743 len = PyUnicode_GET_LENGTH(str);
4744
4745 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004746 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004747
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004748 /* It might be possible to tighten this worst case */
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004749 if (len > PY_SSIZE_T_MAX / 8)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004750 return PyErr_NoMemory();
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004751 v = PyBytes_FromStringAndSize(NULL, len * 8);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004752 if (v == NULL)
4753 return NULL;
4754
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004755 start = out = PyBytes_AS_STRING(v);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004756 for (i = 0; i < len; ++i) {
Victor Stinner0e368262011-11-10 20:12:49 +01004757 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004758
Antoine Pitrou244651a2009-05-04 18:56:13 +00004759 if (inShift) {
4760 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4761 /* shifting out */
4762 if (base64bits) { /* output remaining bits */
4763 *out++ = TO_BASE64(base64buffer << (6-base64bits));
4764 base64buffer = 0;
4765 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004766 }
4767 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004768 /* Characters not in the BASE64 set implicitly unshift the sequence
4769 so no '-' is required, except if the character is itself a '-' */
4770 if (IS_BASE64(ch) || ch == '-') {
4771 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004772 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004773 *out++ = (char) ch;
4774 }
4775 else {
4776 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00004777 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004778 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004779 else { /* not in a shift sequence */
4780 if (ch == '+') {
4781 *out++ = '+';
4782 *out++ = '-';
4783 }
4784 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4785 *out++ = (char) ch;
4786 }
4787 else {
4788 *out++ = '+';
4789 inShift = 1;
4790 goto encode_char;
4791 }
4792 }
4793 continue;
4794encode_char:
Antoine Pitrou244651a2009-05-04 18:56:13 +00004795 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01004796 assert(ch <= MAX_UNICODE);
Victor Stinner0d3721d2011-11-22 03:27:53 +01004797
Antoine Pitrou244651a2009-05-04 18:56:13 +00004798 /* code first surrogate */
4799 base64bits += 16;
Victor Stinner76df43d2012-10-30 01:42:39 +01004800 base64buffer = (base64buffer << 16) | Py_UNICODE_HIGH_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004801 while (base64bits >= 6) {
4802 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4803 base64bits -= 6;
4804 }
4805 /* prepare second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004806 ch = Py_UNICODE_LOW_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004807 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004808 base64bits += 16;
4809 base64buffer = (base64buffer << 16) | ch;
4810 while (base64bits >= 6) {
4811 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4812 base64bits -= 6;
4813 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00004814 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004815 if (base64bits)
4816 *out++= TO_BASE64(base64buffer << (6-base64bits) );
4817 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004818 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004819 if (_PyBytes_Resize(&v, out - start) < 0)
4820 return NULL;
4821 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004822}
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004823PyObject *
4824PyUnicode_EncodeUTF7(const Py_UNICODE *s,
4825 Py_ssize_t size,
4826 int base64SetO,
4827 int base64WhiteSpace,
4828 const char *errors)
4829{
4830 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02004831 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004832 if (tmp == NULL)
4833 return NULL;
Victor Stinner0e368262011-11-10 20:12:49 +01004834 result = _PyUnicode_EncodeUTF7(tmp, base64SetO,
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004835 base64WhiteSpace, errors);
4836 Py_DECREF(tmp);
4837 return result;
4838}
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004839
Antoine Pitrou244651a2009-05-04 18:56:13 +00004840#undef IS_BASE64
4841#undef FROM_BASE64
4842#undef TO_BASE64
4843#undef DECODE_DIRECT
4844#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004845
Guido van Rossumd57fd912000-03-10 22:53:23 +00004846/* --- UTF-8 Codec -------------------------------------------------------- */
4847
Alexander Belopolsky40018472011-02-26 01:02:56 +00004848PyObject *
4849PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004850 Py_ssize_t size,
4851 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004852{
Walter Dörwald69652032004-09-07 20:24:22 +00004853 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4854}
4855
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004856#include "stringlib/asciilib.h"
4857#include "stringlib/codecs.h"
4858#include "stringlib/undef.h"
4859
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004860#include "stringlib/ucs1lib.h"
4861#include "stringlib/codecs.h"
4862#include "stringlib/undef.h"
4863
4864#include "stringlib/ucs2lib.h"
4865#include "stringlib/codecs.h"
4866#include "stringlib/undef.h"
4867
4868#include "stringlib/ucs4lib.h"
4869#include "stringlib/codecs.h"
4870#include "stringlib/undef.h"
4871
Antoine Pitrouab868312009-01-10 15:40:25 +00004872/* Mask to quickly check whether a C 'long' contains a
4873 non-ASCII, UTF8-encoded char. */
4874#if (SIZEOF_LONG == 8)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004875# define ASCII_CHAR_MASK 0x8080808080808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004876#elif (SIZEOF_LONG == 4)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004877# define ASCII_CHAR_MASK 0x80808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004878#else
4879# error C 'long' size should be either 4 or 8!
4880#endif
4881
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004882static Py_ssize_t
4883ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004884{
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004885 const char *p = start;
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004886 const char *aligned_end = (const char *) _Py_ALIGN_DOWN(end, SIZEOF_LONG);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004887
Antoine Pitrou8b0e9842013-05-11 15:58:34 +02004888 /*
4889 * Issue #17237: m68k is a bit different from most architectures in
4890 * that objects do not use "natural alignment" - for example, int and
4891 * long are only aligned at 2-byte boundaries. Therefore the assert()
4892 * won't work; also, tests have shown that skipping the "optimised
4893 * version" will even speed up m68k.
4894 */
4895#if !defined(__m68k__)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004896#if SIZEOF_LONG <= SIZEOF_VOID_P
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004897 assert(_Py_IS_ALIGNED(dest, SIZEOF_LONG));
4898 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004899 /* Fast path, see in STRINGLIB(utf8_decode) for
4900 an explanation. */
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02004901 /* Help allocation */
4902 const char *_p = p;
4903 Py_UCS1 * q = dest;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004904 while (_p < aligned_end) {
4905 unsigned long value = *(const unsigned long *) _p;
4906 if (value & ASCII_CHAR_MASK)
Benjamin Peterson29060642009-01-31 22:14:21 +00004907 break;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004908 *((unsigned long *)q) = value;
4909 _p += SIZEOF_LONG;
4910 q += SIZEOF_LONG;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004911 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004912 p = _p;
4913 while (p < end) {
4914 if ((unsigned char)*p & 0x80)
4915 break;
4916 *q++ = *p++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004917 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004918 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004919 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004920#endif
Antoine Pitrou8b0e9842013-05-11 15:58:34 +02004921#endif
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004922 while (p < end) {
4923 /* Fast path, see in STRINGLIB(utf8_decode) in stringlib/codecs.h
4924 for an explanation. */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004925 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02004926 /* Help allocation */
4927 const char *_p = p;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004928 while (_p < aligned_end) {
4929 unsigned long value = *(unsigned long *) _p;
4930 if (value & ASCII_CHAR_MASK)
4931 break;
4932 _p += SIZEOF_LONG;
4933 }
4934 p = _p;
4935 if (_p == end)
4936 break;
4937 }
4938 if ((unsigned char)*p & 0x80)
4939 break;
4940 ++p;
4941 }
4942 memcpy(dest, start, p - start);
4943 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004944}
Antoine Pitrouab868312009-01-10 15:40:25 +00004945
Victor Stinner709d23d2019-05-02 14:56:30 -04004946static PyObject *
4947unicode_decode_utf8(const char *s, Py_ssize_t size,
4948 _Py_error_handler error_handler, const char *errors,
4949 Py_ssize_t *consumed)
Victor Stinner785938e2011-12-11 20:09:03 +01004950{
Victor Stinner785938e2011-12-11 20:09:03 +01004951 if (size == 0) {
4952 if (consumed)
4953 *consumed = 0;
Serhiy Storchaka678db842013-01-26 12:16:36 +02004954 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner785938e2011-12-11 20:09:03 +01004955 }
4956
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004957 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
4958 if (size == 1 && (unsigned char)s[0] < 128) {
Victor Stinner785938e2011-12-11 20:09:03 +01004959 if (consumed)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004960 *consumed = 1;
4961 return get_latin1_char((unsigned char)s[0]);
Victor Stinner785938e2011-12-11 20:09:03 +01004962 }
4963
Inada Naoki770847a2019-06-24 12:30:24 +09004964 const char *starts = s;
4965 const char *end = s + size;
Victor Stinner785938e2011-12-11 20:09:03 +01004966
Inada Naoki770847a2019-06-24 12:30:24 +09004967 // fast path: try ASCII string.
4968 PyObject *u = PyUnicode_New(size, 127);
4969 if (u == NULL) {
4970 return NULL;
4971 }
4972 s += ascii_decode(s, end, PyUnicode_DATA(u));
4973 if (s == end) {
4974 return u;
4975 }
4976
4977 // Use _PyUnicodeWriter after fast path is failed.
4978 _PyUnicodeWriter writer;
4979 _PyUnicodeWriter_InitWithBuffer(&writer, u);
4980 writer.pos = s - starts;
4981
4982 Py_ssize_t startinpos, endinpos;
4983 const char *errmsg = "";
4984 PyObject *error_handler_obj = NULL;
4985 PyObject *exc = NULL;
4986
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004987 while (s < end) {
4988 Py_UCS4 ch;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004989 int kind = writer.kind;
Victor Stinner1d65d912015-10-05 13:43:50 +02004990
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004991 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004992 if (PyUnicode_IS_ASCII(writer.buffer))
4993 ch = asciilib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004994 else
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004995 ch = ucs1lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004996 } else if (kind == PyUnicode_2BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004997 ch = ucs2lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004998 } else {
4999 assert(kind == PyUnicode_4BYTE_KIND);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005000 ch = ucs4lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005001 }
5002
5003 switch (ch) {
5004 case 0:
5005 if (s == end || consumed)
5006 goto End;
5007 errmsg = "unexpected end of data";
5008 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02005009 endinpos = end - starts;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005010 break;
5011 case 1:
5012 errmsg = "invalid start byte";
5013 startinpos = s - starts;
5014 endinpos = startinpos + 1;
5015 break;
5016 case 2:
Serhiy Storchaka894263b2019-06-25 11:54:18 +03005017 if (consumed && (unsigned char)s[0] == 0xED && end - s == 2
5018 && (unsigned char)s[1] >= 0xA0 && (unsigned char)s[1] <= 0xBF)
5019 {
5020 /* Truncated surrogate code in range D800-DFFF */
Serhiy Storchaka7a465cb2019-03-30 08:23:38 +02005021 goto End;
5022 }
Serhiy Storchaka894263b2019-06-25 11:54:18 +03005023 /* fall through */
5024 case 3:
5025 case 4:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005026 errmsg = "invalid continuation byte";
5027 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02005028 endinpos = startinpos + ch - 1;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005029 break;
5030 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005031 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005032 goto onError;
5033 continue;
5034 }
5035
Victor Stinner1d65d912015-10-05 13:43:50 +02005036 if (error_handler == _Py_ERROR_UNKNOWN)
Victor Stinner3d4226a2018-08-29 22:21:32 +02005037 error_handler = _Py_GetErrorHandler(errors);
Victor Stinner1d65d912015-10-05 13:43:50 +02005038
5039 switch (error_handler) {
5040 case _Py_ERROR_IGNORE:
5041 s += (endinpos - startinpos);
5042 break;
5043
5044 case _Py_ERROR_REPLACE:
5045 if (_PyUnicodeWriter_WriteCharInline(&writer, 0xfffd) < 0)
5046 goto onError;
5047 s += (endinpos - startinpos);
5048 break;
5049
5050 case _Py_ERROR_SURROGATEESCAPE:
Victor Stinner74e8fac2015-10-05 13:49:26 +02005051 {
5052 Py_ssize_t i;
5053
Victor Stinner1d65d912015-10-05 13:43:50 +02005054 if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
5055 goto onError;
Victor Stinner74e8fac2015-10-05 13:49:26 +02005056 for (i=startinpos; i<endinpos; i++) {
Victor Stinner1d65d912015-10-05 13:43:50 +02005057 ch = (Py_UCS4)(unsigned char)(starts[i]);
5058 PyUnicode_WRITE(writer.kind, writer.data, writer.pos,
5059 ch + 0xdc00);
5060 writer.pos++;
5061 }
5062 s += (endinpos - startinpos);
5063 break;
Victor Stinner74e8fac2015-10-05 13:49:26 +02005064 }
Victor Stinner1d65d912015-10-05 13:43:50 +02005065
5066 default:
5067 if (unicode_decode_call_errorhandler_writer(
5068 errors, &error_handler_obj,
5069 "utf-8", errmsg,
5070 &starts, &end, &startinpos, &endinpos, &exc, &s,
5071 &writer))
5072 goto onError;
5073 }
Victor Stinner785938e2011-12-11 20:09:03 +01005074 }
5075
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005076End:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005077 if (consumed)
5078 *consumed = s - starts;
5079
Victor Stinner1d65d912015-10-05 13:43:50 +02005080 Py_XDECREF(error_handler_obj);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005081 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005082 return _PyUnicodeWriter_Finish(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005083
5084onError:
Victor Stinner1d65d912015-10-05 13:43:50 +02005085 Py_XDECREF(error_handler_obj);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005086 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005087 _PyUnicodeWriter_Dealloc(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005088 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01005089}
5090
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005091
Victor Stinner709d23d2019-05-02 14:56:30 -04005092PyObject *
5093PyUnicode_DecodeUTF8Stateful(const char *s,
5094 Py_ssize_t size,
5095 const char *errors,
5096 Py_ssize_t *consumed)
5097{
5098 return unicode_decode_utf8(s, size, _Py_ERROR_UNKNOWN, errors, consumed);
5099}
5100
5101
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005102/* UTF-8 decoder: use surrogateescape error handler if 'surrogateescape' is
5103 non-zero, use strict error handler otherwise.
Victor Stinner0d92c4f2012-11-12 23:32:21 +01005104
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005105 On success, write a pointer to a newly allocated wide character string into
5106 *wstr (use PyMem_RawFree() to free the memory) and write the output length
5107 (in number of wchar_t units) into *wlen (if wlen is set).
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005108
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005109 On memory allocation failure, return -1.
5110
5111 On decoding error (if surrogateescape is zero), return -2. If wlen is
5112 non-NULL, write the start of the illegal byte sequence into *wlen. If reason
5113 is not NULL, write the decoding error message into *reason. */
5114int
5115_Py_DecodeUTF8Ex(const char *s, Py_ssize_t size, wchar_t **wstr, size_t *wlen,
Victor Stinner3d4226a2018-08-29 22:21:32 +02005116 const char **reason, _Py_error_handler errors)
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005117{
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005118 const char *orig_s = s;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005119 const char *e;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005120 wchar_t *unicode;
5121 Py_ssize_t outpos;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005122
Victor Stinner3d4226a2018-08-29 22:21:32 +02005123 int surrogateescape = 0;
5124 int surrogatepass = 0;
5125 switch (errors)
5126 {
5127 case _Py_ERROR_STRICT:
5128 break;
5129 case _Py_ERROR_SURROGATEESCAPE:
5130 surrogateescape = 1;
5131 break;
5132 case _Py_ERROR_SURROGATEPASS:
5133 surrogatepass = 1;
5134 break;
5135 default:
5136 return -3;
5137 }
5138
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005139 /* Note: size will always be longer than the resulting Unicode
5140 character count */
Victor Stinner91106cd2017-12-13 12:29:09 +01005141 if (PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) < (size + 1)) {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005142 return -1;
Victor Stinner91106cd2017-12-13 12:29:09 +01005143 }
5144
Victor Stinner6f8eeee2013-07-07 22:57:45 +02005145 unicode = PyMem_RawMalloc((size + 1) * sizeof(wchar_t));
Victor Stinner91106cd2017-12-13 12:29:09 +01005146 if (!unicode) {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005147 return -1;
Victor Stinner91106cd2017-12-13 12:29:09 +01005148 }
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005149
5150 /* Unpack UTF-8 encoded data */
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005151 e = s + size;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005152 outpos = 0;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005153 while (s < e) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005154 Py_UCS4 ch;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005155#if SIZEOF_WCHAR_T == 4
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005156 ch = ucs4lib_utf8_decode(&s, e, (Py_UCS4 *)unicode, &outpos);
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005157#else
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005158 ch = ucs2lib_utf8_decode(&s, e, (Py_UCS2 *)unicode, &outpos);
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005159#endif
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005160 if (ch > 0xFF) {
5161#if SIZEOF_WCHAR_T == 4
Barry Warsawb2e57942017-09-14 18:13:16 -07005162 Py_UNREACHABLE();
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005163#else
Serhiy Storchakab6266432016-11-12 14:28:06 +02005164 assert(ch > 0xFFFF && ch <= MAX_UNICODE);
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005165 /* write a surrogate pair */
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005166 unicode[outpos++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
5167 unicode[outpos++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
5168#endif
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005169 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005170 else {
Victor Stinner3d4226a2018-08-29 22:21:32 +02005171 if (!ch && s == e) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005172 break;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005173 }
Victor Stinner3d4226a2018-08-29 22:21:32 +02005174
5175 if (surrogateescape) {
5176 unicode[outpos++] = 0xDC00 + (unsigned char)*s++;
5177 }
5178 else {
5179 /* Is it a valid three-byte code? */
5180 if (surrogatepass
5181 && (e - s) >= 3
5182 && (s[0] & 0xf0) == 0xe0
5183 && (s[1] & 0xc0) == 0x80
5184 && (s[2] & 0xc0) == 0x80)
5185 {
5186 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
5187 s += 3;
5188 unicode[outpos++] = ch;
5189 }
5190 else {
5191 PyMem_RawFree(unicode );
5192 if (reason != NULL) {
5193 switch (ch) {
5194 case 0:
5195 *reason = "unexpected end of data";
5196 break;
5197 case 1:
5198 *reason = "invalid start byte";
5199 break;
5200 /* 2, 3, 4 */
5201 default:
5202 *reason = "invalid continuation byte";
5203 break;
5204 }
5205 }
5206 if (wlen != NULL) {
5207 *wlen = s - orig_s;
5208 }
5209 return -2;
5210 }
5211 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005212 }
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005213 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005214 unicode[outpos] = L'\0';
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005215 if (wlen) {
5216 *wlen = outpos;
Victor Stinner91106cd2017-12-13 12:29:09 +01005217 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005218 *wstr = unicode;
5219 return 0;
5220}
5221
Victor Stinner5f9cf232019-03-19 01:46:25 +01005222
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005223wchar_t*
Victor Stinner5f9cf232019-03-19 01:46:25 +01005224_Py_DecodeUTF8_surrogateescape(const char *arg, Py_ssize_t arglen,
5225 size_t *wlen)
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005226{
5227 wchar_t *wstr;
Victor Stinner5f9cf232019-03-19 01:46:25 +01005228 int res = _Py_DecodeUTF8Ex(arg, arglen,
5229 &wstr, wlen,
5230 NULL, _Py_ERROR_SURROGATEESCAPE);
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005231 if (res != 0) {
Victor Stinner5f9cf232019-03-19 01:46:25 +01005232 /* _Py_DecodeUTF8Ex() must support _Py_ERROR_SURROGATEESCAPE */
5233 assert(res != -3);
5234 if (wlen) {
5235 *wlen = (size_t)res;
5236 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005237 return NULL;
5238 }
5239 return wstr;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005240}
5241
Antoine Pitrouab868312009-01-10 15:40:25 +00005242
Victor Stinnere47e6982017-12-21 15:45:16 +01005243/* UTF-8 encoder using the surrogateescape error handler .
5244
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005245 On success, return 0 and write the newly allocated character string (use
5246 PyMem_Free() to free the memory) into *str.
Victor Stinnere47e6982017-12-21 15:45:16 +01005247
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005248 On encoding failure, return -2 and write the position of the invalid
5249 surrogate character into *error_pos (if error_pos is set) and the decoding
5250 error message into *reason (if reason is set).
Victor Stinnere47e6982017-12-21 15:45:16 +01005251
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005252 On memory allocation failure, return -1. */
5253int
5254_Py_EncodeUTF8Ex(const wchar_t *text, char **str, size_t *error_pos,
Victor Stinner3d4226a2018-08-29 22:21:32 +02005255 const char **reason, int raw_malloc, _Py_error_handler errors)
Victor Stinnere47e6982017-12-21 15:45:16 +01005256{
5257 const Py_ssize_t max_char_size = 4;
5258 Py_ssize_t len = wcslen(text);
5259
5260 assert(len >= 0);
5261
Victor Stinner3d4226a2018-08-29 22:21:32 +02005262 int surrogateescape = 0;
5263 int surrogatepass = 0;
5264 switch (errors)
5265 {
5266 case _Py_ERROR_STRICT:
5267 break;
5268 case _Py_ERROR_SURROGATEESCAPE:
5269 surrogateescape = 1;
5270 break;
5271 case _Py_ERROR_SURROGATEPASS:
5272 surrogatepass = 1;
5273 break;
5274 default:
5275 return -3;
5276 }
5277
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005278 if (len > PY_SSIZE_T_MAX / max_char_size - 1) {
5279 return -1;
5280 }
Victor Stinnere47e6982017-12-21 15:45:16 +01005281 char *bytes;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005282 if (raw_malloc) {
5283 bytes = PyMem_RawMalloc((len + 1) * max_char_size);
Victor Stinnere47e6982017-12-21 15:45:16 +01005284 }
5285 else {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005286 bytes = PyMem_Malloc((len + 1) * max_char_size);
Victor Stinnere47e6982017-12-21 15:45:16 +01005287 }
5288 if (bytes == NULL) {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005289 return -1;
Victor Stinnere47e6982017-12-21 15:45:16 +01005290 }
5291
5292 char *p = bytes;
5293 Py_ssize_t i;
Victor Stinner3d4226a2018-08-29 22:21:32 +02005294 for (i = 0; i < len; ) {
5295 Py_ssize_t ch_pos = i;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005296 Py_UCS4 ch = text[i];
Victor Stinner3d4226a2018-08-29 22:21:32 +02005297 i++;
5298#if Py_UNICODE_SIZE == 2
5299 if (Py_UNICODE_IS_HIGH_SURROGATE(ch)
5300 && i < len
5301 && Py_UNICODE_IS_LOW_SURROGATE(text[i]))
5302 {
5303 ch = Py_UNICODE_JOIN_SURROGATES(ch, text[i]);
5304 i++;
5305 }
5306#endif
Victor Stinnere47e6982017-12-21 15:45:16 +01005307
5308 if (ch < 0x80) {
5309 /* Encode ASCII */
5310 *p++ = (char) ch;
5311
5312 }
5313 else if (ch < 0x0800) {
5314 /* Encode Latin-1 */
5315 *p++ = (char)(0xc0 | (ch >> 6));
5316 *p++ = (char)(0x80 | (ch & 0x3f));
5317 }
Victor Stinner3d4226a2018-08-29 22:21:32 +02005318 else if (Py_UNICODE_IS_SURROGATE(ch) && !surrogatepass) {
Victor Stinnere47e6982017-12-21 15:45:16 +01005319 /* surrogateescape error handler */
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005320 if (!surrogateescape || !(0xDC80 <= ch && ch <= 0xDCFF)) {
Victor Stinnere47e6982017-12-21 15:45:16 +01005321 if (error_pos != NULL) {
Victor Stinner3d4226a2018-08-29 22:21:32 +02005322 *error_pos = (size_t)ch_pos;
Victor Stinnere47e6982017-12-21 15:45:16 +01005323 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005324 if (reason != NULL) {
5325 *reason = "encoding error";
5326 }
5327 if (raw_malloc) {
5328 PyMem_RawFree(bytes);
5329 }
5330 else {
5331 PyMem_Free(bytes);
5332 }
5333 return -2;
Victor Stinnere47e6982017-12-21 15:45:16 +01005334 }
5335 *p++ = (char)(ch & 0xff);
5336 }
5337 else if (ch < 0x10000) {
5338 *p++ = (char)(0xe0 | (ch >> 12));
5339 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
5340 *p++ = (char)(0x80 | (ch & 0x3f));
5341 }
5342 else { /* ch >= 0x10000 */
5343 assert(ch <= MAX_UNICODE);
5344 /* Encode UCS4 Unicode ordinals */
5345 *p++ = (char)(0xf0 | (ch >> 18));
5346 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
5347 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
5348 *p++ = (char)(0x80 | (ch & 0x3f));
5349 }
5350 }
5351 *p++ = '\0';
5352
5353 size_t final_size = (p - bytes);
Victor Stinner9dd76202017-12-21 16:20:32 +01005354 char *bytes2;
5355 if (raw_malloc) {
5356 bytes2 = PyMem_RawRealloc(bytes, final_size);
5357 }
5358 else {
5359 bytes2 = PyMem_Realloc(bytes, final_size);
5360 }
Victor Stinnere47e6982017-12-21 15:45:16 +01005361 if (bytes2 == NULL) {
5362 if (error_pos != NULL) {
5363 *error_pos = (size_t)-1;
5364 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005365 if (raw_malloc) {
5366 PyMem_RawFree(bytes);
5367 }
5368 else {
5369 PyMem_Free(bytes);
5370 }
5371 return -1;
Victor Stinnere47e6982017-12-21 15:45:16 +01005372 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005373 *str = bytes2;
5374 return 0;
Victor Stinnere47e6982017-12-21 15:45:16 +01005375}
5376
5377
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005378/* Primary internal function which creates utf8 encoded bytes objects.
5379
5380 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00005381 and allocate exactly as much space needed at the end. Else allocate the
5382 maximum possible needed (4 result bytes per Unicode character), and return
5383 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00005384*/
Victor Stinner709d23d2019-05-02 14:56:30 -04005385static PyObject *
5386unicode_encode_utf8(PyObject *unicode, _Py_error_handler error_handler,
5387 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005388{
Victor Stinner6099a032011-12-18 14:22:26 +01005389 enum PyUnicode_Kind kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005390 void *data;
5391 Py_ssize_t size;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00005392
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005393 if (!PyUnicode_Check(unicode)) {
5394 PyErr_BadArgument();
5395 return NULL;
5396 }
5397
5398 if (PyUnicode_READY(unicode) == -1)
5399 return NULL;
5400
Victor Stinnere90fe6a2011-10-01 16:48:13 +02005401 if (PyUnicode_UTF8(unicode))
5402 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
5403 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005404
5405 kind = PyUnicode_KIND(unicode);
5406 data = PyUnicode_DATA(unicode);
5407 size = PyUnicode_GET_LENGTH(unicode);
5408
Benjamin Petersonead6b532011-12-20 17:23:42 -06005409 switch (kind) {
Victor Stinner6099a032011-12-18 14:22:26 +01005410 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07005411 Py_UNREACHABLE();
Victor Stinner6099a032011-12-18 14:22:26 +01005412 case PyUnicode_1BYTE_KIND:
5413 /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
5414 assert(!PyUnicode_IS_ASCII(unicode));
Victor Stinner709d23d2019-05-02 14:56:30 -04005415 return ucs1lib_utf8_encoder(unicode, data, size, error_handler, errors);
Victor Stinner6099a032011-12-18 14:22:26 +01005416 case PyUnicode_2BYTE_KIND:
Victor Stinner709d23d2019-05-02 14:56:30 -04005417 return ucs2lib_utf8_encoder(unicode, data, size, error_handler, errors);
Victor Stinner6099a032011-12-18 14:22:26 +01005418 case PyUnicode_4BYTE_KIND:
Victor Stinner709d23d2019-05-02 14:56:30 -04005419 return ucs4lib_utf8_encoder(unicode, data, size, error_handler, errors);
Tim Peters602f7402002-04-27 18:03:26 +00005420 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005421}
5422
Alexander Belopolsky40018472011-02-26 01:02:56 +00005423PyObject *
Victor Stinner709d23d2019-05-02 14:56:30 -04005424_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
5425{
5426 return unicode_encode_utf8(unicode, _Py_ERROR_UNKNOWN, errors);
5427}
5428
5429
5430PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005431PyUnicode_EncodeUTF8(const Py_UNICODE *s,
5432 Py_ssize_t size,
5433 const char *errors)
5434{
5435 PyObject *v, *unicode;
5436
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02005437 unicode = PyUnicode_FromWideChar(s, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005438 if (unicode == NULL)
5439 return NULL;
5440 v = _PyUnicode_AsUTF8String(unicode, errors);
5441 Py_DECREF(unicode);
5442 return v;
5443}
5444
5445PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005446PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005447{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005448 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005449}
5450
Walter Dörwald41980ca2007-08-16 21:55:45 +00005451/* --- UTF-32 Codec ------------------------------------------------------- */
5452
5453PyObject *
5454PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005455 Py_ssize_t size,
5456 const char *errors,
5457 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005458{
5459 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
5460}
5461
5462PyObject *
5463PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005464 Py_ssize_t size,
5465 const char *errors,
5466 int *byteorder,
5467 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005468{
5469 const char *starts = s;
5470 Py_ssize_t startinpos;
5471 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005472 _PyUnicodeWriter writer;
Mark Dickinson7db923c2010-06-12 09:10:14 +00005473 const unsigned char *q, *e;
Victor Stinnere64322e2012-10-30 23:12:47 +01005474 int le, bo = 0; /* assume native ordering by default */
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005475 const char *encoding;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005476 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00005477 PyObject *errorHandler = NULL;
5478 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00005479
Walter Dörwald41980ca2007-08-16 21:55:45 +00005480 q = (unsigned char *)s;
5481 e = q + size;
5482
5483 if (byteorder)
5484 bo = *byteorder;
5485
5486 /* Check for BOM marks (U+FEFF) in the input and adjust current
5487 byte order setting accordingly. In native mode, the leading BOM
5488 mark is skipped, in all other modes, it is copied to the output
5489 stream as-is (giving a ZWNBSP character). */
Victor Stinnere64322e2012-10-30 23:12:47 +01005490 if (bo == 0 && size >= 4) {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005491 Py_UCS4 bom = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
Victor Stinnere64322e2012-10-30 23:12:47 +01005492 if (bom == 0x0000FEFF) {
5493 bo = -1;
5494 q += 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00005495 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005496 else if (bom == 0xFFFE0000) {
5497 bo = 1;
5498 q += 4;
5499 }
5500 if (byteorder)
5501 *byteorder = bo;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005502 }
5503
Victor Stinnere64322e2012-10-30 23:12:47 +01005504 if (q == e) {
5505 if (consumed)
5506 *consumed = size;
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02005507 _Py_RETURN_UNICODE_EMPTY();
Walter Dörwald41980ca2007-08-16 21:55:45 +00005508 }
5509
Victor Stinnere64322e2012-10-30 23:12:47 +01005510#ifdef WORDS_BIGENDIAN
5511 le = bo < 0;
5512#else
5513 le = bo <= 0;
5514#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005515 encoding = le ? "utf-32-le" : "utf-32-be";
Victor Stinnere64322e2012-10-30 23:12:47 +01005516
Victor Stinner8f674cc2013-04-17 23:02:17 +02005517 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02005518 writer.min_length = (e - q + 3) / 4;
5519 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005520 goto onError;
Victor Stinnere64322e2012-10-30 23:12:47 +01005521
Victor Stinnere64322e2012-10-30 23:12:47 +01005522 while (1) {
5523 Py_UCS4 ch = 0;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005524 Py_UCS4 maxch = PyUnicode_MAX_CHAR_VALUE(writer.buffer);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005525
Victor Stinnere64322e2012-10-30 23:12:47 +01005526 if (e - q >= 4) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005527 enum PyUnicode_Kind kind = writer.kind;
5528 void *data = writer.data;
Victor Stinnere64322e2012-10-30 23:12:47 +01005529 const unsigned char *last = e - 4;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005530 Py_ssize_t pos = writer.pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01005531 if (le) {
5532 do {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005533 ch = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
Victor Stinnere64322e2012-10-30 23:12:47 +01005534 if (ch > maxch)
5535 break;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005536 if (kind != PyUnicode_1BYTE_KIND &&
5537 Py_UNICODE_IS_SURROGATE(ch))
5538 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005539 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01005540 q += 4;
5541 } while (q <= last);
5542 }
5543 else {
5544 do {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005545 ch = ((unsigned int)q[0] << 24) | (q[1] << 16) | (q[2] << 8) | q[3];
Victor Stinnere64322e2012-10-30 23:12:47 +01005546 if (ch > maxch)
5547 break;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005548 if (kind != PyUnicode_1BYTE_KIND &&
5549 Py_UNICODE_IS_SURROGATE(ch))
5550 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005551 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01005552 q += 4;
5553 } while (q <= last);
5554 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005555 writer.pos = pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01005556 }
5557
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005558 if (Py_UNICODE_IS_SURROGATE(ch)) {
Serhiy Storchakad3faf432015-01-18 11:28:37 +02005559 errmsg = "code point in surrogate code point range(0xd800, 0xe000)";
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005560 startinpos = ((const char *)q) - starts;
5561 endinpos = startinpos + 4;
5562 }
5563 else if (ch <= maxch) {
Victor Stinnere64322e2012-10-30 23:12:47 +01005564 if (q == e || consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005565 break;
Victor Stinnere64322e2012-10-30 23:12:47 +01005566 /* remaining bytes at the end? (size should be divisible by 4) */
Benjamin Peterson29060642009-01-31 22:14:21 +00005567 errmsg = "truncated data";
Victor Stinnere64322e2012-10-30 23:12:47 +01005568 startinpos = ((const char *)q) - starts;
5569 endinpos = ((const char *)e) - starts;
Benjamin Peterson29060642009-01-31 22:14:21 +00005570 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005571 else {
5572 if (ch < 0x110000) {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005573 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinnere64322e2012-10-30 23:12:47 +01005574 goto onError;
5575 q += 4;
5576 continue;
5577 }
Serhiy Storchakad3faf432015-01-18 11:28:37 +02005578 errmsg = "code point not in range(0x110000)";
Victor Stinnere64322e2012-10-30 23:12:47 +01005579 startinpos = ((const char *)q) - starts;
5580 endinpos = startinpos + 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00005581 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005582
5583 /* The remaining input chars are ignored if the callback
5584 chooses to skip the input */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005585 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00005586 errors, &errorHandler,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005587 encoding, errmsg,
Benjamin Peterson29060642009-01-31 22:14:21 +00005588 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005589 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005590 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005591 }
5592
Walter Dörwald41980ca2007-08-16 21:55:45 +00005593 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005594 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005595
Walter Dörwald41980ca2007-08-16 21:55:45 +00005596 Py_XDECREF(errorHandler);
5597 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005598 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005599
Benjamin Peterson29060642009-01-31 22:14:21 +00005600 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005601 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005602 Py_XDECREF(errorHandler);
5603 Py_XDECREF(exc);
5604 return NULL;
5605}
5606
5607PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005608_PyUnicode_EncodeUTF32(PyObject *str,
5609 const char *errors,
5610 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005611{
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005612 enum PyUnicode_Kind kind;
5613 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005614 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005615 PyObject *v;
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005616 uint32_t *out;
Christian Heimes743e0cd2012-10-17 23:52:17 +02005617#if PY_LITTLE_ENDIAN
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005618 int native_ordering = byteorder <= 0;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005619#else
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005620 int native_ordering = byteorder >= 0;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005621#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005622 const char *encoding;
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005623 Py_ssize_t nsize, pos;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005624 PyObject *errorHandler = NULL;
5625 PyObject *exc = NULL;
5626 PyObject *rep = NULL;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005627
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005628 if (!PyUnicode_Check(str)) {
5629 PyErr_BadArgument();
5630 return NULL;
5631 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005632 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005633 return NULL;
5634 kind = PyUnicode_KIND(str);
5635 data = PyUnicode_DATA(str);
5636 len = PyUnicode_GET_LENGTH(str);
5637
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005638 if (len > PY_SSIZE_T_MAX / 4 - (byteorder == 0))
Serhiy Storchaka30793282014-01-04 22:44:01 +02005639 return PyErr_NoMemory();
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005640 nsize = len + (byteorder == 0);
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01005641 v = PyBytes_FromStringAndSize(NULL, nsize * 4);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005642 if (v == NULL)
5643 return NULL;
5644
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005645 /* output buffer is 4-bytes aligned */
5646 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 4));
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005647 out = (uint32_t *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005648 if (byteorder == 0)
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005649 *out++ = 0xFEFF;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005650 if (len == 0)
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005651 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005652
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005653 if (byteorder == -1)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005654 encoding = "utf-32-le";
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005655 else if (byteorder == 1)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005656 encoding = "utf-32-be";
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005657 else
5658 encoding = "utf-32";
5659
5660 if (kind == PyUnicode_1BYTE_KIND) {
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005661 ucs1lib_utf32_encode((const Py_UCS1 *)data, len, &out, native_ordering);
5662 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005663 }
5664
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005665 pos = 0;
5666 while (pos < len) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005667 Py_ssize_t repsize, moreunits;
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005668
5669 if (kind == PyUnicode_2BYTE_KIND) {
5670 pos += ucs2lib_utf32_encode((const Py_UCS2 *)data + pos, len - pos,
5671 &out, native_ordering);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005672 }
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005673 else {
5674 assert(kind == PyUnicode_4BYTE_KIND);
5675 pos += ucs4lib_utf32_encode((const Py_UCS4 *)data + pos, len - pos,
5676 &out, native_ordering);
5677 }
5678 if (pos == len)
5679 break;
Guido van Rossum98297ee2007-11-06 21:34:58 +00005680
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005681 rep = unicode_encode_call_errorhandler(
5682 errors, &errorHandler,
5683 encoding, "surrogates not allowed",
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005684 str, &exc, pos, pos + 1, &pos);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005685 if (!rep)
5686 goto error;
5687
5688 if (PyBytes_Check(rep)) {
5689 repsize = PyBytes_GET_SIZE(rep);
5690 if (repsize & 3) {
5691 raise_encode_exception(&exc, encoding,
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005692 str, pos - 1, pos,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005693 "surrogates not allowed");
5694 goto error;
5695 }
5696 moreunits = repsize / 4;
5697 }
5698 else {
5699 assert(PyUnicode_Check(rep));
5700 if (PyUnicode_READY(rep) < 0)
5701 goto error;
5702 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5703 if (!PyUnicode_IS_ASCII(rep)) {
5704 raise_encode_exception(&exc, encoding,
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005705 str, pos - 1, pos,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005706 "surrogates not allowed");
5707 goto error;
5708 }
5709 }
5710
5711 /* four bytes are reserved for each surrogate */
5712 if (moreunits > 1) {
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005713 Py_ssize_t outpos = out - (uint32_t*) PyBytes_AS_STRING(v);
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03005714 if (moreunits >= (PY_SSIZE_T_MAX - PyBytes_GET_SIZE(v)) / 4) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005715 /* integer overflow */
5716 PyErr_NoMemory();
5717 goto error;
5718 }
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03005719 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + 4 * (moreunits - 1)) < 0)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005720 goto error;
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005721 out = (uint32_t*) PyBytes_AS_STRING(v) + outpos;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005722 }
5723
5724 if (PyBytes_Check(rep)) {
Christian Heimesf051e432016-09-13 20:22:02 +02005725 memcpy(out, PyBytes_AS_STRING(rep), repsize);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005726 out += moreunits;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005727 } else /* rep is unicode */ {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005728 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005729 ucs1lib_utf32_encode(PyUnicode_1BYTE_DATA(rep), repsize,
5730 &out, native_ordering);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005731 }
5732
5733 Py_CLEAR(rep);
5734 }
5735
5736 /* Cut back to size actually needed. This is necessary for, for example,
5737 encoding of a string containing isolated surrogates and the 'ignore'
5738 handler is used. */
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005739 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005740 if (nsize != PyBytes_GET_SIZE(v))
5741 _PyBytes_Resize(&v, nsize);
5742 Py_XDECREF(errorHandler);
5743 Py_XDECREF(exc);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005744 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005745 return v;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005746 error:
5747 Py_XDECREF(rep);
5748 Py_XDECREF(errorHandler);
5749 Py_XDECREF(exc);
5750 Py_XDECREF(v);
5751 return NULL;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005752}
5753
Alexander Belopolsky40018472011-02-26 01:02:56 +00005754PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005755PyUnicode_EncodeUTF32(const Py_UNICODE *s,
5756 Py_ssize_t size,
5757 const char *errors,
5758 int byteorder)
5759{
5760 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02005761 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005762 if (tmp == NULL)
5763 return NULL;
5764 result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder);
5765 Py_DECREF(tmp);
5766 return result;
5767}
5768
5769PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005770PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005771{
Victor Stinnerb960b342011-11-20 19:12:52 +01005772 return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005773}
5774
Guido van Rossumd57fd912000-03-10 22:53:23 +00005775/* --- UTF-16 Codec ------------------------------------------------------- */
5776
Tim Peters772747b2001-08-09 22:21:55 +00005777PyObject *
5778PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005779 Py_ssize_t size,
5780 const char *errors,
5781 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005782{
Walter Dörwald69652032004-09-07 20:24:22 +00005783 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5784}
5785
5786PyObject *
5787PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005788 Py_ssize_t size,
5789 const char *errors,
5790 int *byteorder,
5791 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00005792{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005793 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005794 Py_ssize_t startinpos;
5795 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005796 _PyUnicodeWriter writer;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005797 const unsigned char *q, *e;
Tim Peters772747b2001-08-09 22:21:55 +00005798 int bo = 0; /* assume native ordering by default */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005799 int native_ordering;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00005800 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005801 PyObject *errorHandler = NULL;
5802 PyObject *exc = NULL;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005803 const char *encoding;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005804
Tim Peters772747b2001-08-09 22:21:55 +00005805 q = (unsigned char *)s;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005806 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005807
5808 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00005809 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005810
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005811 /* Check for BOM marks (U+FEFF) in the input and adjust current
5812 byte order setting accordingly. In native mode, the leading BOM
5813 mark is skipped, in all other modes, it is copied to the output
5814 stream as-is (giving a ZWNBSP character). */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005815 if (bo == 0 && size >= 2) {
5816 const Py_UCS4 bom = (q[1] << 8) | q[0];
5817 if (bom == 0xFEFF) {
5818 q += 2;
5819 bo = -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00005820 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005821 else if (bom == 0xFFFE) {
5822 q += 2;
5823 bo = 1;
5824 }
5825 if (byteorder)
5826 *byteorder = bo;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005827 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005828
Antoine Pitrou63065d72012-05-15 23:48:04 +02005829 if (q == e) {
5830 if (consumed)
5831 *consumed = size;
Serhiy Storchaka678db842013-01-26 12:16:36 +02005832 _Py_RETURN_UNICODE_EMPTY();
Tim Peters772747b2001-08-09 22:21:55 +00005833 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005834
Christian Heimes743e0cd2012-10-17 23:52:17 +02005835#if PY_LITTLE_ENDIAN
Antoine Pitrou63065d72012-05-15 23:48:04 +02005836 native_ordering = bo <= 0;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005837 encoding = bo <= 0 ? "utf-16-le" : "utf-16-be";
Antoine Pitrouab868312009-01-10 15:40:25 +00005838#else
Antoine Pitrou63065d72012-05-15 23:48:04 +02005839 native_ordering = bo >= 0;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005840 encoding = bo >= 0 ? "utf-16-be" : "utf-16-le";
Antoine Pitrouab868312009-01-10 15:40:25 +00005841#endif
Tim Peters772747b2001-08-09 22:21:55 +00005842
Antoine Pitrou63065d72012-05-15 23:48:04 +02005843 /* Note: size will always be longer than the resulting Unicode
Xiang Zhang2c7fd462018-01-31 20:48:05 +08005844 character count normally. Error handler will take care of
5845 resizing when needed. */
Victor Stinner8f674cc2013-04-17 23:02:17 +02005846 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02005847 writer.min_length = (e - q + 1) / 2;
5848 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005849 goto onError;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005850
Antoine Pitrou63065d72012-05-15 23:48:04 +02005851 while (1) {
5852 Py_UCS4 ch = 0;
5853 if (e - q >= 2) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005854 int kind = writer.kind;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005855 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005856 if (PyUnicode_IS_ASCII(writer.buffer))
Antoine Pitrou63065d72012-05-15 23:48:04 +02005857 ch = asciilib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005858 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005859 native_ordering);
5860 else
5861 ch = ucs1lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005862 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005863 native_ordering);
5864 } else if (kind == PyUnicode_2BYTE_KIND) {
5865 ch = ucs2lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005866 (Py_UCS2*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005867 native_ordering);
5868 } else {
5869 assert(kind == PyUnicode_4BYTE_KIND);
5870 ch = ucs4lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005871 (Py_UCS4*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005872 native_ordering);
Antoine Pitrouab868312009-01-10 15:40:25 +00005873 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005874 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005875
Antoine Pitrou63065d72012-05-15 23:48:04 +02005876 switch (ch)
5877 {
5878 case 0:
5879 /* remaining byte at the end? (size should be even) */
5880 if (q == e || consumed)
5881 goto End;
5882 errmsg = "truncated data";
5883 startinpos = ((const char *)q) - starts;
5884 endinpos = ((const char *)e) - starts;
5885 break;
5886 /* The remaining input chars are ignored if the callback
5887 chooses to skip the input */
5888 case 1:
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02005889 q -= 2;
5890 if (consumed)
Serhiy Storchakaae3b32a2013-01-08 23:40:52 +02005891 goto End;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005892 errmsg = "unexpected end of data";
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02005893 startinpos = ((const char *)q) - starts;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005894 endinpos = ((const char *)e) - starts;
5895 break;
5896 case 2:
5897 errmsg = "illegal encoding";
5898 startinpos = ((const char *)q) - 2 - starts;
5899 endinpos = startinpos + 2;
5900 break;
5901 case 3:
5902 errmsg = "illegal UTF-16 surrogate";
5903 startinpos = ((const char *)q) - 4 - starts;
5904 endinpos = startinpos + 2;
5905 break;
5906 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005907 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005908 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005909 continue;
5910 }
5911
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005912 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrouab868312009-01-10 15:40:25 +00005913 errors,
5914 &errorHandler,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005915 encoding, errmsg,
Antoine Pitrouab868312009-01-10 15:40:25 +00005916 &starts,
5917 (const char **)&e,
5918 &startinpos,
5919 &endinpos,
5920 &exc,
5921 (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005922 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005923 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005924 }
5925
Antoine Pitrou63065d72012-05-15 23:48:04 +02005926End:
Walter Dörwald69652032004-09-07 20:24:22 +00005927 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005928 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00005929
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005930 Py_XDECREF(errorHandler);
5931 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005932 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005933
Benjamin Peterson29060642009-01-31 22:14:21 +00005934 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005935 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005936 Py_XDECREF(errorHandler);
5937 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005938 return NULL;
5939}
5940
Tim Peters772747b2001-08-09 22:21:55 +00005941PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005942_PyUnicode_EncodeUTF16(PyObject *str,
5943 const char *errors,
5944 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005945{
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005946 enum PyUnicode_Kind kind;
5947 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005948 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005949 PyObject *v;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005950 unsigned short *out;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005951 Py_ssize_t pairs;
Christian Heimes743e0cd2012-10-17 23:52:17 +02005952#if PY_BIG_ENDIAN
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005953 int native_ordering = byteorder >= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005954#else
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005955 int native_ordering = byteorder <= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005956#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005957 const char *encoding;
5958 Py_ssize_t nsize, pos;
5959 PyObject *errorHandler = NULL;
5960 PyObject *exc = NULL;
5961 PyObject *rep = NULL;
Tim Peters772747b2001-08-09 22:21:55 +00005962
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005963 if (!PyUnicode_Check(str)) {
5964 PyErr_BadArgument();
5965 return NULL;
5966 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005967 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005968 return NULL;
5969 kind = PyUnicode_KIND(str);
5970 data = PyUnicode_DATA(str);
5971 len = PyUnicode_GET_LENGTH(str);
Victor Stinner0e368262011-11-10 20:12:49 +01005972
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005973 pairs = 0;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005974 if (kind == PyUnicode_4BYTE_KIND) {
5975 const Py_UCS4 *in = (const Py_UCS4 *)data;
5976 const Py_UCS4 *end = in + len;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005977 while (in < end) {
5978 if (*in++ >= 0x10000) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005979 pairs++;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005980 }
5981 }
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005982 }
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005983 if (len > PY_SSIZE_T_MAX / 2 - pairs - (byteorder == 0)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005984 return PyErr_NoMemory();
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005985 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005986 nsize = len + pairs + (byteorder == 0);
5987 v = PyBytes_FromStringAndSize(NULL, nsize * 2);
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005988 if (v == NULL) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005989 return NULL;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005990 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005991
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005992 /* output buffer is 2-bytes aligned */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02005993 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 2));
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005994 out = (unsigned short *)PyBytes_AS_STRING(v);
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005995 if (byteorder == 0) {
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005996 *out++ = 0xFEFF;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005997 }
5998 if (len == 0) {
Guido van Rossum98297ee2007-11-06 21:34:58 +00005999 goto done;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006000 }
Tim Peters772747b2001-08-09 22:21:55 +00006001
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006002 if (kind == PyUnicode_1BYTE_KIND) {
6003 ucs1lib_utf16_encode((const Py_UCS1 *)data, len, &out, native_ordering);
6004 goto done;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00006005 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006006
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006007 if (byteorder < 0) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006008 encoding = "utf-16-le";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006009 }
6010 else if (byteorder > 0) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006011 encoding = "utf-16-be";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006012 }
6013 else {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006014 encoding = "utf-16";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006015 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006016
6017 pos = 0;
6018 while (pos < len) {
6019 Py_ssize_t repsize, moreunits;
6020
6021 if (kind == PyUnicode_2BYTE_KIND) {
6022 pos += ucs2lib_utf16_encode((const Py_UCS2 *)data + pos, len - pos,
6023 &out, native_ordering);
6024 }
6025 else {
6026 assert(kind == PyUnicode_4BYTE_KIND);
6027 pos += ucs4lib_utf16_encode((const Py_UCS4 *)data + pos, len - pos,
6028 &out, native_ordering);
6029 }
6030 if (pos == len)
6031 break;
6032
6033 rep = unicode_encode_call_errorhandler(
6034 errors, &errorHandler,
6035 encoding, "surrogates not allowed",
6036 str, &exc, pos, pos + 1, &pos);
6037 if (!rep)
6038 goto error;
6039
6040 if (PyBytes_Check(rep)) {
6041 repsize = PyBytes_GET_SIZE(rep);
6042 if (repsize & 1) {
6043 raise_encode_exception(&exc, encoding,
6044 str, pos - 1, pos,
6045 "surrogates not allowed");
6046 goto error;
6047 }
6048 moreunits = repsize / 2;
6049 }
6050 else {
6051 assert(PyUnicode_Check(rep));
6052 if (PyUnicode_READY(rep) < 0)
6053 goto error;
6054 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
6055 if (!PyUnicode_IS_ASCII(rep)) {
6056 raise_encode_exception(&exc, encoding,
6057 str, pos - 1, pos,
6058 "surrogates not allowed");
6059 goto error;
6060 }
6061 }
6062
6063 /* two bytes are reserved for each surrogate */
6064 if (moreunits > 1) {
6065 Py_ssize_t outpos = out - (unsigned short*) PyBytes_AS_STRING(v);
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03006066 if (moreunits >= (PY_SSIZE_T_MAX - PyBytes_GET_SIZE(v)) / 2) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006067 /* integer overflow */
6068 PyErr_NoMemory();
6069 goto error;
6070 }
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03006071 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + 2 * (moreunits - 1)) < 0)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006072 goto error;
6073 out = (unsigned short*) PyBytes_AS_STRING(v) + outpos;
6074 }
6075
6076 if (PyBytes_Check(rep)) {
Christian Heimesf051e432016-09-13 20:22:02 +02006077 memcpy(out, PyBytes_AS_STRING(rep), repsize);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006078 out += moreunits;
6079 } else /* rep is unicode */ {
6080 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
6081 ucs1lib_utf16_encode(PyUnicode_1BYTE_DATA(rep), repsize,
6082 &out, native_ordering);
6083 }
6084
6085 Py_CLEAR(rep);
6086 }
6087
6088 /* Cut back to size actually needed. This is necessary for, for example,
6089 encoding of a string containing isolated surrogates and the 'ignore' handler
6090 is used. */
6091 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
6092 if (nsize != PyBytes_GET_SIZE(v))
6093 _PyBytes_Resize(&v, nsize);
6094 Py_XDECREF(errorHandler);
6095 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00006096 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006097 return v;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006098 error:
6099 Py_XDECREF(rep);
6100 Py_XDECREF(errorHandler);
6101 Py_XDECREF(exc);
6102 Py_XDECREF(v);
6103 return NULL;
6104#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00006105}
6106
Alexander Belopolsky40018472011-02-26 01:02:56 +00006107PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006108PyUnicode_EncodeUTF16(const Py_UNICODE *s,
6109 Py_ssize_t size,
6110 const char *errors,
6111 int byteorder)
6112{
6113 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02006114 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006115 if (tmp == NULL)
6116 return NULL;
6117 result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder);
6118 Py_DECREF(tmp);
6119 return result;
6120}
6121
6122PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00006123PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006124{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006125 return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006126}
6127
6128/* --- Unicode Escape Codec ----------------------------------------------- */
6129
Fredrik Lundh06d12682001-01-24 07:59:11 +00006130static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00006131
Alexander Belopolsky40018472011-02-26 01:02:56 +00006132PyObject *
Eric V. Smith42454af2016-10-31 09:22:08 -04006133_PyUnicode_DecodeUnicodeEscape(const char *s,
6134 Py_ssize_t size,
6135 const char *errors,
6136 const char **first_invalid_escape)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006137{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006138 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006139 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006140 const char *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006141 PyObject *errorHandler = NULL;
6142 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006143
Eric V. Smith42454af2016-10-31 09:22:08 -04006144 // so we can remember if we've seen an invalid escape char or not
6145 *first_invalid_escape = NULL;
6146
Victor Stinner62ec3312016-09-06 17:04:34 -07006147 if (size == 0) {
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02006148 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner62ec3312016-09-06 17:04:34 -07006149 }
6150 /* Escaped strings will always be longer than the resulting
6151 Unicode string, so we start with size here and then reduce the
6152 length after conversion to the true value.
6153 (but if the error callback returns a long replacement string
6154 we'll have to allocate more space) */
Victor Stinner8f674cc2013-04-17 23:02:17 +02006155 _PyUnicodeWriter_Init(&writer);
Victor Stinner62ec3312016-09-06 17:04:34 -07006156 writer.min_length = size;
6157 if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
6158 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006159 }
6160
Guido van Rossumd57fd912000-03-10 22:53:23 +00006161 end = s + size;
6162 while (s < end) {
Victor Stinner62ec3312016-09-06 17:04:34 -07006163 unsigned char c = (unsigned char) *s++;
6164 Py_UCS4 ch;
6165 int count;
6166 Py_ssize_t startinpos;
6167 Py_ssize_t endinpos;
6168 const char *message;
6169
6170#define WRITE_ASCII_CHAR(ch) \
6171 do { \
6172 assert(ch <= 127); \
6173 assert(writer.pos < writer.size); \
6174 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6175 } while(0)
6176
6177#define WRITE_CHAR(ch) \
6178 do { \
6179 if (ch <= writer.maxchar) { \
6180 assert(writer.pos < writer.size); \
6181 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6182 } \
6183 else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
6184 goto onError; \
6185 } \
6186 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006187
6188 /* Non-escape characters are interpreted as Unicode ordinals */
Victor Stinner62ec3312016-09-06 17:04:34 -07006189 if (c != '\\') {
6190 WRITE_CHAR(c);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006191 continue;
6192 }
6193
Victor Stinner62ec3312016-09-06 17:04:34 -07006194 startinpos = s - starts - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006195 /* \ - Escapes */
Victor Stinner62ec3312016-09-06 17:04:34 -07006196 if (s >= end) {
6197 message = "\\ at end of string";
6198 goto error;
6199 }
6200 c = (unsigned char) *s++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006201
Victor Stinner62ec3312016-09-06 17:04:34 -07006202 assert(writer.pos < writer.size);
Guido van Rossum8ce8a782007-11-01 19:42:39 +00006203 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006204
Benjamin Peterson29060642009-01-31 22:14:21 +00006205 /* \x escapes */
Victor Stinner62ec3312016-09-06 17:04:34 -07006206 case '\n': continue;
6207 case '\\': WRITE_ASCII_CHAR('\\'); continue;
6208 case '\'': WRITE_ASCII_CHAR('\''); continue;
6209 case '\"': WRITE_ASCII_CHAR('\"'); continue;
6210 case 'b': WRITE_ASCII_CHAR('\b'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006211 /* FF */
Victor Stinner62ec3312016-09-06 17:04:34 -07006212 case 'f': WRITE_ASCII_CHAR('\014'); continue;
6213 case 't': WRITE_ASCII_CHAR('\t'); continue;
6214 case 'n': WRITE_ASCII_CHAR('\n'); continue;
6215 case 'r': WRITE_ASCII_CHAR('\r'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006216 /* VT */
Victor Stinner62ec3312016-09-06 17:04:34 -07006217 case 'v': WRITE_ASCII_CHAR('\013'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006218 /* BEL, not classic C */
Victor Stinner62ec3312016-09-06 17:04:34 -07006219 case 'a': WRITE_ASCII_CHAR('\007'); continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006220
Benjamin Peterson29060642009-01-31 22:14:21 +00006221 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006222 case '0': case '1': case '2': case '3':
6223 case '4': case '5': case '6': case '7':
Victor Stinner62ec3312016-09-06 17:04:34 -07006224 ch = c - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00006225 if (s < end && '0' <= *s && *s <= '7') {
Victor Stinner62ec3312016-09-06 17:04:34 -07006226 ch = (ch<<3) + *s++ - '0';
6227 if (s < end && '0' <= *s && *s <= '7') {
6228 ch = (ch<<3) + *s++ - '0';
6229 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006230 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006231 WRITE_CHAR(ch);
6232 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006233
Benjamin Peterson29060642009-01-31 22:14:21 +00006234 /* hex escapes */
6235 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006236 case 'x':
Victor Stinner62ec3312016-09-06 17:04:34 -07006237 count = 2;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006238 message = "truncated \\xXX escape";
6239 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006240
Benjamin Peterson29060642009-01-31 22:14:21 +00006241 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006242 case 'u':
Victor Stinner62ec3312016-09-06 17:04:34 -07006243 count = 4;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006244 message = "truncated \\uXXXX escape";
6245 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006246
Benjamin Peterson29060642009-01-31 22:14:21 +00006247 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00006248 case 'U':
Victor Stinner62ec3312016-09-06 17:04:34 -07006249 count = 8;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006250 message = "truncated \\UXXXXXXXX escape";
6251 hexescape:
Victor Stinner62ec3312016-09-06 17:04:34 -07006252 for (ch = 0; count && s < end; ++s, --count) {
Serhiy Storchakad6793772013-01-29 10:20:44 +02006253 c = (unsigned char)*s;
Victor Stinner62ec3312016-09-06 17:04:34 -07006254 ch <<= 4;
6255 if (c >= '0' && c <= '9') {
6256 ch += c - '0';
6257 }
6258 else if (c >= 'a' && c <= 'f') {
6259 ch += c - ('a' - 10);
6260 }
6261 else if (c >= 'A' && c <= 'F') {
6262 ch += c - ('A' - 10);
6263 }
6264 else {
6265 break;
6266 }
Fredrik Lundhdf846752000-09-03 11:29:49 +00006267 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006268 if (count) {
Serhiy Storchakad6793772013-01-29 10:20:44 +02006269 goto error;
Victor Stinner62ec3312016-09-06 17:04:34 -07006270 }
6271
6272 /* when we get here, ch is a 32-bit unicode character */
6273 if (ch > MAX_UNICODE) {
6274 message = "illegal Unicode character";
6275 goto error;
6276 }
6277
6278 WRITE_CHAR(ch);
6279 continue;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006280
Benjamin Peterson29060642009-01-31 22:14:21 +00006281 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00006282 case 'N':
Fredrik Lundhccc74732001-02-18 22:13:49 +00006283 if (ucnhash_CAPI == NULL) {
6284 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006285 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
6286 PyUnicodeData_CAPSULE_NAME, 1);
Victor Stinner62ec3312016-09-06 17:04:34 -07006287 if (ucnhash_CAPI == NULL) {
6288 PyErr_SetString(
6289 PyExc_UnicodeError,
6290 "\\N escapes not supported (can't load unicodedata module)"
6291 );
6292 goto onError;
6293 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00006294 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006295
6296 message = "malformed \\N character escape";
Gregory P. Smith746b2d32018-11-13 13:16:54 -08006297 if (s < end && *s == '{') {
Victor Stinner62ec3312016-09-06 17:04:34 -07006298 const char *start = ++s;
6299 size_t namelen;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006300 /* look for the closing brace */
Victor Stinner62ec3312016-09-06 17:04:34 -07006301 while (s < end && *s != '}')
Fredrik Lundhccc74732001-02-18 22:13:49 +00006302 s++;
Victor Stinner62ec3312016-09-06 17:04:34 -07006303 namelen = s - start;
6304 if (namelen && s < end) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00006305 /* found a name. look it up in the unicode database */
Fredrik Lundhccc74732001-02-18 22:13:49 +00006306 s++;
Victor Stinner62ec3312016-09-06 17:04:34 -07006307 ch = 0xffffffff; /* in case 'getcode' messes up */
6308 if (namelen <= INT_MAX &&
6309 ucnhash_CAPI->getcode(NULL, start, (int)namelen,
6310 &ch, 0)) {
6311 assert(ch <= MAX_UNICODE);
6312 WRITE_CHAR(ch);
6313 continue;
6314 }
6315 message = "unknown Unicode character name";
Fredrik Lundhccc74732001-02-18 22:13:49 +00006316 }
6317 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02006318 goto error;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006319
6320 default:
Eric V. Smith42454af2016-10-31 09:22:08 -04006321 if (*first_invalid_escape == NULL) {
6322 *first_invalid_escape = s-1; /* Back up one char, since we've
6323 already incremented s. */
6324 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006325 WRITE_ASCII_CHAR('\\');
6326 WRITE_CHAR(c);
6327 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006328 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02006329
6330 error:
6331 endinpos = s-starts;
Victor Stinner62ec3312016-09-06 17:04:34 -07006332 writer.min_length = end - s + writer.pos;
Serhiy Storchaka8fe5a9f2013-01-29 10:37:39 +02006333 if (unicode_decode_call_errorhandler_writer(
Serhiy Storchakad6793772013-01-29 10:20:44 +02006334 errors, &errorHandler,
6335 "unicodeescape", message,
6336 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinner62ec3312016-09-06 17:04:34 -07006337 &writer)) {
Serhiy Storchakad6793772013-01-29 10:20:44 +02006338 goto onError;
Victor Stinner62ec3312016-09-06 17:04:34 -07006339 }
Serhiy Storchakab7e2d672018-02-13 08:27:33 +02006340 assert(end - s <= writer.size - writer.pos);
Victor Stinner62ec3312016-09-06 17:04:34 -07006341
6342#undef WRITE_ASCII_CHAR
6343#undef WRITE_CHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00006344 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006345
Walter Dörwaldd4ade082003-08-15 15:00:26 +00006346 Py_XDECREF(errorHandler);
6347 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006348 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald8c077222002-03-25 11:16:18 +00006349
Benjamin Peterson29060642009-01-31 22:14:21 +00006350 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006351 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006352 Py_XDECREF(errorHandler);
6353 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006354 return NULL;
6355}
6356
Eric V. Smith42454af2016-10-31 09:22:08 -04006357PyObject *
6358PyUnicode_DecodeUnicodeEscape(const char *s,
6359 Py_ssize_t size,
6360 const char *errors)
6361{
6362 const char *first_invalid_escape;
6363 PyObject *result = _PyUnicode_DecodeUnicodeEscape(s, size, errors,
6364 &first_invalid_escape);
6365 if (result == NULL)
6366 return NULL;
6367 if (first_invalid_escape != NULL) {
6368 if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
6369 "invalid escape sequence '\\%c'",
Serhiy Storchaka56cb4652017-10-20 17:08:15 +03006370 (unsigned char)*first_invalid_escape) < 0) {
Eric V. Smith42454af2016-10-31 09:22:08 -04006371 Py_DECREF(result);
6372 return NULL;
6373 }
6374 }
6375 return result;
6376}
6377
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006378/* Return a Unicode-Escape string version of the Unicode object. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006379
Alexander Belopolsky40018472011-02-26 01:02:56 +00006380PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006381PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006382{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006383 Py_ssize_t i, len;
Victor Stinner62ec3312016-09-06 17:04:34 -07006384 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006385 char *p;
Victor Stinner62ec3312016-09-06 17:04:34 -07006386 enum PyUnicode_Kind kind;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006387 void *data;
Victor Stinner62ec3312016-09-06 17:04:34 -07006388 Py_ssize_t expandsize;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006389
Ezio Melottie7f90372012-10-05 03:33:31 +03006390 /* Initial allocation is based on the longest-possible character
Thomas Wouters89f507f2006-12-13 04:49:30 +00006391 escape.
6392
Ezio Melottie7f90372012-10-05 03:33:31 +03006393 For UCS1 strings it's '\xxx', 4 bytes per source character.
6394 For UCS2 strings it's '\uxxxx', 6 bytes per source character.
6395 For UCS4 strings it's '\U00xxxxxx', 10 bytes per source character.
Thomas Wouters89f507f2006-12-13 04:49:30 +00006396 */
6397
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006398 if (!PyUnicode_Check(unicode)) {
6399 PyErr_BadArgument();
6400 return NULL;
6401 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006402 if (PyUnicode_READY(unicode) == -1) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006403 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006404 }
Victor Stinner358af132015-10-12 22:36:57 +02006405
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006406 len = PyUnicode_GET_LENGTH(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006407 if (len == 0) {
6408 return PyBytes_FromStringAndSize(NULL, 0);
6409 }
6410
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006411 kind = PyUnicode_KIND(unicode);
6412 data = PyUnicode_DATA(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006413 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6414 bytes, and 1 byte characters 4. */
6415 expandsize = kind * 2 + 2;
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006416 if (len > PY_SSIZE_T_MAX / expandsize) {
Victor Stinner62ec3312016-09-06 17:04:34 -07006417 return PyErr_NoMemory();
6418 }
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006419 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
Victor Stinner62ec3312016-09-06 17:04:34 -07006420 if (repr == NULL) {
6421 return NULL;
6422 }
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006423
Victor Stinner62ec3312016-09-06 17:04:34 -07006424 p = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006425 for (i = 0; i < len; i++) {
Victor Stinner3326cb62011-11-10 20:15:25 +01006426 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006427
Victor Stinner62ec3312016-09-06 17:04:34 -07006428 /* U+0000-U+00ff range */
6429 if (ch < 0x100) {
6430 if (ch >= ' ' && ch < 127) {
6431 if (ch != '\\') {
6432 /* Copy printable US ASCII as-is */
6433 *p++ = (char) ch;
6434 }
6435 /* Escape backslashes */
6436 else {
6437 *p++ = '\\';
6438 *p++ = '\\';
6439 }
6440 }
Victor Stinner358af132015-10-12 22:36:57 +02006441
Victor Stinner62ec3312016-09-06 17:04:34 -07006442 /* Map special whitespace to '\t', \n', '\r' */
6443 else if (ch == '\t') {
6444 *p++ = '\\';
6445 *p++ = 't';
6446 }
6447 else if (ch == '\n') {
6448 *p++ = '\\';
6449 *p++ = 'n';
6450 }
6451 else if (ch == '\r') {
6452 *p++ = '\\';
6453 *p++ = 'r';
6454 }
6455
6456 /* Map non-printable US ASCII and 8-bit characters to '\xHH' */
6457 else {
6458 *p++ = '\\';
6459 *p++ = 'x';
6460 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6461 *p++ = Py_hexdigits[ch & 0x000F];
6462 }
Tim Petersced69f82003-09-16 20:30:58 +00006463 }
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006464 /* U+0100-U+ffff range: Map 16-bit characters to '\uHHHH' */
Victor Stinner62ec3312016-09-06 17:04:34 -07006465 else if (ch < 0x10000) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006466 *p++ = '\\';
6467 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006468 *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
6469 *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
6470 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6471 *p++ = Py_hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006472 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006473 /* U+010000-U+10ffff range: Map 21-bit characters to '\U00HHHHHH' */
6474 else {
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006475
Victor Stinner62ec3312016-09-06 17:04:34 -07006476 /* Make sure that the first two digits are zero */
6477 assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006478 *p++ = '\\';
Victor Stinner62ec3312016-09-06 17:04:34 -07006479 *p++ = 'U';
6480 *p++ = '0';
6481 *p++ = '0';
6482 *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
6483 *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
6484 *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
6485 *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
6486 *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
6487 *p++ = Py_hexdigits[ch & 0x0000000F];
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006488 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006489 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006490
Victor Stinner62ec3312016-09-06 17:04:34 -07006491 assert(p - PyBytes_AS_STRING(repr) > 0);
6492 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) {
6493 return NULL;
6494 }
6495 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006496}
6497
Alexander Belopolsky40018472011-02-26 01:02:56 +00006498PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006499PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
6500 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006501{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006502 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02006503 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Victor Stinner62ec3312016-09-06 17:04:34 -07006504 if (tmp == NULL) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006505 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006506 }
6507
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006508 result = PyUnicode_AsUnicodeEscapeString(tmp);
6509 Py_DECREF(tmp);
6510 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006511}
6512
6513/* --- Raw Unicode Escape Codec ------------------------------------------- */
6514
Alexander Belopolsky40018472011-02-26 01:02:56 +00006515PyObject *
6516PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006517 Py_ssize_t size,
6518 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006519{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006520 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006521 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006522 const char *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006523 PyObject *errorHandler = NULL;
6524 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006525
Victor Stinner62ec3312016-09-06 17:04:34 -07006526 if (size == 0) {
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02006527 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner62ec3312016-09-06 17:04:34 -07006528 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006529
Guido van Rossumd57fd912000-03-10 22:53:23 +00006530 /* Escaped strings will always be longer than the resulting
6531 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006532 length after conversion to the true value. (But decoding error
6533 handler might have to resize the string) */
Victor Stinner8f674cc2013-04-17 23:02:17 +02006534 _PyUnicodeWriter_Init(&writer);
Inada Naoki770847a2019-06-24 12:30:24 +09006535 writer.min_length = size;
Victor Stinner62ec3312016-09-06 17:04:34 -07006536 if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
6537 goto onError;
6538 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006539
Guido van Rossumd57fd912000-03-10 22:53:23 +00006540 end = s + size;
6541 while (s < end) {
Victor Stinner62ec3312016-09-06 17:04:34 -07006542 unsigned char c = (unsigned char) *s++;
6543 Py_UCS4 ch;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006544 int count;
Victor Stinner62ec3312016-09-06 17:04:34 -07006545 Py_ssize_t startinpos;
6546 Py_ssize_t endinpos;
6547 const char *message;
6548
6549#define WRITE_CHAR(ch) \
6550 do { \
6551 if (ch <= writer.maxchar) { \
6552 assert(writer.pos < writer.size); \
6553 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6554 } \
6555 else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
6556 goto onError; \
6557 } \
6558 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006559
Benjamin Peterson29060642009-01-31 22:14:21 +00006560 /* Non-escape characters are interpreted as Unicode ordinals */
Victor Stinner62ec3312016-09-06 17:04:34 -07006561 if (c != '\\' || s >= end) {
6562 WRITE_CHAR(c);
Benjamin Peterson29060642009-01-31 22:14:21 +00006563 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006564 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006565
Victor Stinner62ec3312016-09-06 17:04:34 -07006566 c = (unsigned char) *s++;
6567 if (c == 'u') {
6568 count = 4;
6569 message = "truncated \\uXXXX escape";
Benjamin Peterson29060642009-01-31 22:14:21 +00006570 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006571 else if (c == 'U') {
6572 count = 8;
6573 message = "truncated \\UXXXXXXXX escape";
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006574 }
6575 else {
Victor Stinner62ec3312016-09-06 17:04:34 -07006576 assert(writer.pos < writer.size);
6577 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, '\\');
6578 WRITE_CHAR(c);
6579 continue;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006580 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006581 startinpos = s - starts - 2;
6582
6583 /* \uHHHH with 4 hex digits, \U00HHHHHH with 8 */
6584 for (ch = 0; count && s < end; ++s, --count) {
6585 c = (unsigned char)*s;
6586 ch <<= 4;
6587 if (c >= '0' && c <= '9') {
6588 ch += c - '0';
6589 }
6590 else if (c >= 'a' && c <= 'f') {
6591 ch += c - ('a' - 10);
6592 }
6593 else if (c >= 'A' && c <= 'F') {
6594 ch += c - ('A' - 10);
6595 }
6596 else {
6597 break;
6598 }
6599 }
6600 if (!count) {
6601 if (ch <= MAX_UNICODE) {
6602 WRITE_CHAR(ch);
6603 continue;
6604 }
6605 message = "\\Uxxxxxxxx out of range";
6606 }
6607
6608 endinpos = s-starts;
6609 writer.min_length = end - s + writer.pos;
6610 if (unicode_decode_call_errorhandler_writer(
6611 errors, &errorHandler,
6612 "rawunicodeescape", message,
6613 &starts, &end, &startinpos, &endinpos, &exc, &s,
6614 &writer)) {
6615 goto onError;
6616 }
Serhiy Storchakab7e2d672018-02-13 08:27:33 +02006617 assert(end - s <= writer.size - writer.pos);
Victor Stinner62ec3312016-09-06 17:04:34 -07006618
6619#undef WRITE_CHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00006620 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006621 Py_XDECREF(errorHandler);
6622 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006623 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00006624
Benjamin Peterson29060642009-01-31 22:14:21 +00006625 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006626 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006627 Py_XDECREF(errorHandler);
6628 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006629 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006630
Guido van Rossumd57fd912000-03-10 22:53:23 +00006631}
6632
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006633
Alexander Belopolsky40018472011-02-26 01:02:56 +00006634PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006635PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006636{
Victor Stinner62ec3312016-09-06 17:04:34 -07006637 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006638 char *p;
Victor Stinner62ec3312016-09-06 17:04:34 -07006639 Py_ssize_t expandsize, pos;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006640 int kind;
6641 void *data;
6642 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006643
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006644 if (!PyUnicode_Check(unicode)) {
6645 PyErr_BadArgument();
6646 return NULL;
6647 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006648 if (PyUnicode_READY(unicode) == -1) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006649 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006650 }
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006651 kind = PyUnicode_KIND(unicode);
6652 data = PyUnicode_DATA(unicode);
6653 len = PyUnicode_GET_LENGTH(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006654 if (kind == PyUnicode_1BYTE_KIND) {
6655 return PyBytes_FromStringAndSize(data, len);
6656 }
Victor Stinner0e368262011-11-10 20:12:49 +01006657
Victor Stinner62ec3312016-09-06 17:04:34 -07006658 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6659 bytes, and 1 byte characters 4. */
6660 expandsize = kind * 2 + 2;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006661
Victor Stinner62ec3312016-09-06 17:04:34 -07006662 if (len > PY_SSIZE_T_MAX / expandsize) {
6663 return PyErr_NoMemory();
6664 }
6665 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
6666 if (repr == NULL) {
6667 return NULL;
6668 }
6669 if (len == 0) {
6670 return repr;
6671 }
6672
6673 p = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006674 for (pos = 0; pos < len; pos++) {
6675 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Victor Stinner358af132015-10-12 22:36:57 +02006676
Victor Stinner62ec3312016-09-06 17:04:34 -07006677 /* U+0000-U+00ff range: Copy 8-bit characters as-is */
6678 if (ch < 0x100) {
6679 *p++ = (char) ch;
Tim Petersced69f82003-09-16 20:30:58 +00006680 }
Xiang Zhang2b77a922018-02-13 18:33:32 +08006681 /* U+0100-U+ffff range: Map 16-bit characters to '\uHHHH' */
Victor Stinner62ec3312016-09-06 17:04:34 -07006682 else if (ch < 0x10000) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006683 *p++ = '\\';
6684 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006685 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6686 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6687 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6688 *p++ = Py_hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006689 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006690 /* U+010000-U+10ffff range: Map 32-bit characters to '\U00HHHHHH' */
6691 else {
6692 assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
6693 *p++ = '\\';
6694 *p++ = 'U';
6695 *p++ = '0';
6696 *p++ = '0';
6697 *p++ = Py_hexdigits[(ch >> 20) & 0xf];
6698 *p++ = Py_hexdigits[(ch >> 16) & 0xf];
6699 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6700 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6701 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6702 *p++ = Py_hexdigits[ch & 15];
6703 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006704 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006705
Victor Stinner62ec3312016-09-06 17:04:34 -07006706 assert(p > PyBytes_AS_STRING(repr));
6707 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) {
6708 return NULL;
6709 }
6710 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006711}
6712
Alexander Belopolsky40018472011-02-26 01:02:56 +00006713PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006714PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
6715 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006716{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006717 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02006718 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006719 if (tmp == NULL)
Walter Dörwald711005d2007-05-12 12:03:26 +00006720 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006721 result = PyUnicode_AsRawUnicodeEscapeString(tmp);
6722 Py_DECREF(tmp);
6723 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006724}
6725
6726/* --- Latin-1 Codec ------------------------------------------------------ */
6727
Alexander Belopolsky40018472011-02-26 01:02:56 +00006728PyObject *
6729PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006730 Py_ssize_t size,
6731 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006732{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006733 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Victor Stinnere57b1c02011-09-28 22:20:48 +02006734 return _PyUnicode_FromUCS1((unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006735}
6736
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006737/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006738static void
6739make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006740 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006741 PyObject *unicode,
6742 Py_ssize_t startpos, Py_ssize_t endpos,
6743 const char *reason)
6744{
6745 if (*exceptionObject == NULL) {
6746 *exceptionObject = PyObject_CallFunction(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006747 PyExc_UnicodeEncodeError, "sOnns",
Martin v. Löwis9e816682011-11-02 12:45:42 +01006748 encoding, unicode, startpos, endpos, reason);
6749 }
6750 else {
6751 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6752 goto onError;
6753 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6754 goto onError;
6755 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6756 goto onError;
6757 return;
6758 onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02006759 Py_CLEAR(*exceptionObject);
Martin v. Löwis9e816682011-11-02 12:45:42 +01006760 }
6761}
6762
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006763/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006764static void
6765raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006766 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006767 PyObject *unicode,
6768 Py_ssize_t startpos, Py_ssize_t endpos,
6769 const char *reason)
6770{
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006771 make_encode_exception(exceptionObject,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006772 encoding, unicode, startpos, endpos, reason);
6773 if (*exceptionObject != NULL)
6774 PyCodec_StrictErrors(*exceptionObject);
6775}
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006776
6777/* error handling callback helper:
6778 build arguments, call the callback and check the arguments,
6779 put the result into newpos and return the replacement string, which
6780 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006781static PyObject *
6782unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006783 PyObject **errorHandler,
6784 const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006785 PyObject *unicode, PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006786 Py_ssize_t startpos, Py_ssize_t endpos,
6787 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006788{
Serhiy Storchaka2d06e842015-12-25 19:53:18 +02006789 static const char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006790 Py_ssize_t len;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006791 PyObject *restuple;
6792 PyObject *resunicode;
6793
6794 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006795 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006796 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006797 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006798 }
6799
Benjamin Petersonbac79492012-01-14 13:34:47 -05006800 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006801 return NULL;
6802 len = PyUnicode_GET_LENGTH(unicode);
6803
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006804 make_encode_exception(exceptionObject,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006805 encoding, unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006806 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006807 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006808
Jeroen Demeyer196a5302019-07-04 12:31:34 +02006809 restuple = _PyObject_CallOneArg(*errorHandler, *exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006810 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006811 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006812 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006813 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00006814 Py_DECREF(restuple);
6815 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006816 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006817 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00006818 &resunicode, newpos)) {
6819 Py_DECREF(restuple);
6820 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006821 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006822 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6823 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6824 Py_DECREF(restuple);
6825 return NULL;
6826 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006827 if (*newpos<0)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006828 *newpos = len + *newpos;
6829 if (*newpos<0 || *newpos>len) {
Victor Stinnera33bce02014-07-04 22:47:46 +02006830 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00006831 Py_DECREF(restuple);
6832 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006833 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006834 Py_INCREF(resunicode);
6835 Py_DECREF(restuple);
6836 return resunicode;
6837}
6838
Alexander Belopolsky40018472011-02-26 01:02:56 +00006839static PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006840unicode_encode_ucs1(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006841 const char *errors,
Victor Stinner0030cd52015-09-24 14:45:00 +02006842 const Py_UCS4 limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006843{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006844 /* input state */
6845 Py_ssize_t pos=0, size;
6846 int kind;
6847 void *data;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006848 /* pointer into the output */
6849 char *str;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006850 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6851 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Victor Stinner50149202015-09-22 00:26:54 +02006852 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006853 PyObject *exc = NULL;
Victor Stinner50149202015-09-22 00:26:54 +02006854 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Victor Stinner6bd525b2015-10-09 13:10:05 +02006855 PyObject *rep = NULL;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006856 /* output object */
6857 _PyBytesWriter writer;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006858
Benjamin Petersonbac79492012-01-14 13:34:47 -05006859 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006860 return NULL;
6861 size = PyUnicode_GET_LENGTH(unicode);
6862 kind = PyUnicode_KIND(unicode);
6863 data = PyUnicode_DATA(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006864 /* allocate enough for a simple encoding without
6865 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00006866 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00006867 return PyBytes_FromStringAndSize(NULL, 0);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006868
6869 _PyBytesWriter_Init(&writer);
6870 str = _PyBytesWriter_Alloc(&writer, size);
6871 if (str == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00006872 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006873
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006874 while (pos < size) {
Victor Stinner0030cd52015-09-24 14:45:00 +02006875 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006876
Benjamin Peterson29060642009-01-31 22:14:21 +00006877 /* can we encode this? */
Victor Stinner0030cd52015-09-24 14:45:00 +02006878 if (ch < limit) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006879 /* no overflow check, because we know that the space is enough */
Victor Stinner0030cd52015-09-24 14:45:00 +02006880 *str++ = (char)ch;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006881 ++pos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006882 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006883 else {
Victor Stinner6bd525b2015-10-09 13:10:05 +02006884 Py_ssize_t newpos, i;
Benjamin Peterson29060642009-01-31 22:14:21 +00006885 /* startpos for collecting unencodable chars */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006886 Py_ssize_t collstart = pos;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006887 Py_ssize_t collend = collstart + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006888 /* find all unecodable characters */
Victor Stinner50149202015-09-22 00:26:54 +02006889
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006890 while ((collend < size) && (PyUnicode_READ(kind, data, collend) >= limit))
Benjamin Peterson29060642009-01-31 22:14:21 +00006891 ++collend;
Victor Stinner50149202015-09-22 00:26:54 +02006892
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006893 /* Only overallocate the buffer if it's not the last write */
6894 writer.overallocate = (collend < size);
6895
Benjamin Peterson29060642009-01-31 22:14:21 +00006896 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
Victor Stinner50149202015-09-22 00:26:54 +02006897 if (error_handler == _Py_ERROR_UNKNOWN)
Victor Stinner3d4226a2018-08-29 22:21:32 +02006898 error_handler = _Py_GetErrorHandler(errors);
Victor Stinner50149202015-09-22 00:26:54 +02006899
6900 switch (error_handler) {
6901 case _Py_ERROR_STRICT:
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006902 raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006903 goto onError;
Victor Stinner50149202015-09-22 00:26:54 +02006904
6905 case _Py_ERROR_REPLACE:
Victor Stinner01ada392015-10-01 21:54:51 +02006906 memset(str, '?', collend - collstart);
6907 str += (collend - collstart);
Stefan Krahf432a322017-08-21 13:09:59 +02006908 /* fall through */
Victor Stinner50149202015-09-22 00:26:54 +02006909 case _Py_ERROR_IGNORE:
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006910 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006911 break;
Victor Stinner50149202015-09-22 00:26:54 +02006912
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006913 case _Py_ERROR_BACKSLASHREPLACE:
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07006914 /* subtract preallocated bytes */
Victor Stinnerad771582015-10-09 12:38:53 +02006915 writer.min_size -= (collend - collstart);
6916 str = backslashreplace(&writer, str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006917 unicode, collstart, collend);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006918 if (str == NULL)
6919 goto onError;
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006920 pos = collend;
6921 break;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006922
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006923 case _Py_ERROR_XMLCHARREFREPLACE:
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07006924 /* subtract preallocated bytes */
Victor Stinnerad771582015-10-09 12:38:53 +02006925 writer.min_size -= (collend - collstart);
6926 str = xmlcharrefreplace(&writer, str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006927 unicode, collstart, collend);
6928 if (str == NULL)
6929 goto onError;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006930 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006931 break;
Victor Stinner50149202015-09-22 00:26:54 +02006932
Victor Stinnerc3713e92015-09-29 12:32:13 +02006933 case _Py_ERROR_SURROGATEESCAPE:
6934 for (i = collstart; i < collend; ++i) {
6935 ch = PyUnicode_READ(kind, data, i);
6936 if (ch < 0xdc80 || 0xdcff < ch) {
6937 /* Not a UTF-8b surrogate */
6938 break;
6939 }
6940 *str++ = (char)(ch - 0xdc00);
6941 ++pos;
6942 }
6943 if (i >= collend)
6944 break;
6945 collstart = pos;
6946 assert(collstart != collend);
Stefan Krahf432a322017-08-21 13:09:59 +02006947 /* fall through */
Victor Stinnerc3713e92015-09-29 12:32:13 +02006948
Benjamin Peterson29060642009-01-31 22:14:21 +00006949 default:
Victor Stinner6bd525b2015-10-09 13:10:05 +02006950 rep = unicode_encode_call_errorhandler(errors, &error_handler_obj,
6951 encoding, reason, unicode, &exc,
6952 collstart, collend, &newpos);
6953 if (rep == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006954 goto onError;
Victor Stinner0030cd52015-09-24 14:45:00 +02006955
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07006956 /* subtract preallocated bytes */
Xiang Zhangd04d8472016-11-23 19:34:01 +08006957 writer.min_size -= newpos - collstart;
Victor Stinnerad771582015-10-09 12:38:53 +02006958
Victor Stinner6bd525b2015-10-09 13:10:05 +02006959 if (PyBytes_Check(rep)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00006960 /* Directly copy bytes result to output. */
Victor Stinnerce179bf2015-10-09 12:57:22 +02006961 str = _PyBytesWriter_WriteBytes(&writer, str,
Victor Stinner6bd525b2015-10-09 13:10:05 +02006962 PyBytes_AS_STRING(rep),
6963 PyBytes_GET_SIZE(rep));
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006964 }
Victor Stinner6bd525b2015-10-09 13:10:05 +02006965 else {
6966 assert(PyUnicode_Check(rep));
Victor Stinner0030cd52015-09-24 14:45:00 +02006967
Victor Stinner6bd525b2015-10-09 13:10:05 +02006968 if (PyUnicode_READY(rep) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006969 goto onError;
Victor Stinner6bd525b2015-10-09 13:10:05 +02006970
Serhiy Storchaka99250d52016-11-23 15:13:00 +02006971 if (limit == 256 ?
6972 PyUnicode_KIND(rep) != PyUnicode_1BYTE_KIND :
6973 !PyUnicode_IS_ASCII(rep))
6974 {
6975 /* Not all characters are smaller than limit */
6976 raise_encode_exception(&exc, encoding, unicode,
6977 collstart, collend, reason);
6978 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00006979 }
Serhiy Storchaka99250d52016-11-23 15:13:00 +02006980 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
6981 str = _PyBytesWriter_WriteBytes(&writer, str,
6982 PyUnicode_DATA(rep),
6983 PyUnicode_GET_LENGTH(rep));
Benjamin Peterson29060642009-01-31 22:14:21 +00006984 }
Alexey Izbyshev74a307d2018-08-19 21:52:04 +03006985 if (str == NULL)
6986 goto onError;
6987
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006988 pos = newpos;
Victor Stinner6bd525b2015-10-09 13:10:05 +02006989 Py_CLEAR(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006990 }
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006991
6992 /* If overallocation was disabled, ensure that it was the last
6993 write. Otherwise, we missed an optimization */
6994 assert(writer.overallocate || pos == size);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006995 }
6996 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006997
Victor Stinner50149202015-09-22 00:26:54 +02006998 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006999 Py_XDECREF(exc);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02007000 return _PyBytesWriter_Finish(&writer, str);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00007001
7002 onError:
Victor Stinner6bd525b2015-10-09 13:10:05 +02007003 Py_XDECREF(rep);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02007004 _PyBytesWriter_Dealloc(&writer);
Victor Stinner50149202015-09-22 00:26:54 +02007005 Py_XDECREF(error_handler_obj);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00007006 Py_XDECREF(exc);
7007 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007008}
7009
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007010/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007011PyObject *
7012PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03007013 Py_ssize_t size,
7014 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007015{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007016 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02007017 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007018 if (unicode == NULL)
7019 return NULL;
7020 result = unicode_encode_ucs1(unicode, errors, 256);
7021 Py_DECREF(unicode);
7022 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007023}
7024
Alexander Belopolsky40018472011-02-26 01:02:56 +00007025PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007026_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007027{
7028 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007029 PyErr_BadArgument();
7030 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007031 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007032 if (PyUnicode_READY(unicode) == -1)
7033 return NULL;
7034 /* Fast path: if it is a one-byte string, construct
7035 bytes object directly. */
7036 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
7037 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
7038 PyUnicode_GET_LENGTH(unicode));
7039 /* Non-Latin-1 characters present. Defer to above function to
7040 raise the exception. */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007041 return unicode_encode_ucs1(unicode, errors, 256);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007042}
7043
7044PyObject*
7045PyUnicode_AsLatin1String(PyObject *unicode)
7046{
7047 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007048}
7049
7050/* --- 7-bit ASCII Codec -------------------------------------------------- */
7051
Alexander Belopolsky40018472011-02-26 01:02:56 +00007052PyObject *
7053PyUnicode_DecodeASCII(const char *s,
7054 Py_ssize_t size,
7055 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007056{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007057 const char *starts = s;
Inada Naoki770847a2019-06-24 12:30:24 +09007058 const char *e = s + size;
Victor Stinnerf96418d2015-09-21 23:06:27 +02007059 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007060 PyObject *exc = NULL;
Victor Stinnerf96418d2015-09-21 23:06:27 +02007061 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Tim Petersced69f82003-09-16 20:30:58 +00007062
Guido van Rossumd57fd912000-03-10 22:53:23 +00007063 if (size == 0)
Serhiy Storchaka678db842013-01-26 12:16:36 +02007064 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01007065
Guido van Rossumd57fd912000-03-10 22:53:23 +00007066 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Victor Stinner702c7342011-10-05 13:50:52 +02007067 if (size == 1 && (unsigned char)s[0] < 128)
7068 return get_latin1_char((unsigned char)s[0]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007069
Inada Naoki770847a2019-06-24 12:30:24 +09007070 // Shortcut for simple case
7071 PyObject *u = PyUnicode_New(size, 127);
7072 if (u == NULL) {
Victor Stinner8f674cc2013-04-17 23:02:17 +02007073 return NULL;
Inada Naoki770847a2019-06-24 12:30:24 +09007074 }
7075 Py_ssize_t outpos = ascii_decode(s, e, PyUnicode_DATA(u));
7076 if (outpos == size) {
7077 return u;
7078 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02007079
Inada Naoki770847a2019-06-24 12:30:24 +09007080 _PyUnicodeWriter writer;
7081 _PyUnicodeWriter_InitWithBuffer(&writer, u);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007082 writer.pos = outpos;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02007083
Inada Naoki770847a2019-06-24 12:30:24 +09007084 s += outpos;
7085 int kind = writer.kind;
7086 void *data = writer.data;
7087 Py_ssize_t startinpos, endinpos;
7088
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007089 while (s < e) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02007090 unsigned char c = (unsigned char)*s;
Benjamin Peterson29060642009-01-31 22:14:21 +00007091 if (c < 128) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007092 PyUnicode_WRITE(kind, data, writer.pos, c);
7093 writer.pos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00007094 ++s;
Victor Stinnerf96418d2015-09-21 23:06:27 +02007095 continue;
Benjamin Peterson29060642009-01-31 22:14:21 +00007096 }
Victor Stinnerf96418d2015-09-21 23:06:27 +02007097
7098 /* byte outsize range 0x00..0x7f: call the error handler */
7099
7100 if (error_handler == _Py_ERROR_UNKNOWN)
Victor Stinner3d4226a2018-08-29 22:21:32 +02007101 error_handler = _Py_GetErrorHandler(errors);
Victor Stinnerf96418d2015-09-21 23:06:27 +02007102
7103 switch (error_handler)
7104 {
7105 case _Py_ERROR_REPLACE:
7106 case _Py_ERROR_SURROGATEESCAPE:
7107 /* Fast-path: the error handler only writes one character,
Victor Stinnerca9381e2015-09-22 00:58:32 +02007108 but we may switch to UCS2 at the first write */
7109 if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
7110 goto onError;
7111 kind = writer.kind;
7112 data = writer.data;
Victor Stinnerf96418d2015-09-21 23:06:27 +02007113
7114 if (error_handler == _Py_ERROR_REPLACE)
7115 PyUnicode_WRITE(kind, data, writer.pos, 0xfffd);
7116 else
7117 PyUnicode_WRITE(kind, data, writer.pos, c + 0xdc00);
7118 writer.pos++;
7119 ++s;
7120 break;
7121
7122 case _Py_ERROR_IGNORE:
7123 ++s;
7124 break;
7125
7126 default:
Benjamin Peterson29060642009-01-31 22:14:21 +00007127 startinpos = s-starts;
7128 endinpos = startinpos + 1;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007129 if (unicode_decode_call_errorhandler_writer(
Victor Stinnerf96418d2015-09-21 23:06:27 +02007130 errors, &error_handler_obj,
Benjamin Peterson29060642009-01-31 22:14:21 +00007131 "ascii", "ordinal not in range(128)",
7132 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007133 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00007134 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007135 kind = writer.kind;
7136 data = writer.data;
Benjamin Peterson29060642009-01-31 22:14:21 +00007137 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007138 }
Victor Stinnerf96418d2015-09-21 23:06:27 +02007139 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007140 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007141 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00007142
Benjamin Peterson29060642009-01-31 22:14:21 +00007143 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007144 _PyUnicodeWriter_Dealloc(&writer);
Victor Stinnerf96418d2015-09-21 23:06:27 +02007145 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007146 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007147 return NULL;
7148}
7149
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007150/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007151PyObject *
7152PyUnicode_EncodeASCII(const Py_UNICODE *p,
7153 Py_ssize_t size,
7154 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007155{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007156 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02007157 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007158 if (unicode == NULL)
7159 return NULL;
7160 result = unicode_encode_ucs1(unicode, errors, 128);
7161 Py_DECREF(unicode);
7162 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007163}
7164
Alexander Belopolsky40018472011-02-26 01:02:56 +00007165PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007166_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007167{
7168 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007169 PyErr_BadArgument();
7170 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007171 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007172 if (PyUnicode_READY(unicode) == -1)
7173 return NULL;
7174 /* Fast path: if it is an ASCII-only string, construct bytes object
7175 directly. Else defer to above function to raise the exception. */
Victor Stinneraf037572013-04-14 18:44:10 +02007176 if (PyUnicode_IS_ASCII(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007177 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
7178 PyUnicode_GET_LENGTH(unicode));
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007179 return unicode_encode_ucs1(unicode, errors, 128);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007180}
7181
7182PyObject *
7183PyUnicode_AsASCIIString(PyObject *unicode)
7184{
7185 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007186}
7187
Steve Dowercc16be82016-09-08 10:35:16 -07007188#ifdef MS_WINDOWS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007189
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007190/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007191
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00007192#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007193#define NEED_RETRY
7194#endif
7195
/* INT_MAX is the theoretical largest chunk (or INT_MAX / 2 when
   transcoding from UTF-16), but INT_MAX / 4 performs better in
   both cases and also avoids partial characters overrunning the
   length limit in MultiByteToWideChar on Windows */
7200#define DECODING_CHUNK_SIZE (INT_MAX/4)
7201
Victor Stinner3a50e702011-10-18 21:21:00 +02007202#ifndef WC_ERR_INVALID_CHARS
7203# define WC_ERR_INVALID_CHARS 0x0080
7204#endif
7205
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007206static const char*
Victor Stinner3a50e702011-10-18 21:21:00 +02007207code_page_name(UINT code_page, PyObject **obj)
7208{
7209 *obj = NULL;
7210 if (code_page == CP_ACP)
7211 return "mbcs";
7212 if (code_page == CP_UTF7)
7213 return "CP_UTF7";
7214 if (code_page == CP_UTF8)
7215 return "CP_UTF8";
7216
7217 *obj = PyBytes_FromFormat("cp%u", code_page);
7218 if (*obj == NULL)
7219 return NULL;
7220 return PyBytes_AS_STRING(*obj);
7221}
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007222
Victor Stinner3a50e702011-10-18 21:21:00 +02007223static DWORD
7224decode_code_page_flags(UINT code_page)
7225{
7226 if (code_page == CP_UTF7) {
7227 /* The CP_UTF7 decoder only supports flags=0 */
7228 return 0;
7229 }
7230 else
7231 return MB_ERR_INVALID_CHARS;
7232}
7233
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007234/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007235 * Decode a byte string from a Windows code page into unicode object in strict
7236 * mode.
7237 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007238 * Returns consumed size if succeed, returns -2 on decode error, or raise an
7239 * OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007240 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007241static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007242decode_code_page_strict(UINT code_page,
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007243 wchar_t **buf,
7244 Py_ssize_t *bufsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02007245 const char *in,
7246 int insize)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007247{
Serhiy Storchakac1e2c282019-03-20 21:45:18 +02007248 DWORD flags = MB_ERR_INVALID_CHARS;
Victor Stinner24729f32011-11-10 20:31:37 +01007249 wchar_t *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007250 DWORD outsize;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007251
7252 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007253 assert(insize > 0);
Serhiy Storchakac1e2c282019-03-20 21:45:18 +02007254 while ((outsize = MultiByteToWideChar(code_page, flags,
7255 in, insize, NULL, 0)) <= 0)
7256 {
7257 if (!flags || GetLastError() != ERROR_INVALID_FLAGS) {
7258 goto error;
7259 }
7260 /* For some code pages (e.g. UTF-7) flags must be set to 0. */
7261 flags = 0;
7262 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007263
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007264 /* Extend a wchar_t* buffer */
7265 Py_ssize_t n = *bufsize; /* Get the current length */
7266 if (widechar_resize(buf, bufsize, n + outsize) < 0) {
7267 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007268 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007269 out = *buf + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007270
7271 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007272 outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
7273 if (outsize <= 0)
7274 goto error;
7275 return insize;
Victor Stinner554f3f02010-06-16 23:33:54 +00007276
Victor Stinner3a50e702011-10-18 21:21:00 +02007277error:
7278 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7279 return -2;
7280 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007281 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007282}
7283
Victor Stinner3a50e702011-10-18 21:21:00 +02007284/*
7285 * Decode a byte string from a code page into unicode object with an error
7286 * handler.
7287 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007288 * Returns consumed size if succeed, or raise an OSError or
Victor Stinner3a50e702011-10-18 21:21:00 +02007289 * UnicodeDecodeError exception and returns -1 on error.
7290 */
7291static int
7292decode_code_page_errors(UINT code_page,
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007293 wchar_t **buf,
7294 Py_ssize_t *bufsize,
Victor Stinner76a31a62011-11-04 00:05:13 +01007295 const char *in, const int size,
Victor Stinner7d00cc12014-03-17 23:08:06 +01007296 const char *errors, int final)
Victor Stinner3a50e702011-10-18 21:21:00 +02007297{
7298 const char *startin = in;
7299 const char *endin = in + size;
Serhiy Storchakac1e2c282019-03-20 21:45:18 +02007300 DWORD flags = MB_ERR_INVALID_CHARS;
Victor Stinner3a50e702011-10-18 21:21:00 +02007301 /* Ideally, we should get reason from FormatMessage. This is the Windows
7302 2000 English version of the message. */
7303 const char *reason = "No mapping for the Unicode character exists "
7304 "in the target code page.";
7305 /* each step cannot decode more than 1 character, but a character can be
7306 represented as a surrogate pair */
Serhiy Storchaka4013c172018-12-03 10:36:45 +02007307 wchar_t buffer[2], *out;
Victor Stinner9f067f42013-06-05 00:21:31 +02007308 int insize;
7309 Py_ssize_t outsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007310 PyObject *errorHandler = NULL;
7311 PyObject *exc = NULL;
7312 PyObject *encoding_obj = NULL;
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007313 const char *encoding;
Victor Stinner3a50e702011-10-18 21:21:00 +02007314 DWORD err;
7315 int ret = -1;
7316
7317 assert(size > 0);
7318
7319 encoding = code_page_name(code_page, &encoding_obj);
7320 if (encoding == NULL)
7321 return -1;
7322
Victor Stinner7d00cc12014-03-17 23:08:06 +01007323 if ((errors == NULL || strcmp(errors, "strict") == 0) && final) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007324 /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
7325 UnicodeDecodeError. */
7326 make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
7327 if (exc != NULL) {
7328 PyCodec_StrictErrors(exc);
7329 Py_CLEAR(exc);
7330 }
7331 goto error;
7332 }
7333
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007334 /* Extend a wchar_t* buffer */
7335 Py_ssize_t n = *bufsize; /* Get the current length */
7336 if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7337 PyErr_NoMemory();
7338 goto error;
Victor Stinner3a50e702011-10-18 21:21:00 +02007339 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007340 if (widechar_resize(buf, bufsize, n + size * Py_ARRAY_LENGTH(buffer)) < 0) {
7341 goto error;
Victor Stinner3a50e702011-10-18 21:21:00 +02007342 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007343 out = *buf + n;
Victor Stinner3a50e702011-10-18 21:21:00 +02007344
7345 /* Decode the byte string character per character */
Victor Stinner3a50e702011-10-18 21:21:00 +02007346 while (in < endin)
7347 {
7348 /* Decode a character */
7349 insize = 1;
7350 do
7351 {
7352 outsize = MultiByteToWideChar(code_page, flags,
7353 in, insize,
7354 buffer, Py_ARRAY_LENGTH(buffer));
7355 if (outsize > 0)
7356 break;
7357 err = GetLastError();
Serhiy Storchakac1e2c282019-03-20 21:45:18 +02007358 if (err == ERROR_INVALID_FLAGS && flags) {
7359 /* For some code pages (e.g. UTF-7) flags must be set to 0. */
7360 flags = 0;
7361 continue;
7362 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007363 if (err != ERROR_NO_UNICODE_TRANSLATION
7364 && err != ERROR_INSUFFICIENT_BUFFER)
7365 {
7366 PyErr_SetFromWindowsErr(0);
7367 goto error;
7368 }
7369 insize++;
7370 }
7371 /* 4=maximum length of a UTF-8 sequence */
7372 while (insize <= 4 && (in + insize) <= endin);
7373
7374 if (outsize <= 0) {
7375 Py_ssize_t startinpos, endinpos, outpos;
7376
Victor Stinner7d00cc12014-03-17 23:08:06 +01007377 /* last character in partial decode? */
7378 if (in + insize >= endin && !final)
7379 break;
7380
Victor Stinner3a50e702011-10-18 21:21:00 +02007381 startinpos = in - startin;
7382 endinpos = startinpos + 1;
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007383 outpos = out - *buf;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007384 if (unicode_decode_call_errorhandler_wchar(
Victor Stinner3a50e702011-10-18 21:21:00 +02007385 errors, &errorHandler,
7386 encoding, reason,
7387 &startin, &endin, &startinpos, &endinpos, &exc, &in,
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007388 buf, bufsize, &outpos))
Victor Stinner3a50e702011-10-18 21:21:00 +02007389 {
7390 goto error;
7391 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007392 out = *buf + outpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007393 }
7394 else {
7395 in += insize;
7396 memcpy(out, buffer, outsize * sizeof(wchar_t));
7397 out += outsize;
7398 }
7399 }
7400
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007401 /* Shrink the buffer */
7402 assert(out - *buf <= *bufsize);
7403 *bufsize = out - *buf;
Victor Stinnere1f17c62014-07-25 14:03:03 +02007404 /* (in - startin) <= size and size is an int */
7405 ret = Py_SAFE_DOWNCAST(in - startin, Py_ssize_t, int);
Victor Stinner3a50e702011-10-18 21:21:00 +02007406
7407error:
7408 Py_XDECREF(encoding_obj);
7409 Py_XDECREF(errorHandler);
7410 Py_XDECREF(exc);
7411 return ret;
7412}
7413
Victor Stinner3a50e702011-10-18 21:21:00 +02007414static PyObject *
7415decode_code_page_stateful(int code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007416 const char *s, Py_ssize_t size,
7417 const char *errors, Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007418{
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007419 wchar_t *buf = NULL;
7420 Py_ssize_t bufsize = 0;
Victor Stinner76a31a62011-11-04 00:05:13 +01007421 int chunk_size, final, converted, done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007422
Victor Stinner3a50e702011-10-18 21:21:00 +02007423 if (code_page < 0) {
7424 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7425 return NULL;
7426 }
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03007427 if (size < 0) {
7428 PyErr_BadInternalCall();
7429 return NULL;
7430 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007431
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007432 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00007433 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007434
Victor Stinner76a31a62011-11-04 00:05:13 +01007435 do
7436 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007437#ifdef NEED_RETRY
Steve Dower7ebdda02019-08-21 16:22:33 -07007438 if (size > DECODING_CHUNK_SIZE) {
7439 chunk_size = DECODING_CHUNK_SIZE;
Victor Stinner76a31a62011-11-04 00:05:13 +01007440 final = 0;
7441 done = 0;
7442 }
7443 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007444#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007445 {
7446 chunk_size = (int)size;
7447 final = (consumed == NULL);
7448 done = 1;
7449 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007450
Victor Stinner76a31a62011-11-04 00:05:13 +01007451 if (chunk_size == 0 && done) {
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007452 if (buf != NULL)
Victor Stinner76a31a62011-11-04 00:05:13 +01007453 break;
Serhiy Storchaka678db842013-01-26 12:16:36 +02007454 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner76a31a62011-11-04 00:05:13 +01007455 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007456
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007457 converted = decode_code_page_strict(code_page, &buf, &bufsize,
Victor Stinner76a31a62011-11-04 00:05:13 +01007458 s, chunk_size);
7459 if (converted == -2)
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007460 converted = decode_code_page_errors(code_page, &buf, &bufsize,
Victor Stinner76a31a62011-11-04 00:05:13 +01007461 s, chunk_size,
Victor Stinner7d00cc12014-03-17 23:08:06 +01007462 errors, final);
7463 assert(converted != 0 || done);
Victor Stinner76a31a62011-11-04 00:05:13 +01007464
7465 if (converted < 0) {
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007466 PyMem_Free(buf);
Victor Stinner76a31a62011-11-04 00:05:13 +01007467 return NULL;
7468 }
7469
7470 if (consumed)
7471 *consumed += converted;
7472
7473 s += converted;
7474 size -= converted;
7475 } while (!done);
Victor Stinner3a50e702011-10-18 21:21:00 +02007476
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007477 PyObject *v = PyUnicode_FromWideChar(buf, bufsize);
7478 PyMem_Free(buf);
7479 return v;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007480}
7481
Alexander Belopolsky40018472011-02-26 01:02:56 +00007482PyObject *
Victor Stinner3a50e702011-10-18 21:21:00 +02007483PyUnicode_DecodeCodePageStateful(int code_page,
7484 const char *s,
7485 Py_ssize_t size,
7486 const char *errors,
7487 Py_ssize_t *consumed)
7488{
7489 return decode_code_page_stateful(code_page, s, size, errors, consumed);
7490}
7491
7492PyObject *
7493PyUnicode_DecodeMBCSStateful(const char *s,
7494 Py_ssize_t size,
7495 const char *errors,
7496 Py_ssize_t *consumed)
7497{
7498 return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
7499}
7500
7501PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00007502PyUnicode_DecodeMBCS(const char *s,
7503 Py_ssize_t size,
7504 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007505{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007506 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
7507}
7508
Victor Stinner3a50e702011-10-18 21:21:00 +02007509static DWORD
7510encode_code_page_flags(UINT code_page, const char *errors)
7511{
7512 if (code_page == CP_UTF8) {
Steve Dower3e96f322015-03-02 08:01:10 -08007513 return WC_ERR_INVALID_CHARS;
Victor Stinner3a50e702011-10-18 21:21:00 +02007514 }
7515 else if (code_page == CP_UTF7) {
7516 /* CP_UTF7 only supports flags=0 */
7517 return 0;
7518 }
7519 else {
7520 if (errors != NULL && strcmp(errors, "replace") == 0)
7521 return 0;
7522 else
7523 return WC_NO_BEST_FIT_CHARS;
7524 }
7525}
7526
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007527/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007528 * Encode a Unicode string to a Windows code page into a byte string in strict
7529 * mode.
7530 *
7531 * Returns consumed characters if succeed, returns -2 on encode error, or raise
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007532 * an OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007533 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007534static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007535encode_code_page_strict(UINT code_page, PyObject **outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007536 PyObject *unicode, Py_ssize_t offset, int len,
Victor Stinner3a50e702011-10-18 21:21:00 +02007537 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007538{
Victor Stinner554f3f02010-06-16 23:33:54 +00007539 BOOL usedDefaultChar = FALSE;
Victor Stinner3a50e702011-10-18 21:21:00 +02007540 BOOL *pusedDefaultChar = &usedDefaultChar;
7541 int outsize;
Victor Stinner24729f32011-11-10 20:31:37 +01007542 wchar_t *p;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007543 Py_ssize_t size;
Victor Stinner3a50e702011-10-18 21:21:00 +02007544 const DWORD flags = encode_code_page_flags(code_page, NULL);
7545 char *out;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007546 /* Create a substring so that we can get the UTF-16 representation
7547 of just the slice under consideration. */
7548 PyObject *substring;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007549
Martin v. Löwis3d325192011-11-04 18:23:06 +01007550 assert(len > 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007551
Victor Stinner3a50e702011-10-18 21:21:00 +02007552 if (code_page != CP_UTF8 && code_page != CP_UTF7)
Victor Stinner554f3f02010-06-16 23:33:54 +00007553 pusedDefaultChar = &usedDefaultChar;
Victor Stinner3a50e702011-10-18 21:21:00 +02007554 else
Victor Stinner554f3f02010-06-16 23:33:54 +00007555 pusedDefaultChar = NULL;
Victor Stinner554f3f02010-06-16 23:33:54 +00007556
Victor Stinner2fc507f2011-11-04 20:06:39 +01007557 substring = PyUnicode_Substring(unicode, offset, offset+len);
7558 if (substring == NULL)
7559 return -1;
7560 p = PyUnicode_AsUnicodeAndSize(substring, &size);
7561 if (p == NULL) {
7562 Py_DECREF(substring);
7563 return -1;
7564 }
Victor Stinner9f067f42013-06-05 00:21:31 +02007565 assert(size <= INT_MAX);
Martin v. Löwis3d325192011-11-04 18:23:06 +01007566
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007567 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007568 outsize = WideCharToMultiByte(code_page, flags,
Victor Stinner9f067f42013-06-05 00:21:31 +02007569 p, (int)size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007570 NULL, 0,
7571 NULL, pusedDefaultChar);
7572 if (outsize <= 0)
7573 goto error;
7574 /* If we used a default char, then we failed! */
Victor Stinner2fc507f2011-11-04 20:06:39 +01007575 if (pusedDefaultChar && *pusedDefaultChar) {
7576 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007577 return -2;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007578 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007579
Victor Stinner3a50e702011-10-18 21:21:00 +02007580 if (*outbytes == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007581 /* Create string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007582 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007583 if (*outbytes == NULL) {
7584 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007585 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007586 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007587 out = PyBytes_AS_STRING(*outbytes);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007588 }
7589 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007590 /* Extend string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007591 const Py_ssize_t n = PyBytes_Size(*outbytes);
7592 if (outsize > PY_SSIZE_T_MAX - n) {
7593 PyErr_NoMemory();
Victor Stinner2fc507f2011-11-04 20:06:39 +01007594 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007595 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007596 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007597 if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
7598 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007599 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007600 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007601 out = PyBytes_AS_STRING(*outbytes) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007602 }
7603
7604 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007605 outsize = WideCharToMultiByte(code_page, flags,
Victor Stinner9f067f42013-06-05 00:21:31 +02007606 p, (int)size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007607 out, outsize,
7608 NULL, pusedDefaultChar);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007609 Py_CLEAR(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007610 if (outsize <= 0)
7611 goto error;
7612 if (pusedDefaultChar && *pusedDefaultChar)
7613 return -2;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007614 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00007615
Victor Stinner3a50e702011-10-18 21:21:00 +02007616error:
Victor Stinner2fc507f2011-11-04 20:06:39 +01007617 Py_XDECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007618 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7619 return -2;
7620 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007621 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007622}
7623
Victor Stinner3a50e702011-10-18 21:21:00 +02007624/*
Serhiy Storchakad65c9492015-11-02 14:10:23 +02007625 * Encode a Unicode string to a Windows code page into a byte string using an
Victor Stinner3a50e702011-10-18 21:21:00 +02007626 * error handler.
7627 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007628 * Returns consumed characters if succeed, or raise an OSError and returns
Victor Stinner3a50e702011-10-18 21:21:00 +02007629 * -1 on other error.
7630 */
7631static int
7632encode_code_page_errors(UINT code_page, PyObject **outbytes,
Victor Stinner7581cef2011-11-03 22:32:33 +01007633 PyObject *unicode, Py_ssize_t unicode_offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007634 Py_ssize_t insize, const char* errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007635{
Victor Stinner3a50e702011-10-18 21:21:00 +02007636 const DWORD flags = encode_code_page_flags(code_page, errors);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007637 Py_ssize_t pos = unicode_offset;
7638 Py_ssize_t endin = unicode_offset + insize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007639 /* Ideally, we should get reason from FormatMessage. This is the Windows
7640 2000 English version of the message. */
7641 const char *reason = "invalid character";
7642 /* 4=maximum length of a UTF-8 sequence */
7643 char buffer[4];
7644 BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
7645 Py_ssize_t outsize;
7646 char *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007647 PyObject *errorHandler = NULL;
7648 PyObject *exc = NULL;
7649 PyObject *encoding_obj = NULL;
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007650 const char *encoding;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007651 Py_ssize_t newpos, newoutsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007652 PyObject *rep;
7653 int ret = -1;
7654
7655 assert(insize > 0);
7656
7657 encoding = code_page_name(code_page, &encoding_obj);
7658 if (encoding == NULL)
7659 return -1;
7660
7661 if (errors == NULL || strcmp(errors, "strict") == 0) {
7662 /* The last error was ERROR_NO_UNICODE_TRANSLATION,
7663 then we raise a UnicodeEncodeError. */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007664 make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
Victor Stinner3a50e702011-10-18 21:21:00 +02007665 if (exc != NULL) {
7666 PyCodec_StrictErrors(exc);
7667 Py_DECREF(exc);
7668 }
7669 Py_XDECREF(encoding_obj);
7670 return -1;
7671 }
7672
7673 if (code_page != CP_UTF8 && code_page != CP_UTF7)
7674 pusedDefaultChar = &usedDefaultChar;
7675 else
7676 pusedDefaultChar = NULL;
7677
7678 if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
7679 PyErr_NoMemory();
7680 goto error;
7681 }
7682 outsize = insize * Py_ARRAY_LENGTH(buffer);
7683
7684 if (*outbytes == NULL) {
7685 /* Create string object */
7686 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7687 if (*outbytes == NULL)
7688 goto error;
7689 out = PyBytes_AS_STRING(*outbytes);
7690 }
7691 else {
7692 /* Extend string object */
7693 Py_ssize_t n = PyBytes_Size(*outbytes);
7694 if (n > PY_SSIZE_T_MAX - outsize) {
7695 PyErr_NoMemory();
7696 goto error;
7697 }
7698 if (_PyBytes_Resize(outbytes, n + outsize) < 0)
7699 goto error;
7700 out = PyBytes_AS_STRING(*outbytes) + n;
7701 }
7702
7703 /* Encode the string character per character */
Martin v. Löwis3d325192011-11-04 18:23:06 +01007704 while (pos < endin)
Victor Stinner3a50e702011-10-18 21:21:00 +02007705 {
Victor Stinner2fc507f2011-11-04 20:06:39 +01007706 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
7707 wchar_t chars[2];
7708 int charsize;
7709 if (ch < 0x10000) {
7710 chars[0] = (wchar_t)ch;
7711 charsize = 1;
7712 }
7713 else {
Victor Stinner76df43d2012-10-30 01:42:39 +01007714 chars[0] = Py_UNICODE_HIGH_SURROGATE(ch);
7715 chars[1] = Py_UNICODE_LOW_SURROGATE(ch);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007716 charsize = 2;
7717 }
7718
Victor Stinner3a50e702011-10-18 21:21:00 +02007719 outsize = WideCharToMultiByte(code_page, flags,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007720 chars, charsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02007721 buffer, Py_ARRAY_LENGTH(buffer),
7722 NULL, pusedDefaultChar);
7723 if (outsize > 0) {
7724 if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
7725 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007726 pos++;
Victor Stinner3a50e702011-10-18 21:21:00 +02007727 memcpy(out, buffer, outsize);
7728 out += outsize;
7729 continue;
7730 }
7731 }
7732 else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
7733 PyErr_SetFromWindowsErr(0);
7734 goto error;
7735 }
7736
Victor Stinner3a50e702011-10-18 21:21:00 +02007737 rep = unicode_encode_call_errorhandler(
7738 errors, &errorHandler, encoding, reason,
Victor Stinner7581cef2011-11-03 22:32:33 +01007739 unicode, &exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007740 pos, pos + 1, &newpos);
Victor Stinner3a50e702011-10-18 21:21:00 +02007741 if (rep == NULL)
7742 goto error;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007743 pos = newpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007744
7745 if (PyBytes_Check(rep)) {
7746 outsize = PyBytes_GET_SIZE(rep);
7747 if (outsize != 1) {
7748 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7749 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7750 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7751 Py_DECREF(rep);
7752 goto error;
7753 }
7754 out = PyBytes_AS_STRING(*outbytes) + offset;
7755 }
7756 memcpy(out, PyBytes_AS_STRING(rep), outsize);
7757 out += outsize;
7758 }
7759 else {
7760 Py_ssize_t i;
7761 enum PyUnicode_Kind kind;
7762 void *data;
7763
Benjamin Petersonbac79492012-01-14 13:34:47 -05007764 if (PyUnicode_READY(rep) == -1) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007765 Py_DECREF(rep);
7766 goto error;
7767 }
7768
7769 outsize = PyUnicode_GET_LENGTH(rep);
7770 if (outsize != 1) {
7771 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7772 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7773 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7774 Py_DECREF(rep);
7775 goto error;
7776 }
7777 out = PyBytes_AS_STRING(*outbytes) + offset;
7778 }
7779 kind = PyUnicode_KIND(rep);
7780 data = PyUnicode_DATA(rep);
7781 for (i=0; i < outsize; i++) {
7782 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7783 if (ch > 127) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007784 raise_encode_exception(&exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007785 encoding, unicode,
7786 pos, pos + 1,
Victor Stinner3a50e702011-10-18 21:21:00 +02007787 "unable to encode error handler result to ASCII");
7788 Py_DECREF(rep);
7789 goto error;
7790 }
7791 *out = (unsigned char)ch;
7792 out++;
7793 }
7794 }
7795 Py_DECREF(rep);
7796 }
7797 /* write a NUL byte */
7798 *out = 0;
7799 outsize = out - PyBytes_AS_STRING(*outbytes);
7800 assert(outsize <= PyBytes_GET_SIZE(*outbytes));
7801 if (_PyBytes_Resize(outbytes, outsize) < 0)
7802 goto error;
7803 ret = 0;
7804
7805error:
7806 Py_XDECREF(encoding_obj);
7807 Py_XDECREF(errorHandler);
7808 Py_XDECREF(exc);
7809 return ret;
7810}
7811
Victor Stinner3a50e702011-10-18 21:21:00 +02007812static PyObject *
7813encode_code_page(int code_page,
Victor Stinner7581cef2011-11-03 22:32:33 +01007814 PyObject *unicode,
Victor Stinner3a50e702011-10-18 21:21:00 +02007815 const char *errors)
7816{
Martin v. Löwis3d325192011-11-04 18:23:06 +01007817 Py_ssize_t len;
Victor Stinner3a50e702011-10-18 21:21:00 +02007818 PyObject *outbytes = NULL;
Victor Stinner7581cef2011-11-03 22:32:33 +01007819 Py_ssize_t offset;
Victor Stinner76a31a62011-11-04 00:05:13 +01007820 int chunk_len, ret, done;
Victor Stinner7581cef2011-11-03 22:32:33 +01007821
Victor Stinner29dacf22015-01-26 16:41:32 +01007822 if (!PyUnicode_Check(unicode)) {
7823 PyErr_BadArgument();
7824 return NULL;
7825 }
7826
Benjamin Petersonbac79492012-01-14 13:34:47 -05007827 if (PyUnicode_READY(unicode) == -1)
Victor Stinner2fc507f2011-11-04 20:06:39 +01007828 return NULL;
7829 len = PyUnicode_GET_LENGTH(unicode);
Guido van Rossum03e29f12000-05-04 15:52:20 +00007830
Victor Stinner3a50e702011-10-18 21:21:00 +02007831 if (code_page < 0) {
7832 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7833 return NULL;
7834 }
7835
Martin v. Löwis3d325192011-11-04 18:23:06 +01007836 if (len == 0)
Victor Stinner76a31a62011-11-04 00:05:13 +01007837 return PyBytes_FromStringAndSize(NULL, 0);
7838
Victor Stinner7581cef2011-11-03 22:32:33 +01007839 offset = 0;
7840 do
7841 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007842#ifdef NEED_RETRY
Steve Dower7ebdda02019-08-21 16:22:33 -07007843 if (len > DECODING_CHUNK_SIZE) {
7844 chunk_len = DECODING_CHUNK_SIZE;
Victor Stinner76a31a62011-11-04 00:05:13 +01007845 done = 0;
7846 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007847 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007848#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007849 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007850 chunk_len = (int)len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007851 done = 1;
7852 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007853
Victor Stinner76a31a62011-11-04 00:05:13 +01007854 ret = encode_code_page_strict(code_page, &outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007855 unicode, offset, chunk_len,
Victor Stinner76a31a62011-11-04 00:05:13 +01007856 errors);
7857 if (ret == -2)
7858 ret = encode_code_page_errors(code_page, &outbytes,
7859 unicode, offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007860 chunk_len, errors);
Victor Stinner7581cef2011-11-03 22:32:33 +01007861 if (ret < 0) {
7862 Py_XDECREF(outbytes);
7863 return NULL;
7864 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007865
Victor Stinner7581cef2011-11-03 22:32:33 +01007866 offset += chunk_len;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007867 len -= chunk_len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007868 } while (!done);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007869
Victor Stinner3a50e702011-10-18 21:21:00 +02007870 return outbytes;
7871}
7872
7873PyObject *
7874PyUnicode_EncodeMBCS(const Py_UNICODE *p,
7875 Py_ssize_t size,
7876 const char *errors)
7877{
Victor Stinner7581cef2011-11-03 22:32:33 +01007878 PyObject *unicode, *res;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02007879 unicode = PyUnicode_FromWideChar(p, size);
Victor Stinner7581cef2011-11-03 22:32:33 +01007880 if (unicode == NULL)
7881 return NULL;
7882 res = encode_code_page(CP_ACP, unicode, errors);
7883 Py_DECREF(unicode);
7884 return res;
Victor Stinner3a50e702011-10-18 21:21:00 +02007885}
7886
7887PyObject *
7888PyUnicode_EncodeCodePage(int code_page,
7889 PyObject *unicode,
7890 const char *errors)
7891{
Victor Stinner7581cef2011-11-03 22:32:33 +01007892 return encode_code_page(code_page, unicode, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007893}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007894
Alexander Belopolsky40018472011-02-26 01:02:56 +00007895PyObject *
7896PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007897{
Victor Stinner7581cef2011-11-03 22:32:33 +01007898 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007899}
7900
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007901#undef NEED_RETRY
7902
Steve Dowercc16be82016-09-08 10:35:16 -07007903#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007904
Guido van Rossumd57fd912000-03-10 22:53:23 +00007905/* --- Character Mapping Codec -------------------------------------------- */
7906
Victor Stinnerfb161b12013-04-18 01:44:27 +02007907static int
7908charmap_decode_string(const char *s,
7909 Py_ssize_t size,
7910 PyObject *mapping,
7911 const char *errors,
7912 _PyUnicodeWriter *writer)
7913{
7914 const char *starts = s;
7915 const char *e;
7916 Py_ssize_t startinpos, endinpos;
7917 PyObject *errorHandler = NULL, *exc = NULL;
7918 Py_ssize_t maplen;
7919 enum PyUnicode_Kind mapkind;
7920 void *mapdata;
7921 Py_UCS4 x;
7922 unsigned char ch;
7923
7924 if (PyUnicode_READY(mapping) == -1)
7925 return -1;
7926
7927 maplen = PyUnicode_GET_LENGTH(mapping);
7928 mapdata = PyUnicode_DATA(mapping);
7929 mapkind = PyUnicode_KIND(mapping);
7930
7931 e = s + size;
7932
7933 if (mapkind == PyUnicode_1BYTE_KIND && maplen >= 256) {
7934 /* fast-path for cp037, cp500 and iso8859_1 encodings. iso8859_1
7935 * is disabled in encoding aliases, latin1 is preferred because
7936 * its implementation is faster. */
7937 Py_UCS1 *mapdata_ucs1 = (Py_UCS1 *)mapdata;
7938 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
7939 Py_UCS4 maxchar = writer->maxchar;
7940
7941 assert (writer->kind == PyUnicode_1BYTE_KIND);
7942 while (s < e) {
7943 ch = *s;
7944 x = mapdata_ucs1[ch];
7945 if (x > maxchar) {
7946 if (_PyUnicodeWriter_Prepare(writer, 1, 0xff) == -1)
7947 goto onError;
7948 maxchar = writer->maxchar;
7949 outdata = (Py_UCS1 *)writer->data;
7950 }
7951 outdata[writer->pos] = x;
7952 writer->pos++;
7953 ++s;
7954 }
7955 return 0;
7956 }
7957
7958 while (s < e) {
7959 if (mapkind == PyUnicode_2BYTE_KIND && maplen >= 256) {
7960 enum PyUnicode_Kind outkind = writer->kind;
7961 Py_UCS2 *mapdata_ucs2 = (Py_UCS2 *)mapdata;
7962 if (outkind == PyUnicode_1BYTE_KIND) {
7963 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
7964 Py_UCS4 maxchar = writer->maxchar;
7965 while (s < e) {
7966 ch = *s;
7967 x = mapdata_ucs2[ch];
7968 if (x > maxchar)
7969 goto Error;
7970 outdata[writer->pos] = x;
7971 writer->pos++;
7972 ++s;
7973 }
7974 break;
7975 }
7976 else if (outkind == PyUnicode_2BYTE_KIND) {
7977 Py_UCS2 *outdata = (Py_UCS2 *)writer->data;
7978 while (s < e) {
7979 ch = *s;
7980 x = mapdata_ucs2[ch];
7981 if (x == 0xFFFE)
7982 goto Error;
7983 outdata[writer->pos] = x;
7984 writer->pos++;
7985 ++s;
7986 }
7987 break;
7988 }
7989 }
7990 ch = *s;
7991
7992 if (ch < maplen)
7993 x = PyUnicode_READ(mapkind, mapdata, ch);
7994 else
7995 x = 0xfffe; /* invalid value */
7996Error:
7997 if (x == 0xfffe)
7998 {
7999 /* undefined mapping */
8000 startinpos = s-starts;
8001 endinpos = startinpos+1;
8002 if (unicode_decode_call_errorhandler_writer(
8003 errors, &errorHandler,
8004 "charmap", "character maps to <undefined>",
8005 &starts, &e, &startinpos, &endinpos, &exc, &s,
8006 writer)) {
8007 goto onError;
8008 }
8009 continue;
8010 }
8011
8012 if (_PyUnicodeWriter_WriteCharInline(writer, x) < 0)
8013 goto onError;
8014 ++s;
8015 }
8016 Py_XDECREF(errorHandler);
8017 Py_XDECREF(exc);
8018 return 0;
8019
8020onError:
8021 Py_XDECREF(errorHandler);
8022 Py_XDECREF(exc);
8023 return -1;
8024}
8025
8026static int
8027charmap_decode_mapping(const char *s,
8028 Py_ssize_t size,
8029 PyObject *mapping,
8030 const char *errors,
8031 _PyUnicodeWriter *writer)
8032{
8033 const char *starts = s;
8034 const char *e;
8035 Py_ssize_t startinpos, endinpos;
8036 PyObject *errorHandler = NULL, *exc = NULL;
8037 unsigned char ch;
Victor Stinnerf4f24242013-05-07 01:01:31 +02008038 PyObject *key, *item = NULL;
Victor Stinnerfb161b12013-04-18 01:44:27 +02008039
8040 e = s + size;
8041
8042 while (s < e) {
8043 ch = *s;
8044
8045 /* Get mapping (char ordinal -> integer, Unicode char or None) */
8046 key = PyLong_FromLong((long)ch);
8047 if (key == NULL)
8048 goto onError;
8049
8050 item = PyObject_GetItem(mapping, key);
8051 Py_DECREF(key);
8052 if (item == NULL) {
8053 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8054 /* No mapping found means: mapping is undefined. */
8055 PyErr_Clear();
8056 goto Undefined;
8057 } else
8058 goto onError;
8059 }
8060
8061 /* Apply mapping */
8062 if (item == Py_None)
8063 goto Undefined;
8064 if (PyLong_Check(item)) {
8065 long value = PyLong_AS_LONG(item);
8066 if (value == 0xFFFE)
8067 goto Undefined;
8068 if (value < 0 || value > MAX_UNICODE) {
8069 PyErr_Format(PyExc_TypeError,
8070 "character mapping must be in range(0x%lx)",
8071 (unsigned long)MAX_UNICODE + 1);
8072 goto onError;
8073 }
8074
8075 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
8076 goto onError;
8077 }
8078 else if (PyUnicode_Check(item)) {
8079 if (PyUnicode_READY(item) == -1)
8080 goto onError;
8081 if (PyUnicode_GET_LENGTH(item) == 1) {
8082 Py_UCS4 value = PyUnicode_READ_CHAR(item, 0);
8083 if (value == 0xFFFE)
8084 goto Undefined;
8085 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
8086 goto onError;
8087 }
8088 else {
8089 writer->overallocate = 1;
8090 if (_PyUnicodeWriter_WriteStr(writer, item) == -1)
8091 goto onError;
8092 }
8093 }
8094 else {
8095 /* wrong return value */
8096 PyErr_SetString(PyExc_TypeError,
8097 "character mapping must return integer, None or str");
8098 goto onError;
8099 }
8100 Py_CLEAR(item);
8101 ++s;
8102 continue;
8103
8104Undefined:
8105 /* undefined mapping */
8106 Py_CLEAR(item);
8107 startinpos = s-starts;
8108 endinpos = startinpos+1;
8109 if (unicode_decode_call_errorhandler_writer(
8110 errors, &errorHandler,
8111 "charmap", "character maps to <undefined>",
8112 &starts, &e, &startinpos, &endinpos, &exc, &s,
8113 writer)) {
8114 goto onError;
8115 }
8116 }
8117 Py_XDECREF(errorHandler);
8118 Py_XDECREF(exc);
8119 return 0;
8120
8121onError:
8122 Py_XDECREF(item);
8123 Py_XDECREF(errorHandler);
8124 Py_XDECREF(exc);
8125 return -1;
8126}
8127
Alexander Belopolsky40018472011-02-26 01:02:56 +00008128PyObject *
8129PyUnicode_DecodeCharmap(const char *s,
8130 Py_ssize_t size,
8131 PyObject *mapping,
8132 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008133{
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008134 _PyUnicodeWriter writer;
Tim Petersced69f82003-09-16 20:30:58 +00008135
Guido van Rossumd57fd912000-03-10 22:53:23 +00008136 /* Default to Latin-1 */
8137 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008138 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008139
Guido van Rossumd57fd912000-03-10 22:53:23 +00008140 if (size == 0)
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02008141 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner8f674cc2013-04-17 23:02:17 +02008142 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02008143 writer.min_length = size;
8144 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008145 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008146
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00008147 if (PyUnicode_CheckExact(mapping)) {
Victor Stinnerfb161b12013-04-18 01:44:27 +02008148 if (charmap_decode_string(s, size, mapping, errors, &writer) < 0)
8149 goto onError;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00008150 }
8151 else {
Victor Stinnerfb161b12013-04-18 01:44:27 +02008152 if (charmap_decode_mapping(s, size, mapping, errors, &writer) < 0)
8153 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008154 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008155 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00008156
Benjamin Peterson29060642009-01-31 22:14:21 +00008157 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008158 _PyUnicodeWriter_Dealloc(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008159 return NULL;
8160}
8161
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008162/* Charmap encoding: the lookup table */
8163
Alexander Belopolsky40018472011-02-26 01:02:56 +00008164struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00008165 PyObject_HEAD
8166 unsigned char level1[32];
8167 int count2, count3;
8168 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008169};
8170
8171static PyObject*
8172encoding_map_size(PyObject *obj, PyObject* args)
8173{
8174 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008175 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00008176 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008177}
8178
8179static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008180 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00008181 PyDoc_STR("Return the size (in bytes) of this object") },
8182 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008183};
8184
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008185static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008186 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008187 "EncodingMap", /*tp_name*/
8188 sizeof(struct encoding_map), /*tp_basicsize*/
8189 0, /*tp_itemsize*/
8190 /* methods */
Inada Naoki7d408692019-05-29 17:23:27 +09008191 0, /*tp_dealloc*/
Jeroen Demeyer530f5062019-05-31 04:13:39 +02008192 0, /*tp_vectorcall_offset*/
Benjamin Peterson29060642009-01-31 22:14:21 +00008193 0, /*tp_getattr*/
8194 0, /*tp_setattr*/
Jeroen Demeyer530f5062019-05-31 04:13:39 +02008195 0, /*tp_as_async*/
Benjamin Peterson29060642009-01-31 22:14:21 +00008196 0, /*tp_repr*/
8197 0, /*tp_as_number*/
8198 0, /*tp_as_sequence*/
8199 0, /*tp_as_mapping*/
8200 0, /*tp_hash*/
8201 0, /*tp_call*/
8202 0, /*tp_str*/
8203 0, /*tp_getattro*/
8204 0, /*tp_setattro*/
8205 0, /*tp_as_buffer*/
8206 Py_TPFLAGS_DEFAULT, /*tp_flags*/
8207 0, /*tp_doc*/
8208 0, /*tp_traverse*/
8209 0, /*tp_clear*/
8210 0, /*tp_richcompare*/
8211 0, /*tp_weaklistoffset*/
8212 0, /*tp_iter*/
8213 0, /*tp_iternext*/
8214 encoding_map_methods, /*tp_methods*/
8215 0, /*tp_members*/
8216 0, /*tp_getset*/
8217 0, /*tp_base*/
8218 0, /*tp_dict*/
8219 0, /*tp_descr_get*/
8220 0, /*tp_descr_set*/
8221 0, /*tp_dictoffset*/
8222 0, /*tp_init*/
8223 0, /*tp_alloc*/
8224 0, /*tp_new*/
8225 0, /*tp_free*/
8226 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008227};
8228
8229PyObject*
8230PyUnicode_BuildEncodingMap(PyObject* string)
8231{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008232 PyObject *result;
8233 struct encoding_map *mresult;
8234 int i;
8235 int need_dict = 0;
8236 unsigned char level1[32];
8237 unsigned char level2[512];
8238 unsigned char *mlevel1, *mlevel2, *mlevel3;
8239 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008240 int kind;
8241 void *data;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008242 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008243 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008244
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008245 if (!PyUnicode_Check(string) || !PyUnicode_GET_LENGTH(string)) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008246 PyErr_BadArgument();
8247 return NULL;
8248 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008249 kind = PyUnicode_KIND(string);
8250 data = PyUnicode_DATA(string);
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008251 length = PyUnicode_GET_LENGTH(string);
8252 length = Py_MIN(length, 256);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008253 memset(level1, 0xFF, sizeof level1);
8254 memset(level2, 0xFF, sizeof level2);
8255
8256 /* If there isn't a one-to-one mapping of NULL to \0,
8257 or if there are non-BMP characters, we need to use
8258 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008259 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008260 need_dict = 1;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008261 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008262 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008263 ch = PyUnicode_READ(kind, data, i);
8264 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008265 need_dict = 1;
8266 break;
8267 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008268 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008269 /* unmapped character */
8270 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008271 l1 = ch >> 11;
8272 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008273 if (level1[l1] == 0xFF)
8274 level1[l1] = count2++;
8275 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00008276 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008277 }
8278
8279 if (count2 >= 0xFF || count3 >= 0xFF)
8280 need_dict = 1;
8281
8282 if (need_dict) {
8283 PyObject *result = PyDict_New();
8284 PyObject *key, *value;
8285 if (!result)
8286 return NULL;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008287 for (i = 0; i < length; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008288 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00008289 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008290 if (!key || !value)
8291 goto failed1;
8292 if (PyDict_SetItem(result, key, value) == -1)
8293 goto failed1;
8294 Py_DECREF(key);
8295 Py_DECREF(value);
8296 }
8297 return result;
8298 failed1:
8299 Py_XDECREF(key);
8300 Py_XDECREF(value);
8301 Py_DECREF(result);
8302 return NULL;
8303 }
8304
8305 /* Create a three-level trie */
8306 result = PyObject_MALLOC(sizeof(struct encoding_map) +
8307 16*count2 + 128*count3 - 1);
8308 if (!result)
8309 return PyErr_NoMemory();
8310 PyObject_Init(result, &EncodingMapType);
8311 mresult = (struct encoding_map*)result;
8312 mresult->count2 = count2;
8313 mresult->count3 = count3;
8314 mlevel1 = mresult->level1;
8315 mlevel2 = mresult->level23;
8316 mlevel3 = mresult->level23 + 16*count2;
8317 memcpy(mlevel1, level1, 32);
8318 memset(mlevel2, 0xFF, 16*count2);
8319 memset(mlevel3, 0, 128*count3);
8320 count3 = 0;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008321 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008322 int o1, o2, o3, i2, i3;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008323 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8324 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008325 /* unmapped character */
8326 continue;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008327 o1 = ch>>11;
8328 o2 = (ch>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008329 i2 = 16*mlevel1[o1] + o2;
8330 if (mlevel2[i2] == 0xFF)
8331 mlevel2[i2] = count3++;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008332 o3 = ch & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008333 i3 = 128*mlevel2[i2] + o3;
8334 mlevel3[i3] = i;
8335 }
8336 return result;
8337}
8338
8339static int
Victor Stinner22168992011-11-20 17:09:18 +01008340encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008341{
8342 struct encoding_map *map = (struct encoding_map*)mapping;
8343 int l1 = c>>11;
8344 int l2 = (c>>7) & 0xF;
8345 int l3 = c & 0x7F;
8346 int i;
8347
Victor Stinner22168992011-11-20 17:09:18 +01008348 if (c > 0xFFFF)
Benjamin Peterson29060642009-01-31 22:14:21 +00008349 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008350 if (c == 0)
8351 return 0;
8352 /* level 1*/
8353 i = map->level1[l1];
8354 if (i == 0xFF) {
8355 return -1;
8356 }
8357 /* level 2*/
8358 i = map->level23[16*i+l2];
8359 if (i == 0xFF) {
8360 return -1;
8361 }
8362 /* level 3 */
8363 i = map->level23[16*map->count2 + 128*i + l3];
8364 if (i == 0) {
8365 return -1;
8366 }
8367 return i;
8368}
8369
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008370/* Lookup the character ch in the mapping. If the character
8371 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00008372 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008373static PyObject *
Victor Stinner22168992011-11-20 17:09:18 +01008374charmapencode_lookup(Py_UCS4 c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008375{
Christian Heimes217cfd12007-12-02 14:31:20 +00008376 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008377 PyObject *x;
8378
8379 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008380 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008381 x = PyObject_GetItem(mapping, w);
8382 Py_DECREF(w);
8383 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008384 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8385 /* No mapping found means: mapping is undefined. */
8386 PyErr_Clear();
Serhiy Storchaka228b12e2017-01-23 09:47:21 +02008387 Py_RETURN_NONE;
Benjamin Peterson29060642009-01-31 22:14:21 +00008388 } else
8389 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008390 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00008391 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008392 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00008393 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008394 long value = PyLong_AS_LONG(x);
8395 if (value < 0 || value > 255) {
8396 PyErr_SetString(PyExc_TypeError,
8397 "character mapping must be in range(256)");
8398 Py_DECREF(x);
8399 return NULL;
8400 }
8401 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008402 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008403 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008404 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008405 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008406 /* wrong return value */
8407 PyErr_Format(PyExc_TypeError,
8408 "character mapping must return integer, bytes or None, not %.400s",
8409 x->ob_type->tp_name);
8410 Py_DECREF(x);
8411 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008412 }
8413}
8414
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008415static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00008416charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008417{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008418 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
8419 /* exponentially overallocate to minimize reallocations */
8420 if (requiredsize < 2*outsize)
8421 requiredsize = 2*outsize;
8422 if (_PyBytes_Resize(outobj, requiredsize))
8423 return -1;
8424 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008425}
8426
/* Outcome of trying to encode a single character through a charmap. */
typedef enum charmapencode_result {
    /* the character was encoded and written to the output */
    enc_SUCCESS,
    /* the mapping is undefined for the character; nothing was written */
    enc_FAILED,
    /* a lookup or a resize of the output failed */
    enc_EXCEPTION
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008430/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00008431 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008432 space is available. Return a new reference to the object that
8433 was put in the output buffer, or Py_None, if the mapping was undefined
8434 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00008435 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008436static charmapencode_result
Victor Stinner22168992011-11-20 17:09:18 +01008437charmapencode_output(Py_UCS4 c, PyObject *mapping,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008438 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008439{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008440 PyObject *rep;
8441 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00008442 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008443
Christian Heimes90aa7642007-12-19 02:45:37 +00008444 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008445 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008446 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008447 if (res == -1)
8448 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00008449 if (outsize<requiredsize)
8450 if (charmapencode_resize(outobj, outpos, requiredsize))
8451 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00008452 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008453 outstart[(*outpos)++] = (char)res;
8454 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008455 }
8456
8457 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008458 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008459 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008460 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008461 Py_DECREF(rep);
8462 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008463 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008464 if (PyLong_Check(rep)) {
8465 Py_ssize_t requiredsize = *outpos+1;
8466 if (outsize<requiredsize)
8467 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8468 Py_DECREF(rep);
8469 return enc_EXCEPTION;
8470 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008471 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008472 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008473 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008474 else {
8475 const char *repchars = PyBytes_AS_STRING(rep);
8476 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
8477 Py_ssize_t requiredsize = *outpos+repsize;
8478 if (outsize<requiredsize)
8479 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8480 Py_DECREF(rep);
8481 return enc_EXCEPTION;
8482 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008483 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008484 memcpy(outstart + *outpos, repchars, repsize);
8485 *outpos += repsize;
8486 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008487 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008488 Py_DECREF(rep);
8489 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008490}
8491
8492/* handle an error in PyUnicode_EncodeCharmap
8493 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008494static int
8495charmap_encoding_error(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008496 PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008497 PyObject **exceptionObject,
Victor Stinner50149202015-09-22 00:26:54 +02008498 _Py_error_handler *error_handler, PyObject **error_handler_obj, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00008499 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008500{
8501 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008502 Py_ssize_t size, repsize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008503 Py_ssize_t newpos;
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008504 enum PyUnicode_Kind kind;
8505 void *data;
8506 Py_ssize_t index;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008507 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008508 Py_ssize_t collstartpos = *inpos;
8509 Py_ssize_t collendpos = *inpos+1;
8510 Py_ssize_t collpos;
Serhiy Storchakae2f92de2017-11-11 13:06:26 +02008511 const char *encoding = "charmap";
8512 const char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008513 charmapencode_result x;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008514 Py_UCS4 ch;
Brian Curtin2787ea42011-11-02 15:09:37 -05008515 int val;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008516
Benjamin Petersonbac79492012-01-14 13:34:47 -05008517 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008518 return -1;
8519 size = PyUnicode_GET_LENGTH(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008520 /* find all unencodable characters */
8521 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008522 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00008523 if (Py_TYPE(mapping) == &EncodingMapType) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008524 ch = PyUnicode_READ_CHAR(unicode, collendpos);
Brian Curtin2787ea42011-11-02 15:09:37 -05008525 val = encoding_map_lookup(ch, mapping);
8526 if (val != -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008527 break;
8528 ++collendpos;
8529 continue;
8530 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008531
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008532 ch = PyUnicode_READ_CHAR(unicode, collendpos);
8533 rep = charmapencode_lookup(ch, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008534 if (rep==NULL)
8535 return -1;
8536 else if (rep!=Py_None) {
8537 Py_DECREF(rep);
8538 break;
8539 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008540 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00008541 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008542 }
8543 /* cache callback name lookup
8544 * (if not done yet, i.e. it's the first error) */
Victor Stinner50149202015-09-22 00:26:54 +02008545 if (*error_handler == _Py_ERROR_UNKNOWN)
Victor Stinner3d4226a2018-08-29 22:21:32 +02008546 *error_handler = _Py_GetErrorHandler(errors);
Victor Stinner50149202015-09-22 00:26:54 +02008547
8548 switch (*error_handler) {
8549 case _Py_ERROR_STRICT:
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008550 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008551 return -1;
Victor Stinner50149202015-09-22 00:26:54 +02008552
8553 case _Py_ERROR_REPLACE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008554 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008555 x = charmapencode_output('?', mapping, res, respos);
8556 if (x==enc_EXCEPTION) {
8557 return -1;
8558 }
8559 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008560 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008561 return -1;
8562 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008563 }
8564 /* fall through */
Victor Stinner50149202015-09-22 00:26:54 +02008565 case _Py_ERROR_IGNORE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008566 *inpos = collendpos;
8567 break;
Victor Stinner50149202015-09-22 00:26:54 +02008568
8569 case _Py_ERROR_XMLCHARREFREPLACE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008570 /* generate replacement (temporarily (mis)uses p) */
8571 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008572 char buffer[2+29+1+1];
8573 char *cp;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008574 sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
Benjamin Peterson29060642009-01-31 22:14:21 +00008575 for (cp = buffer; *cp; ++cp) {
8576 x = charmapencode_output(*cp, mapping, res, respos);
8577 if (x==enc_EXCEPTION)
8578 return -1;
8579 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008580 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008581 return -1;
8582 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008583 }
8584 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008585 *inpos = collendpos;
8586 break;
Victor Stinner50149202015-09-22 00:26:54 +02008587
Benjamin Peterson14339b62009-01-31 16:36:08 +00008588 default:
Victor Stinner50149202015-09-22 00:26:54 +02008589 repunicode = unicode_encode_call_errorhandler(errors, error_handler_obj,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008590 encoding, reason, unicode, exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00008591 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008592 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008593 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00008594 if (PyBytes_Check(repunicode)) {
8595 /* Directly copy bytes result to output. */
8596 Py_ssize_t outsize = PyBytes_Size(*res);
8597 Py_ssize_t requiredsize;
8598 repsize = PyBytes_Size(repunicode);
8599 requiredsize = *respos + repsize;
8600 if (requiredsize > outsize)
8601 /* Make room for all additional bytes. */
8602 if (charmapencode_resize(res, respos, requiredsize)) {
8603 Py_DECREF(repunicode);
8604 return -1;
8605 }
8606 memcpy(PyBytes_AsString(*res) + *respos,
8607 PyBytes_AsString(repunicode), repsize);
8608 *respos += repsize;
8609 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008610 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00008611 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008612 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008613 /* generate replacement */
Benjamin Petersonbac79492012-01-14 13:34:47 -05008614 if (PyUnicode_READY(repunicode) == -1) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008615 Py_DECREF(repunicode);
8616 return -1;
8617 }
Victor Stinner9e30aa52011-11-21 02:49:52 +01008618 repsize = PyUnicode_GET_LENGTH(repunicode);
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008619 data = PyUnicode_DATA(repunicode);
8620 kind = PyUnicode_KIND(repunicode);
8621 for (index = 0; index < repsize; index++) {
8622 Py_UCS4 repch = PyUnicode_READ(kind, data, index);
8623 x = charmapencode_output(repch, mapping, res, respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008624 if (x==enc_EXCEPTION) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008625 Py_DECREF(repunicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00008626 return -1;
8627 }
8628 else if (x==enc_FAILED) {
8629 Py_DECREF(repunicode);
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008630 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008631 return -1;
8632 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008633 }
8634 *inpos = newpos;
8635 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008636 }
8637 return 0;
8638}
8639
Alexander Belopolsky40018472011-02-26 01:02:56 +00008640PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008641_PyUnicode_EncodeCharmap(PyObject *unicode,
8642 PyObject *mapping,
8643 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008644{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008645 /* output object */
8646 PyObject *res = NULL;
8647 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008648 Py_ssize_t inpos = 0;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008649 Py_ssize_t size;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008650 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008651 Py_ssize_t respos = 0;
Victor Stinner50149202015-09-22 00:26:54 +02008652 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008653 PyObject *exc = NULL;
Victor Stinner50149202015-09-22 00:26:54 +02008654 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Victor Stinner69ed0f42013-04-09 21:48:24 +02008655 void *data;
8656 int kind;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008657
Benjamin Petersonbac79492012-01-14 13:34:47 -05008658 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008659 return NULL;
8660 size = PyUnicode_GET_LENGTH(unicode);
Victor Stinner69ed0f42013-04-09 21:48:24 +02008661 data = PyUnicode_DATA(unicode);
8662 kind = PyUnicode_KIND(unicode);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008663
Guido van Rossumd57fd912000-03-10 22:53:23 +00008664 /* Default to Latin-1 */
8665 if (mapping == NULL)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008666 return unicode_encode_ucs1(unicode, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008667
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008668 /* allocate enough for a simple encoding without
8669 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00008670 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008671 if (res == NULL)
8672 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00008673 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008674 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008675
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008676 while (inpos<size) {
Victor Stinner69ed0f42013-04-09 21:48:24 +02008677 Py_UCS4 ch = PyUnicode_READ(kind, data, inpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008678 /* try to encode it */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008679 charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008680 if (x==enc_EXCEPTION) /* error */
8681 goto onError;
8682 if (x==enc_FAILED) { /* unencodable character */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008683 if (charmap_encoding_error(unicode, &inpos, mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00008684 &exc,
Victor Stinner50149202015-09-22 00:26:54 +02008685 &error_handler, &error_handler_obj, errors,
Benjamin Peterson29060642009-01-31 22:14:21 +00008686 &res, &respos)) {
8687 goto onError;
8688 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008689 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008690 else
8691 /* done with this character => adjust input position */
8692 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008693 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008694
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008695 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00008696 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00008697 if (_PyBytes_Resize(&res, respos) < 0)
8698 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00008699
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008700 Py_XDECREF(exc);
Victor Stinner50149202015-09-22 00:26:54 +02008701 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008702 return res;
8703
Benjamin Peterson29060642009-01-31 22:14:21 +00008704 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008705 Py_XDECREF(res);
8706 Py_XDECREF(exc);
Victor Stinner50149202015-09-22 00:26:54 +02008707 Py_XDECREF(error_handler_obj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008708 return NULL;
8709}
8710
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008711/* Deprecated */
8712PyObject *
8713PyUnicode_EncodeCharmap(const Py_UNICODE *p,
8714 Py_ssize_t size,
8715 PyObject *mapping,
8716 const char *errors)
8717{
8718 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02008719 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008720 if (unicode == NULL)
8721 return NULL;
8722 result = _PyUnicode_EncodeCharmap(unicode, mapping, errors);
8723 Py_DECREF(unicode);
Victor Stinnerfc026c92011-11-04 00:24:51 +01008724 return result;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008725}
8726
Alexander Belopolsky40018472011-02-26 01:02:56 +00008727PyObject *
8728PyUnicode_AsCharmapString(PyObject *unicode,
8729 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008730{
8731 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008732 PyErr_BadArgument();
8733 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008734 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008735 return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008736}
8737
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008738/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008739static void
8740make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008741 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008742 Py_ssize_t startpos, Py_ssize_t endpos,
8743 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008744{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008745 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008746 *exceptionObject = _PyUnicodeTranslateError_Create(
8747 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008748 }
8749 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008750 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
8751 goto onError;
8752 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
8753 goto onError;
8754 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
8755 goto onError;
8756 return;
8757 onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02008758 Py_CLEAR(*exceptionObject);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008759 }
8760}
8761
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008762/* error handling callback helper:
8763 build arguments, call the callback and check the arguments,
8764 put the result into newpos and return the replacement string, which
8765 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008766static PyObject *
8767unicode_translate_call_errorhandler(const char *errors,
8768 PyObject **errorHandler,
8769 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008770 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008771 Py_ssize_t startpos, Py_ssize_t endpos,
8772 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008773{
Serhiy Storchakaf8d7d412016-10-23 15:12:25 +03008774 static const char *argparse = "Un;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008775
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008776 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008777 PyObject *restuple;
8778 PyObject *resunicode;
8779
8780 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008781 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008782 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008783 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008784 }
8785
8786 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008787 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008788 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008789 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008790
Jeroen Demeyer196a5302019-07-04 12:31:34 +02008791 restuple = _PyObject_CallOneArg(*errorHandler, *exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008792 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008793 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008794 if (!PyTuple_Check(restuple)) {
Serhiy Storchakaf8d7d412016-10-23 15:12:25 +03008795 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00008796 Py_DECREF(restuple);
8797 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008798 }
Serhiy Storchakaf8d7d412016-10-23 15:12:25 +03008799 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00008800 &resunicode, &i_newpos)) {
8801 Py_DECREF(restuple);
8802 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008803 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00008804 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008805 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008806 else
8807 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008808 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnera33bce02014-07-04 22:47:46 +02008809 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008810 Py_DECREF(restuple);
8811 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00008812 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008813 Py_INCREF(resunicode);
8814 Py_DECREF(restuple);
8815 return resunicode;
8816}
8817
8818/* Lookup the character ch in the mapping and put the result in result,
8819 which must be decrefed by the caller.
8820 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008821static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008822charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008823{
Christian Heimes217cfd12007-12-02 14:31:20 +00008824 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008825 PyObject *x;
8826
8827 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008828 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008829 x = PyObject_GetItem(mapping, w);
8830 Py_DECREF(w);
8831 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008832 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8833 /* No mapping found means: use 1:1 mapping. */
8834 PyErr_Clear();
8835 *result = NULL;
8836 return 0;
8837 } else
8838 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008839 }
8840 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008841 *result = x;
8842 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008843 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008844 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008845 long value = PyLong_AS_LONG(x);
Victor Stinner4ff33af2014-04-05 11:56:37 +02008846 if (value < 0 || value > MAX_UNICODE) {
8847 PyErr_Format(PyExc_ValueError,
8848 "character mapping must be in range(0x%x)",
8849 MAX_UNICODE+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00008850 Py_DECREF(x);
8851 return -1;
8852 }
8853 *result = x;
8854 return 0;
8855 }
8856 else if (PyUnicode_Check(x)) {
8857 *result = x;
8858 return 0;
8859 }
8860 else {
8861 /* wrong return value */
8862 PyErr_SetString(PyExc_TypeError,
8863 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00008864 Py_DECREF(x);
8865 return -1;
8866 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008867}
Victor Stinner1194ea02014-04-04 19:37:40 +02008868
8869/* lookup the character, write the result into the writer.
8870 Return 1 if the result was written into the writer, return 0 if the mapping
8871 was undefined, raise an exception return -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008872static int
Victor Stinner1194ea02014-04-04 19:37:40 +02008873charmaptranslate_output(Py_UCS4 ch, PyObject *mapping,
8874 _PyUnicodeWriter *writer)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008875{
Victor Stinner1194ea02014-04-04 19:37:40 +02008876 PyObject *item;
8877
8878 if (charmaptranslate_lookup(ch, mapping, &item))
Benjamin Peterson29060642009-01-31 22:14:21 +00008879 return -1;
Victor Stinner1194ea02014-04-04 19:37:40 +02008880
8881 if (item == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008882 /* not found => default to 1:1 mapping */
Victor Stinner1194ea02014-04-04 19:37:40 +02008883 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008884 return -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00008885 }
Victor Stinner1194ea02014-04-04 19:37:40 +02008886 return 1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008887 }
Victor Stinner1194ea02014-04-04 19:37:40 +02008888
8889 if (item == Py_None) {
8890 Py_DECREF(item);
8891 return 0;
8892 }
8893
8894 if (PyLong_Check(item)) {
Victor Stinner4ff33af2014-04-05 11:56:37 +02008895 long ch = (Py_UCS4)PyLong_AS_LONG(item);
8896 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
8897 used it */
Victor Stinner1194ea02014-04-04 19:37:40 +02008898 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
8899 Py_DECREF(item);
8900 return -1;
8901 }
8902 Py_DECREF(item);
8903 return 1;
8904 }
8905
8906 if (!PyUnicode_Check(item)) {
8907 Py_DECREF(item);
Benjamin Peterson29060642009-01-31 22:14:21 +00008908 return -1;
Victor Stinner1194ea02014-04-04 19:37:40 +02008909 }
8910
8911 if (_PyUnicodeWriter_WriteStr(writer, item) < 0) {
8912 Py_DECREF(item);
8913 return -1;
8914 }
8915
8916 Py_DECREF(item);
8917 return 1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008918}
8919
Victor Stinner89a76ab2014-04-05 11:44:04 +02008920static int
8921unicode_fast_translate_lookup(PyObject *mapping, Py_UCS1 ch,
8922 Py_UCS1 *translate)
8923{
Benjamin Peterson1365de72014-04-07 20:15:41 -04008924 PyObject *item = NULL;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008925 int ret = 0;
8926
Victor Stinner89a76ab2014-04-05 11:44:04 +02008927 if (charmaptranslate_lookup(ch, mapping, &item)) {
8928 return -1;
8929 }
8930
8931 if (item == Py_None) {
Benjamin Peterson1365de72014-04-07 20:15:41 -04008932 /* deletion */
Victor Stinner872b2912014-04-05 14:27:07 +02008933 translate[ch] = 0xfe;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008934 }
Benjamin Peterson1365de72014-04-07 20:15:41 -04008935 else if (item == NULL) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02008936 /* not found => default to 1:1 mapping */
8937 translate[ch] = ch;
8938 return 1;
8939 }
Benjamin Peterson1365de72014-04-07 20:15:41 -04008940 else if (PyLong_Check(item)) {
Victor Stinner4dd25252014-04-08 09:14:21 +02008941 long replace = PyLong_AS_LONG(item);
Victor Stinner4ff33af2014-04-05 11:56:37 +02008942 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
8943 used it */
8944 if (127 < replace) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02008945 /* invalid character or character outside ASCII:
8946 skip the fast translate */
8947 goto exit;
8948 }
8949 translate[ch] = (Py_UCS1)replace;
8950 }
8951 else if (PyUnicode_Check(item)) {
8952 Py_UCS4 replace;
8953
8954 if (PyUnicode_READY(item) == -1) {
8955 Py_DECREF(item);
8956 return -1;
8957 }
8958 if (PyUnicode_GET_LENGTH(item) != 1)
8959 goto exit;
8960
8961 replace = PyUnicode_READ_CHAR(item, 0);
8962 if (replace > 127)
8963 goto exit;
8964 translate[ch] = (Py_UCS1)replace;
8965 }
8966 else {
Benjamin Peterson1365de72014-04-07 20:15:41 -04008967 /* not None, NULL, long or unicode */
Victor Stinner89a76ab2014-04-05 11:44:04 +02008968 goto exit;
8969 }
Victor Stinner89a76ab2014-04-05 11:44:04 +02008970 ret = 1;
8971
Benjamin Peterson1365de72014-04-07 20:15:41 -04008972 exit:
8973 Py_DECREF(item);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008974 return ret;
8975}
8976
8977/* Fast path for ascii => ascii translation. Return 1 if the whole string
8978 was translated into writer, return 0 if the input string was partially
8979 translated into writer, raise an exception and return -1 on error. */
8980static int
8981unicode_fast_translate(PyObject *input, PyObject *mapping,
Victor Stinner6c9aa8f2016-03-01 21:30:30 +01008982 _PyUnicodeWriter *writer, int ignore,
8983 Py_ssize_t *input_pos)
Victor Stinner89a76ab2014-04-05 11:44:04 +02008984{
Victor Stinner872b2912014-04-05 14:27:07 +02008985 Py_UCS1 ascii_table[128], ch, ch2;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008986 Py_ssize_t len;
8987 Py_UCS1 *in, *end, *out;
Victor Stinner872b2912014-04-05 14:27:07 +02008988 int res = 0;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008989
Victor Stinner89a76ab2014-04-05 11:44:04 +02008990 len = PyUnicode_GET_LENGTH(input);
8991
Victor Stinner872b2912014-04-05 14:27:07 +02008992 memset(ascii_table, 0xff, 128);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008993
8994 in = PyUnicode_1BYTE_DATA(input);
8995 end = in + len;
8996
8997 assert(PyUnicode_IS_ASCII(writer->buffer));
8998 assert(PyUnicode_GET_LENGTH(writer->buffer) == len);
8999 out = PyUnicode_1BYTE_DATA(writer->buffer);
9000
Victor Stinner872b2912014-04-05 14:27:07 +02009001 for (; in < end; in++) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02009002 ch = *in;
Victor Stinner872b2912014-04-05 14:27:07 +02009003 ch2 = ascii_table[ch];
Victor Stinner89a76ab2014-04-05 11:44:04 +02009004 if (ch2 == 0xff) {
Victor Stinner872b2912014-04-05 14:27:07 +02009005 int translate = unicode_fast_translate_lookup(mapping, ch,
9006 ascii_table);
9007 if (translate < 0)
Victor Stinner89a76ab2014-04-05 11:44:04 +02009008 return -1;
Victor Stinner872b2912014-04-05 14:27:07 +02009009 if (translate == 0)
9010 goto exit;
9011 ch2 = ascii_table[ch];
Victor Stinner89a76ab2014-04-05 11:44:04 +02009012 }
Victor Stinner872b2912014-04-05 14:27:07 +02009013 if (ch2 == 0xfe) {
9014 if (ignore)
9015 continue;
9016 goto exit;
9017 }
9018 assert(ch2 < 128);
Victor Stinner89a76ab2014-04-05 11:44:04 +02009019 *out = ch2;
Victor Stinner872b2912014-04-05 14:27:07 +02009020 out++;
Victor Stinner89a76ab2014-04-05 11:44:04 +02009021 }
Victor Stinner872b2912014-04-05 14:27:07 +02009022 res = 1;
9023
9024exit:
9025 writer->pos = out - PyUnicode_1BYTE_DATA(writer->buffer);
Victor Stinner6c9aa8f2016-03-01 21:30:30 +01009026 *input_pos = in - PyUnicode_1BYTE_DATA(input);
Victor Stinner872b2912014-04-05 14:27:07 +02009027 return res;
Victor Stinner89a76ab2014-04-05 11:44:04 +02009028}
9029
Victor Stinner3222da22015-10-01 22:07:32 +02009030static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009031_PyUnicode_TranslateCharmap(PyObject *input,
9032 PyObject *mapping,
9033 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009034{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009035 /* input object */
Victor Stinner1194ea02014-04-04 19:37:40 +02009036 char *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009037 Py_ssize_t size, i;
9038 int kind;
9039 /* output buffer */
Victor Stinner1194ea02014-04-04 19:37:40 +02009040 _PyUnicodeWriter writer;
9041 /* error handler */
Serhiy Storchakae2f92de2017-11-11 13:06:26 +02009042 const char *reason = "character maps to <undefined>";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009043 PyObject *errorHandler = NULL;
9044 PyObject *exc = NULL;
Victor Stinner1194ea02014-04-04 19:37:40 +02009045 int ignore;
Victor Stinner89a76ab2014-04-05 11:44:04 +02009046 int res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009047
Guido van Rossumd57fd912000-03-10 22:53:23 +00009048 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009049 PyErr_BadArgument();
9050 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009051 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009052
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009053 if (PyUnicode_READY(input) == -1)
9054 return NULL;
Victor Stinner1194ea02014-04-04 19:37:40 +02009055 data = (char*)PyUnicode_DATA(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009056 kind = PyUnicode_KIND(input);
9057 size = PyUnicode_GET_LENGTH(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009058
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009059 if (size == 0)
9060 return PyUnicode_FromObject(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009061
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009062 /* allocate enough for a simple 1:1 translation without
9063 replacements, if we need more, we'll resize */
Victor Stinner1194ea02014-04-04 19:37:40 +02009064 _PyUnicodeWriter_Init(&writer);
9065 if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00009066 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009067
Victor Stinner872b2912014-04-05 14:27:07 +02009068 ignore = (errors != NULL && strcmp(errors, "ignore") == 0);
9069
Victor Stinner33798672016-03-01 21:59:58 +01009070 if (PyUnicode_READY(input) == -1)
Victor Stinner89a76ab2014-04-05 11:44:04 +02009071 return NULL;
Victor Stinner33798672016-03-01 21:59:58 +01009072 if (PyUnicode_IS_ASCII(input)) {
9073 res = unicode_fast_translate(input, mapping, &writer, ignore, &i);
9074 if (res < 0) {
9075 _PyUnicodeWriter_Dealloc(&writer);
9076 return NULL;
9077 }
9078 if (res == 1)
9079 return _PyUnicodeWriter_Finish(&writer);
Victor Stinner89a76ab2014-04-05 11:44:04 +02009080 }
Victor Stinner33798672016-03-01 21:59:58 +01009081 else {
9082 i = 0;
9083 }
Victor Stinner89a76ab2014-04-05 11:44:04 +02009084
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009085 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009086 /* try to encode it */
Victor Stinner1194ea02014-04-04 19:37:40 +02009087 int translate;
9088 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
9089 Py_ssize_t newpos;
9090 /* startpos for collecting untranslatable chars */
9091 Py_ssize_t collstart;
9092 Py_ssize_t collend;
Victor Stinner1194ea02014-04-04 19:37:40 +02009093 Py_UCS4 ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009094
Victor Stinner1194ea02014-04-04 19:37:40 +02009095 ch = PyUnicode_READ(kind, data, i);
9096 translate = charmaptranslate_output(ch, mapping, &writer);
9097 if (translate < 0)
9098 goto onError;
9099
9100 if (translate != 0) {
9101 /* it worked => adjust input pointer */
9102 ++i;
9103 continue;
9104 }
9105
9106 /* untranslatable character */
9107 collstart = i;
9108 collend = i+1;
9109
9110 /* find all untranslatable characters */
9111 while (collend < size) {
9112 PyObject *x;
9113 ch = PyUnicode_READ(kind, data, collend);
9114 if (charmaptranslate_lookup(ch, mapping, &x))
Benjamin Peterson14339b62009-01-31 16:36:08 +00009115 goto onError;
Victor Stinner1194ea02014-04-04 19:37:40 +02009116 Py_XDECREF(x);
9117 if (x != Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00009118 break;
Victor Stinner1194ea02014-04-04 19:37:40 +02009119 ++collend;
9120 }
9121
9122 if (ignore) {
9123 i = collend;
9124 }
9125 else {
9126 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
9127 reason, input, &exc,
9128 collstart, collend, &newpos);
9129 if (repunicode == NULL)
9130 goto onError;
9131 if (_PyUnicodeWriter_WriteStr(&writer, repunicode) < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009132 Py_DECREF(repunicode);
Victor Stinner1194ea02014-04-04 19:37:40 +02009133 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009134 }
Victor Stinner1194ea02014-04-04 19:37:40 +02009135 Py_DECREF(repunicode);
9136 i = newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009137 }
9138 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009139 Py_XDECREF(exc);
9140 Py_XDECREF(errorHandler);
Victor Stinner1194ea02014-04-04 19:37:40 +02009141 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009142
Benjamin Peterson29060642009-01-31 22:14:21 +00009143 onError:
Victor Stinner1194ea02014-04-04 19:37:40 +02009144 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009145 Py_XDECREF(exc);
9146 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009147 return NULL;
9148}
9149
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009150/* Deprecated. Use PyUnicode_Translate instead. */
9151PyObject *
9152PyUnicode_TranslateCharmap(const Py_UNICODE *p,
9153 Py_ssize_t size,
9154 PyObject *mapping,
9155 const char *errors)
9156{
Christian Heimes5f520f42012-09-11 14:03:25 +02009157 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02009158 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009159 if (!unicode)
9160 return NULL;
Christian Heimes5f520f42012-09-11 14:03:25 +02009161 result = _PyUnicode_TranslateCharmap(unicode, mapping, errors);
9162 Py_DECREF(unicode);
9163 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009164}
9165
Alexander Belopolsky40018472011-02-26 01:02:56 +00009166PyObject *
9167PyUnicode_Translate(PyObject *str,
9168 PyObject *mapping,
9169 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009170{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009171 if (ensure_unicode(str) < 0)
Christian Heimes5f520f42012-09-11 14:03:25 +02009172 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009173 return _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009174}
Tim Petersced69f82003-09-16 20:30:58 +00009175
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009176PyObject *
9177_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
9178{
9179 if (!PyUnicode_Check(unicode)) {
9180 PyErr_BadInternalCall();
9181 return NULL;
9182 }
9183 if (PyUnicode_READY(unicode) == -1)
9184 return NULL;
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009185 if (PyUnicode_IS_ASCII(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009186 /* If the string is already ASCII, just return the same string */
9187 Py_INCREF(unicode);
9188 return unicode;
9189 }
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009190
9191 Py_ssize_t len = PyUnicode_GET_LENGTH(unicode);
9192 PyObject *result = PyUnicode_New(len, 127);
9193 if (result == NULL) {
9194 return NULL;
9195 }
9196
9197 Py_UCS1 *out = PyUnicode_1BYTE_DATA(result);
9198 int kind = PyUnicode_KIND(unicode);
9199 const void *data = PyUnicode_DATA(unicode);
9200 Py_ssize_t i;
9201 for (i = 0; i < len; ++i) {
9202 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9203 if (ch < 127) {
9204 out[i] = ch;
9205 }
9206 else if (Py_UNICODE_ISSPACE(ch)) {
9207 out[i] = ' ';
9208 }
9209 else {
9210 int decimal = Py_UNICODE_TODECIMAL(ch);
9211 if (decimal < 0) {
9212 out[i] = '?';
INADA Naoki16dfca42018-07-14 12:06:43 +09009213 out[i+1] = '\0';
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009214 _PyUnicode_LENGTH(result) = i + 1;
9215 break;
9216 }
9217 out[i] = '0' + decimal;
9218 }
9219 }
9220
INADA Naoki16dfca42018-07-14 12:06:43 +09009221 assert(_PyUnicode_CheckConsistency(result, 1));
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009222 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009223}
9224
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009225PyObject *
9226PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
9227 Py_ssize_t length)
9228{
Victor Stinnerf0124502011-11-21 23:12:56 +01009229 PyObject *decimal;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009230 Py_ssize_t i;
Victor Stinnerf0124502011-11-21 23:12:56 +01009231 Py_UCS4 maxchar;
9232 enum PyUnicode_Kind kind;
9233 void *data;
9234
Victor Stinner99d7ad02012-02-22 13:37:39 +01009235 maxchar = 127;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009236 for (i = 0; i < length; i++) {
Victor Stinner12174a52014-08-15 23:17:38 +02009237 Py_UCS4 ch = s[i];
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009238 if (ch > 127) {
9239 int decimal = Py_UNICODE_TODECIMAL(ch);
9240 if (decimal >= 0)
Victor Stinnerf0124502011-11-21 23:12:56 +01009241 ch = '0' + decimal;
Benjamin Peterson7e303732013-06-10 09:19:46 -07009242 maxchar = Py_MAX(maxchar, ch);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009243 }
9244 }
Victor Stinnerf0124502011-11-21 23:12:56 +01009245
9246 /* Copy to a new string */
9247 decimal = PyUnicode_New(length, maxchar);
9248 if (decimal == NULL)
9249 return decimal;
9250 kind = PyUnicode_KIND(decimal);
9251 data = PyUnicode_DATA(decimal);
9252 /* Iterate over code points */
9253 for (i = 0; i < length; i++) {
Victor Stinner12174a52014-08-15 23:17:38 +02009254 Py_UCS4 ch = s[i];
Victor Stinnerf0124502011-11-21 23:12:56 +01009255 if (ch > 127) {
9256 int decimal = Py_UNICODE_TODECIMAL(ch);
9257 if (decimal >= 0)
9258 ch = '0' + decimal;
9259 }
9260 PyUnicode_WRITE(kind, data, i, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009261 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01009262 return unicode_result(decimal);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009263}
Guido van Rossum9e896b32000-04-05 20:11:21 +00009264/* --- Decimal Encoder ---------------------------------------------------- */
9265
Alexander Belopolsky40018472011-02-26 01:02:56 +00009266int
9267PyUnicode_EncodeDecimal(Py_UNICODE *s,
9268 Py_ssize_t length,
9269 char *output,
9270 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00009271{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01009272 PyObject *unicode;
Victor Stinner6345be92011-11-25 20:09:01 +01009273 Py_ssize_t i;
Victor Stinner42bf7752011-11-21 22:52:58 +01009274 enum PyUnicode_Kind kind;
9275 void *data;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009276
9277 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009278 PyErr_BadArgument();
9279 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009280 }
9281
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02009282 unicode = PyUnicode_FromWideChar(s, length);
Victor Stinner42bf7752011-11-21 22:52:58 +01009283 if (unicode == NULL)
9284 return -1;
9285
Victor Stinner42bf7752011-11-21 22:52:58 +01009286 kind = PyUnicode_KIND(unicode);
9287 data = PyUnicode_DATA(unicode);
9288
Victor Stinnerb84d7232011-11-22 01:50:07 +01009289 for (i=0; i < length; ) {
Victor Stinner6345be92011-11-25 20:09:01 +01009290 PyObject *exc;
9291 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00009292 int decimal;
Victor Stinner6345be92011-11-25 20:09:01 +01009293 Py_ssize_t startpos;
9294
9295 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +00009296
Benjamin Peterson29060642009-01-31 22:14:21 +00009297 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009298 *output++ = ' ';
Victor Stinnerb84d7232011-11-22 01:50:07 +01009299 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009300 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009301 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009302 decimal = Py_UNICODE_TODECIMAL(ch);
9303 if (decimal >= 0) {
9304 *output++ = '0' + decimal;
Victor Stinnerb84d7232011-11-22 01:50:07 +01009305 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009306 continue;
9307 }
9308 if (0 < ch && ch < 256) {
9309 *output++ = (char)ch;
Victor Stinnerb84d7232011-11-22 01:50:07 +01009310 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009311 continue;
9312 }
Victor Stinner6345be92011-11-25 20:09:01 +01009313
Victor Stinner42bf7752011-11-21 22:52:58 +01009314 startpos = i;
Victor Stinner6345be92011-11-25 20:09:01 +01009315 exc = NULL;
9316 raise_encode_exception(&exc, "decimal", unicode,
9317 startpos, startpos+1,
9318 "invalid decimal Unicode string");
9319 Py_XDECREF(exc);
9320 Py_DECREF(unicode);
9321 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009322 }
9323 /* 0-terminate the output string */
9324 *output++ = '\0';
Victor Stinner42bf7752011-11-21 22:52:58 +01009325 Py_DECREF(unicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00009326 return 0;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009327}
9328
Guido van Rossumd57fd912000-03-10 22:53:23 +00009329/* --- Helpers ------------------------------------------------------------ */
9330
/* helper macro to fixup start/end slice values

   Clamp `end` to `len`, and add `len` to a negative `start` or `end`
   (flooring the result at 0), the way slice indices are normalized.
   `start` and `end` must be modifiable lvalues; every argument may be
   evaluated more than once, so pass expressions without side effects.

   The body is wrapped in do { } while (0) so that the macro expands to a
   single statement and can be used safely in an unbraced if/else; it must
   be followed by a semicolon. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len)) {                    \
            (end) = (len);                      \
        }                                       \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0) {                    \
                (end) = 0;                      \
            }                                   \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0) {                  \
                (start) = 0;                    \
            }                                   \
        }                                       \
    } while (0)
9345
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009346static Py_ssize_t
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009347any_find_slice(PyObject* s1, PyObject* s2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009348 Py_ssize_t start,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009349 Py_ssize_t end,
9350 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009351{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009352 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009353 void *buf1, *buf2;
9354 Py_ssize_t len1, len2, result;
9355
9356 kind1 = PyUnicode_KIND(s1);
9357 kind2 = PyUnicode_KIND(s2);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009358 if (kind1 < kind2)
9359 return -1;
9360
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009361 len1 = PyUnicode_GET_LENGTH(s1);
9362 len2 = PyUnicode_GET_LENGTH(s2);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009363 ADJUST_INDICES(start, end, len1);
9364 if (end - start < len2)
9365 return -1;
9366
9367 buf1 = PyUnicode_DATA(s1);
9368 buf2 = PyUnicode_DATA(s2);
9369 if (len2 == 1) {
9370 Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
9371 result = findchar((const char *)buf1 + kind1*start,
9372 kind1, end - start, ch, direction);
9373 if (result == -1)
9374 return -1;
9375 else
9376 return start + result;
9377 }
9378
9379 if (kind2 != kind1) {
9380 buf2 = _PyUnicode_AsKind(s2, kind1);
9381 if (!buf2)
9382 return -2;
9383 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009384
Victor Stinner794d5672011-10-10 03:21:36 +02009385 if (direction > 0) {
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009386 switch (kind1) {
Victor Stinner794d5672011-10-10 03:21:36 +02009387 case PyUnicode_1BYTE_KIND:
9388 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9389 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
9390 else
9391 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
9392 break;
9393 case PyUnicode_2BYTE_KIND:
9394 result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
9395 break;
9396 case PyUnicode_4BYTE_KIND:
9397 result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
9398 break;
9399 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07009400 Py_UNREACHABLE();
Victor Stinner794d5672011-10-10 03:21:36 +02009401 }
9402 }
9403 else {
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009404 switch (kind1) {
Victor Stinner794d5672011-10-10 03:21:36 +02009405 case PyUnicode_1BYTE_KIND:
9406 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9407 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
9408 else
9409 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9410 break;
9411 case PyUnicode_2BYTE_KIND:
9412 result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9413 break;
9414 case PyUnicode_4BYTE_KIND:
9415 result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9416 break;
9417 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07009418 Py_UNREACHABLE();
Victor Stinner794d5672011-10-10 03:21:36 +02009419 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009420 }
9421
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009422 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009423 PyMem_Free(buf2);
9424
9425 return result;
9426}
9427
Victor Stinner59423e32018-11-26 13:40:01 +01009428/* _PyUnicode_InsertThousandsGrouping() helper functions */
9429#include "stringlib/localeutil.h"
9430
9431/**
9432 * InsertThousandsGrouping:
9433 * @writer: Unicode writer.
9434 * @n_buffer: Number of characters in @buffer.
9435 * @digits: Digits we're reading from. If count is non-NULL, this is unused.
9436 * @d_pos: Start of digits string.
9437 * @n_digits: The number of digits in the string, in which we want
9438 * to put the grouping chars.
9439 * @min_width: The minimum width of the digits in the output string.
9440 * Output will be zero-padded on the left to fill.
9441 * @grouping: see definition in localeconv().
9442 * @thousands_sep: see definition in localeconv().
9443 *
9444 * There are 2 modes: counting and filling. If @writer is NULL,
9445 * we are in counting mode, else filling mode.
9446 * If counting, the required buffer size is returned.
9447 * If filling, we know the buffer will be large enough, so we don't
9448 * need to pass in the buffer size.
9449 * Inserts thousand grouping characters (as defined by grouping and
9450 * thousands_sep) into @writer.
9451 *
9452 * Return value: -1 on error, number of characters otherwise.
9453 **/
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009454Py_ssize_t
Victor Stinner41a863c2012-02-24 00:37:51 +01009455_PyUnicode_InsertThousandsGrouping(
Victor Stinner59423e32018-11-26 13:40:01 +01009456 _PyUnicodeWriter *writer,
Victor Stinner41a863c2012-02-24 00:37:51 +01009457 Py_ssize_t n_buffer,
Victor Stinner59423e32018-11-26 13:40:01 +01009458 PyObject *digits,
9459 Py_ssize_t d_pos,
9460 Py_ssize_t n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009461 Py_ssize_t min_width,
Victor Stinner59423e32018-11-26 13:40:01 +01009462 const char *grouping,
9463 PyObject *thousands_sep,
Victor Stinner41a863c2012-02-24 00:37:51 +01009464 Py_UCS4 *maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009465{
Xtreak3f7983a2019-01-07 20:39:14 +05309466 min_width = Py_MAX(0, min_width);
Victor Stinner59423e32018-11-26 13:40:01 +01009467 if (writer) {
9468 assert(digits != NULL);
9469 assert(maxchar == NULL);
Victor Stinner41a863c2012-02-24 00:37:51 +01009470 }
9471 else {
Victor Stinner59423e32018-11-26 13:40:01 +01009472 assert(digits == NULL);
9473 assert(maxchar != NULL);
Victor Stinner41a863c2012-02-24 00:37:51 +01009474 }
Victor Stinner59423e32018-11-26 13:40:01 +01009475 assert(0 <= d_pos);
9476 assert(0 <= n_digits);
Victor Stinner59423e32018-11-26 13:40:01 +01009477 assert(grouping != NULL);
9478
9479 if (digits != NULL) {
9480 if (PyUnicode_READY(digits) == -1) {
9481 return -1;
Victor Stinner90f50d42012-02-24 01:44:47 +01009482 }
Victor Stinner59423e32018-11-26 13:40:01 +01009483 }
9484 if (PyUnicode_READY(thousands_sep) == -1) {
9485 return -1;
Victor Stinner41a863c2012-02-24 00:37:51 +01009486 }
9487
Victor Stinner59423e32018-11-26 13:40:01 +01009488 Py_ssize_t count = 0;
9489 Py_ssize_t n_zeros;
9490 int loop_broken = 0;
9491 int use_separator = 0; /* First time through, don't append the
9492 separator. They only go between
9493 groups. */
9494 Py_ssize_t buffer_pos;
9495 Py_ssize_t digits_pos;
9496 Py_ssize_t len;
9497 Py_ssize_t n_chars;
9498 Py_ssize_t remaining = n_digits; /* Number of chars remaining to
9499 be looked at */
9500 /* A generator that returns all of the grouping widths, until it
9501 returns 0. */
9502 GroupGenerator groupgen;
9503 GroupGenerator_init(&groupgen, grouping);
9504 const Py_ssize_t thousands_sep_len = PyUnicode_GET_LENGTH(thousands_sep);
9505
9506 /* if digits are not grouped, thousands separator
9507 should be an empty string */
9508 assert(!(grouping[0] == CHAR_MAX && thousands_sep_len != 0));
9509
9510 digits_pos = d_pos + n_digits;
9511 if (writer) {
9512 buffer_pos = writer->pos + n_buffer;
9513 assert(buffer_pos <= PyUnicode_GET_LENGTH(writer->buffer));
9514 assert(digits_pos <= PyUnicode_GET_LENGTH(digits));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009515 }
Victor Stinner59423e32018-11-26 13:40:01 +01009516 else {
9517 buffer_pos = n_buffer;
Victor Stinner90f50d42012-02-24 01:44:47 +01009518 }
Victor Stinner59423e32018-11-26 13:40:01 +01009519
9520 if (!writer) {
Victor Stinner41a863c2012-02-24 00:37:51 +01009521 *maxchar = 127;
Victor Stinner41a863c2012-02-24 00:37:51 +01009522 }
Victor Stinner59423e32018-11-26 13:40:01 +01009523
9524 while ((len = GroupGenerator_next(&groupgen)) > 0) {
9525 len = Py_MIN(len, Py_MAX(Py_MAX(remaining, min_width), 1));
9526 n_zeros = Py_MAX(0, len - remaining);
9527 n_chars = Py_MAX(0, Py_MIN(remaining, len));
9528
9529 /* Use n_zero zero's and n_chars chars */
9530
9531 /* Count only, don't do anything. */
9532 count += (use_separator ? thousands_sep_len : 0) + n_zeros + n_chars;
9533
9534 /* Copy into the writer. */
9535 InsertThousandsGrouping_fill(writer, &buffer_pos,
9536 digits, &digits_pos,
9537 n_chars, n_zeros,
9538 use_separator ? thousands_sep : NULL,
9539 thousands_sep_len, maxchar);
9540
9541 /* Use a separator next time. */
9542 use_separator = 1;
9543
9544 remaining -= n_chars;
9545 min_width -= len;
9546
9547 if (remaining <= 0 && min_width <= 0) {
9548 loop_broken = 1;
9549 break;
9550 }
9551 min_width -= thousands_sep_len;
9552 }
9553 if (!loop_broken) {
9554 /* We left the loop without using a break statement. */
9555
9556 len = Py_MAX(Py_MAX(remaining, min_width), 1);
9557 n_zeros = Py_MAX(0, len - remaining);
9558 n_chars = Py_MAX(0, Py_MIN(remaining, len));
9559
9560 /* Use n_zero zero's and n_chars chars */
9561 count += (use_separator ? thousands_sep_len : 0) + n_zeros + n_chars;
9562
9563 /* Copy into the writer. */
9564 InsertThousandsGrouping_fill(writer, &buffer_pos,
9565 digits, &digits_pos,
9566 n_chars, n_zeros,
9567 use_separator ? thousands_sep : NULL,
9568 thousands_sep_len, maxchar);
9569 }
9570 return count;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009571}
9572
9573
Alexander Belopolsky40018472011-02-26 01:02:56 +00009574Py_ssize_t
9575PyUnicode_Count(PyObject *str,
9576 PyObject *substr,
9577 Py_ssize_t start,
9578 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009579{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009580 Py_ssize_t result;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009581 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009582 void *buf1 = NULL, *buf2 = NULL;
9583 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00009584
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009585 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009586 return -1;
Tim Petersced69f82003-09-16 20:30:58 +00009587
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009588 kind1 = PyUnicode_KIND(str);
9589 kind2 = PyUnicode_KIND(substr);
9590 if (kind1 < kind2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009591 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009592
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009593 len1 = PyUnicode_GET_LENGTH(str);
9594 len2 = PyUnicode_GET_LENGTH(substr);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009595 ADJUST_INDICES(start, end, len1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009596 if (end - start < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009597 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009598
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009599 buf1 = PyUnicode_DATA(str);
9600 buf2 = PyUnicode_DATA(substr);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009601 if (kind2 != kind1) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009602 buf2 = _PyUnicode_AsKind(substr, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009603 if (!buf2)
9604 goto onError;
9605 }
9606
9607 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009608 case PyUnicode_1BYTE_KIND:
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009609 if (PyUnicode_IS_ASCII(str) && PyUnicode_IS_ASCII(substr))
Victor Stinnerc3cec782011-10-05 21:24:08 +02009610 result = asciilib_count(
9611 ((Py_UCS1*)buf1) + start, end - start,
9612 buf2, len2, PY_SSIZE_T_MAX
9613 );
9614 else
9615 result = ucs1lib_count(
9616 ((Py_UCS1*)buf1) + start, end - start,
9617 buf2, len2, PY_SSIZE_T_MAX
9618 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009619 break;
9620 case PyUnicode_2BYTE_KIND:
9621 result = ucs2lib_count(
9622 ((Py_UCS2*)buf1) + start, end - start,
9623 buf2, len2, PY_SSIZE_T_MAX
9624 );
9625 break;
9626 case PyUnicode_4BYTE_KIND:
9627 result = ucs4lib_count(
9628 ((Py_UCS4*)buf1) + start, end - start,
9629 buf2, len2, PY_SSIZE_T_MAX
9630 );
9631 break;
9632 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07009633 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009634 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009635
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009636 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009637 PyMem_Free(buf2);
9638
Guido van Rossumd57fd912000-03-10 22:53:23 +00009639 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009640 onError:
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009641 if (kind2 != kind1 && buf2)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009642 PyMem_Free(buf2);
9643 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009644}
9645
Alexander Belopolsky40018472011-02-26 01:02:56 +00009646Py_ssize_t
9647PyUnicode_Find(PyObject *str,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009648 PyObject *substr,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009649 Py_ssize_t start,
9650 Py_ssize_t end,
9651 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009652{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009653 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009654 return -2;
Tim Petersced69f82003-09-16 20:30:58 +00009655
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009656 return any_find_slice(str, substr, start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009657}
9658
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009659Py_ssize_t
9660PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
9661 Py_ssize_t start, Py_ssize_t end,
9662 int direction)
9663{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009664 int kind;
Xiang Zhangb2110682016-12-20 22:52:33 +08009665 Py_ssize_t len, result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009666 if (PyUnicode_READY(str) == -1)
9667 return -2;
Xiang Zhangb2110682016-12-20 22:52:33 +08009668 len = PyUnicode_GET_LENGTH(str);
9669 ADJUST_INDICES(start, end, len);
9670 if (end - start < 1)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009671 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009672 kind = PyUnicode_KIND(str);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009673 result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
9674 kind, end-start, ch, direction);
9675 if (result == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009676 return -1;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009677 else
9678 return start + result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009679}
9680
Alexander Belopolsky40018472011-02-26 01:02:56 +00009681static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009682tailmatch(PyObject *self,
9683 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009684 Py_ssize_t start,
9685 Py_ssize_t end,
9686 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009687{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009688 int kind_self;
9689 int kind_sub;
9690 void *data_self;
9691 void *data_sub;
9692 Py_ssize_t offset;
9693 Py_ssize_t i;
9694 Py_ssize_t end_sub;
9695
9696 if (PyUnicode_READY(self) == -1 ||
9697 PyUnicode_READY(substring) == -1)
Victor Stinner18aa4472013-01-03 03:18:09 +01009698 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009699
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009700 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
9701 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009702 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00009703 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009704
Serhiy Storchakad4ea03c2015-05-31 09:15:51 +03009705 if (PyUnicode_GET_LENGTH(substring) == 0)
9706 return 1;
9707
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009708 kind_self = PyUnicode_KIND(self);
9709 data_self = PyUnicode_DATA(self);
9710 kind_sub = PyUnicode_KIND(substring);
9711 data_sub = PyUnicode_DATA(substring);
9712 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
9713
9714 if (direction > 0)
9715 offset = end;
9716 else
9717 offset = start;
9718
9719 if (PyUnicode_READ(kind_self, data_self, offset) ==
9720 PyUnicode_READ(kind_sub, data_sub, 0) &&
9721 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
9722 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
9723 /* If both are of the same kind, memcmp is sufficient */
9724 if (kind_self == kind_sub) {
9725 return ! memcmp((char *)data_self +
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009726 (offset * PyUnicode_KIND(substring)),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009727 data_sub,
9728 PyUnicode_GET_LENGTH(substring) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009729 PyUnicode_KIND(substring));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009730 }
Martin Pantere26da7c2016-06-02 10:07:09 +00009731 /* otherwise we have to compare each character by first accessing it */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009732 else {
9733 /* We do not need to compare 0 and len(substring)-1 because
9734 the if statement above ensured already that they are equal
9735 when we end up here. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009736 for (i = 1; i < end_sub; ++i) {
9737 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
9738 PyUnicode_READ(kind_sub, data_sub, i))
9739 return 0;
9740 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009741 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009742 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009743 }
9744
9745 return 0;
9746}
9747
Alexander Belopolsky40018472011-02-26 01:02:56 +00009748Py_ssize_t
9749PyUnicode_Tailmatch(PyObject *str,
9750 PyObject *substr,
9751 Py_ssize_t start,
9752 Py_ssize_t end,
9753 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009754{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009755 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009756 return -1;
Tim Petersced69f82003-09-16 20:30:58 +00009757
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009758 return tailmatch(str, substr, start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009759}
9760
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009761static PyObject *
9762ascii_upper_or_lower(PyObject *self, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009763{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009764 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9765 char *resdata, *data = PyUnicode_DATA(self);
9766 PyObject *res;
Tim Petersced69f82003-09-16 20:30:58 +00009767
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009768 res = PyUnicode_New(len, 127);
9769 if (res == NULL)
9770 return NULL;
9771 resdata = PyUnicode_DATA(res);
9772 if (lower)
9773 _Py_bytes_lower(resdata, data, len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009774 else
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009775 _Py_bytes_upper(resdata, data, len);
9776 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009777}
9778
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009779static Py_UCS4
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009780handle_capital_sigma(int kind, void *data, Py_ssize_t length, Py_ssize_t i)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009781{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009782 Py_ssize_t j;
9783 int final_sigma;
Victor Stinner0c39b1b2015-03-18 15:02:06 +01009784 Py_UCS4 c = 0; /* initialize to prevent gcc warning */
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009785 /* U+03A3 is in the Final_Sigma context when, it is found like this:
Tim Petersced69f82003-09-16 20:30:58 +00009786
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009787 \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased})
9788
9789 where ! is a negation and \p{xxx} is a character with property xxx.
9790 */
9791 for (j = i - 1; j >= 0; j--) {
9792 c = PyUnicode_READ(kind, data, j);
9793 if (!_PyUnicode_IsCaseIgnorable(c))
9794 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009795 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009796 final_sigma = j >= 0 && _PyUnicode_IsCased(c);
9797 if (final_sigma) {
9798 for (j = i + 1; j < length; j++) {
9799 c = PyUnicode_READ(kind, data, j);
9800 if (!_PyUnicode_IsCaseIgnorable(c))
9801 break;
9802 }
9803 final_sigma = j == length || !_PyUnicode_IsCased(c);
9804 }
9805 return (final_sigma) ? 0x3C2 : 0x3C3;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009806}
9807
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009808static int
9809lower_ucs4(int kind, void *data, Py_ssize_t length, Py_ssize_t i,
9810 Py_UCS4 c, Py_UCS4 *mapped)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009811{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009812 /* Obscure special case. */
9813 if (c == 0x3A3) {
9814 mapped[0] = handle_capital_sigma(kind, data, length, i);
9815 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009816 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009817 return _PyUnicode_ToLowerFull(c, mapped);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009818}
9819
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009820static Py_ssize_t
9821do_capitalize(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009822{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009823 Py_ssize_t i, k = 0;
9824 int n_res, j;
9825 Py_UCS4 c, mapped[3];
Tim Petersced69f82003-09-16 20:30:58 +00009826
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009827 c = PyUnicode_READ(kind, data, 0);
Kingsley Mb015fc82019-04-12 16:35:39 +01009828 n_res = _PyUnicode_ToTitleFull(c, mapped);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009829 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009830 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009831 res[k++] = mapped[j];
Guido van Rossumd57fd912000-03-10 22:53:23 +00009832 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009833 for (i = 1; i < length; i++) {
9834 c = PyUnicode_READ(kind, data, i);
9835 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9836 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009837 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009838 res[k++] = mapped[j];
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009839 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009840 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009841 return k;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009842}
9843
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009844static Py_ssize_t
9845do_swapcase(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) {
9846 Py_ssize_t i, k = 0;
9847
9848 for (i = 0; i < length; i++) {
9849 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9850 int n_res, j;
9851 if (Py_UNICODE_ISUPPER(c)) {
9852 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9853 }
9854 else if (Py_UNICODE_ISLOWER(c)) {
9855 n_res = _PyUnicode_ToUpperFull(c, mapped);
9856 }
9857 else {
9858 n_res = 1;
9859 mapped[0] = c;
9860 }
9861 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009862 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009863 res[k++] = mapped[j];
9864 }
9865 }
9866 return k;
9867}
9868
9869static Py_ssize_t
9870do_upper_or_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res,
9871 Py_UCS4 *maxchar, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009872{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009873 Py_ssize_t i, k = 0;
9874
9875 for (i = 0; i < length; i++) {
9876 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9877 int n_res, j;
9878 if (lower)
9879 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9880 else
9881 n_res = _PyUnicode_ToUpperFull(c, mapped);
9882 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009883 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009884 res[k++] = mapped[j];
9885 }
9886 }
9887 return k;
9888}
9889
9890static Py_ssize_t
9891do_upper(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9892{
9893 return do_upper_or_lower(kind, data, length, res, maxchar, 0);
9894}
9895
9896static Py_ssize_t
9897do_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9898{
9899 return do_upper_or_lower(kind, data, length, res, maxchar, 1);
9900}
9901
Benjamin Petersone51757f2012-01-12 21:10:29 -05009902static Py_ssize_t
Benjamin Petersond5890c82012-01-14 13:23:30 -05009903do_casefold(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9904{
9905 Py_ssize_t i, k = 0;
9906
9907 for (i = 0; i < length; i++) {
9908 Py_UCS4 c = PyUnicode_READ(kind, data, i);
9909 Py_UCS4 mapped[3];
9910 int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
9911 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009912 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersond5890c82012-01-14 13:23:30 -05009913 res[k++] = mapped[j];
9914 }
9915 }
9916 return k;
9917}
9918
9919static Py_ssize_t
Benjamin Petersone51757f2012-01-12 21:10:29 -05009920do_title(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9921{
9922 Py_ssize_t i, k = 0;
9923 int previous_is_cased;
9924
9925 previous_is_cased = 0;
9926 for (i = 0; i < length; i++) {
9927 const Py_UCS4 c = PyUnicode_READ(kind, data, i);
9928 Py_UCS4 mapped[3];
9929 int n_res, j;
9930
9931 if (previous_is_cased)
9932 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9933 else
9934 n_res = _PyUnicode_ToTitleFull(c, mapped);
9935
9936 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009937 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersone51757f2012-01-12 21:10:29 -05009938 res[k++] = mapped[j];
9939 }
9940
9941 previous_is_cased = _PyUnicode_IsCased(c);
9942 }
9943 return k;
9944}
9945
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009946static PyObject *
9947case_operation(PyObject *self,
9948 Py_ssize_t (*perform)(int, void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *))
9949{
9950 PyObject *res = NULL;
9951 Py_ssize_t length, newlength = 0;
9952 int kind, outkind;
9953 void *data, *outdata;
9954 Py_UCS4 maxchar = 0, *tmp, *tmpend;
9955
Benjamin Petersoneea48462012-01-16 14:28:50 -05009956 assert(PyUnicode_IS_READY(self));
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009957
9958 kind = PyUnicode_KIND(self);
9959 data = PyUnicode_DATA(self);
9960 length = PyUnicode_GET_LENGTH(self);
Antoine Pitrou4e334242014-10-15 23:14:53 +02009961 if ((size_t) length > PY_SSIZE_T_MAX / (3 * sizeof(Py_UCS4))) {
Benjamin Petersone1bd38c2014-10-15 11:47:36 -04009962 PyErr_SetString(PyExc_OverflowError, "string is too long");
9963 return NULL;
9964 }
Benjamin Peterson1e211ff2014-10-15 12:17:21 -04009965 tmp = PyMem_MALLOC(sizeof(Py_UCS4) * 3 * length);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009966 if (tmp == NULL)
9967 return PyErr_NoMemory();
9968 newlength = perform(kind, data, length, tmp, &maxchar);
9969 res = PyUnicode_New(newlength, maxchar);
9970 if (res == NULL)
9971 goto leave;
9972 tmpend = tmp + newlength;
9973 outdata = PyUnicode_DATA(res);
9974 outkind = PyUnicode_KIND(res);
9975 switch (outkind) {
9976 case PyUnicode_1BYTE_KIND:
9977 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata);
9978 break;
9979 case PyUnicode_2BYTE_KIND:
9980 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata);
9981 break;
9982 case PyUnicode_4BYTE_KIND:
9983 memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength);
9984 break;
9985 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07009986 Py_UNREACHABLE();
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009987 }
9988 leave:
9989 PyMem_FREE(tmp);
9990 return res;
9991}
9992
Tim Peters8ce9f162004-08-27 01:49:32 +00009993PyObject *
9994PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009995{
Serhiy Storchakaea525a22016-09-06 22:07:53 +03009996 PyObject *res;
9997 PyObject *fseq;
9998 Py_ssize_t seqlen;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009999 PyObject **items;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010000
Benjamin Peterson9743b2c2014-02-15 13:02:52 -050010001 fseq = PySequence_Fast(seq, "can only join an iterable");
Tim Peters05eba1f2004-08-27 21:32:02 +000010002 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010003 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +000010004 }
10005
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010006 /* NOTE: the following code can't call back into Python code,
10007 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +000010008 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010009
Serhiy Storchakaea525a22016-09-06 22:07:53 +030010010 items = PySequence_Fast_ITEMS(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +000010011 seqlen = PySequence_Fast_GET_SIZE(fseq);
Serhiy Storchakaea525a22016-09-06 22:07:53 +030010012 res = _PyUnicode_JoinArray(separator, items, seqlen);
10013 Py_DECREF(fseq);
10014 return res;
10015}
10016
10017PyObject *
Serhiy Storchakaa5552f02017-12-15 13:11:11 +020010018_PyUnicode_JoinArray(PyObject *separator, PyObject *const *items, Py_ssize_t seqlen)
Serhiy Storchakaea525a22016-09-06 22:07:53 +030010019{
10020 PyObject *res = NULL; /* the result */
10021 PyObject *sep = NULL;
10022 Py_ssize_t seplen;
10023 PyObject *item;
10024 Py_ssize_t sz, i, res_offset;
10025 Py_UCS4 maxchar;
10026 Py_UCS4 item_maxchar;
10027 int use_memcpy;
10028 unsigned char *res_data = NULL, *sep_data = NULL;
10029 PyObject *last_obj;
10030 unsigned int kind = 0;
10031
Tim Peters05eba1f2004-08-27 21:32:02 +000010032 /* If empty sequence, return u"". */
10033 if (seqlen == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020010034 _Py_RETURN_UNICODE_EMPTY();
Tim Peters05eba1f2004-08-27 21:32:02 +000010035 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020010036
Tim Peters05eba1f2004-08-27 21:32:02 +000010037 /* If singleton sequence with an exact Unicode, return that. */
Victor Stinnerdd077322011-10-07 17:02:31 +020010038 last_obj = NULL;
Victor Stinneracf47b82011-10-06 12:32:37 +020010039 if (seqlen == 1) {
10040 if (PyUnicode_CheckExact(items[0])) {
10041 res = items[0];
10042 Py_INCREF(res);
Victor Stinneracf47b82011-10-06 12:32:37 +020010043 return res;
10044 }
Victor Stinnerdd077322011-10-07 17:02:31 +020010045 seplen = 0;
Victor Stinnerc6f0df72011-10-06 15:58:54 +020010046 maxchar = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +000010047 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010048 else {
Victor Stinneracf47b82011-10-06 12:32:37 +020010049 /* Set up sep and seplen */
10050 if (separator == NULL) {
10051 /* fall back to a blank space separator */
10052 sep = PyUnicode_FromOrdinal(' ');
10053 if (!sep)
10054 goto onError;
Victor Stinnerdd077322011-10-07 17:02:31 +020010055 seplen = 1;
Victor Stinneracf47b82011-10-06 12:32:37 +020010056 maxchar = 32;
Tim Peters05eba1f2004-08-27 21:32:02 +000010057 }
Victor Stinneracf47b82011-10-06 12:32:37 +020010058 else {
10059 if (!PyUnicode_Check(separator)) {
10060 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020010061 "separator: expected str instance,"
10062 " %.80s found",
10063 Py_TYPE(separator)->tp_name);
Victor Stinneracf47b82011-10-06 12:32:37 +020010064 goto onError;
10065 }
10066 if (PyUnicode_READY(separator))
10067 goto onError;
10068 sep = separator;
10069 seplen = PyUnicode_GET_LENGTH(separator);
10070 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
10071 /* inc refcount to keep this code path symmetric with the
10072 above case of a blank separator */
10073 Py_INCREF(sep);
10074 }
Victor Stinnerdd077322011-10-07 17:02:31 +020010075 last_obj = sep;
Tim Peters05eba1f2004-08-27 21:32:02 +000010076 }
10077
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010078 /* There are at least two things to join, or else we have a subclass
10079 * of str in the sequence.
10080 * Do a pre-pass to figure out the total amount of space we'll
10081 * need (sz), and see whether all argument are strings.
10082 */
10083 sz = 0;
Victor Stinnerdd077322011-10-07 17:02:31 +020010084#ifdef Py_DEBUG
10085 use_memcpy = 0;
10086#else
10087 use_memcpy = 1;
10088#endif
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010089 for (i = 0; i < seqlen; i++) {
Xiang Zhangb0541f42017-01-10 10:52:00 +080010090 size_t add_sz;
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010091 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +000010092 if (!PyUnicode_Check(item)) {
10093 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020010094 "sequence item %zd: expected str instance,"
10095 " %.80s found",
10096 i, Py_TYPE(item)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000010097 goto onError;
10098 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010099 if (PyUnicode_READY(item) == -1)
10100 goto onError;
Xiang Zhangb0541f42017-01-10 10:52:00 +080010101 add_sz = PyUnicode_GET_LENGTH(item);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010102 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010103 maxchar = Py_MAX(maxchar, item_maxchar);
Xiang Zhangb0541f42017-01-10 10:52:00 +080010104 if (i != 0) {
10105 add_sz += seplen;
10106 }
10107 if (add_sz > (size_t)(PY_SSIZE_T_MAX - sz)) {
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010108 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010109 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010110 goto onError;
10111 }
Xiang Zhangb0541f42017-01-10 10:52:00 +080010112 sz += add_sz;
Victor Stinnerdd077322011-10-07 17:02:31 +020010113 if (use_memcpy && last_obj != NULL) {
10114 if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
10115 use_memcpy = 0;
10116 }
10117 last_obj = item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010118 }
Tim Petersced69f82003-09-16 20:30:58 +000010119
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010120 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010121 if (res == NULL)
10122 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +000010123
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010124 /* Catenate everything. */
Victor Stinnerdd077322011-10-07 17:02:31 +020010125#ifdef Py_DEBUG
10126 use_memcpy = 0;
10127#else
10128 if (use_memcpy) {
10129 res_data = PyUnicode_1BYTE_DATA(res);
10130 kind = PyUnicode_KIND(res);
10131 if (seplen != 0)
10132 sep_data = PyUnicode_1BYTE_DATA(sep);
10133 }
10134#endif
Victor Stinner4560f9c2013-04-14 18:56:46 +020010135 if (use_memcpy) {
10136 for (i = 0; i < seqlen; ++i) {
10137 Py_ssize_t itemlen;
10138 item = items[i];
10139
10140 /* Copy item, and maybe the separator. */
10141 if (i && seplen != 0) {
Christian Heimesf051e432016-09-13 20:22:02 +020010142 memcpy(res_data,
Victor Stinnerdd077322011-10-07 17:02:31 +020010143 sep_data,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010144 kind * seplen);
10145 res_data += kind * seplen;
Victor Stinnerdd077322011-10-07 17:02:31 +020010146 }
Victor Stinner4560f9c2013-04-14 18:56:46 +020010147
10148 itemlen = PyUnicode_GET_LENGTH(item);
10149 if (itemlen != 0) {
Christian Heimesf051e432016-09-13 20:22:02 +020010150 memcpy(res_data,
Victor Stinnerdd077322011-10-07 17:02:31 +020010151 PyUnicode_DATA(item),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010152 kind * itemlen);
10153 res_data += kind * itemlen;
Victor Stinnerdd077322011-10-07 17:02:31 +020010154 }
Victor Stinner4560f9c2013-04-14 18:56:46 +020010155 }
10156 assert(res_data == PyUnicode_1BYTE_DATA(res)
10157 + kind * PyUnicode_GET_LENGTH(res));
10158 }
10159 else {
10160 for (i = 0, res_offset = 0; i < seqlen; ++i) {
10161 Py_ssize_t itemlen;
10162 item = items[i];
10163
10164 /* Copy item, and maybe the separator. */
10165 if (i && seplen != 0) {
10166 _PyUnicode_FastCopyCharacters(res, res_offset, sep, 0, seplen);
10167 res_offset += seplen;
10168 }
10169
10170 itemlen = PyUnicode_GET_LENGTH(item);
10171 if (itemlen != 0) {
Victor Stinnerd3f08822012-05-29 12:57:52 +020010172 _PyUnicode_FastCopyCharacters(res, res_offset, item, 0, itemlen);
Victor Stinnerdd077322011-10-07 17:02:31 +020010173 res_offset += itemlen;
10174 }
Victor Stinner9ce5a832011-10-03 23:36:02 +020010175 }
Victor Stinnerdd077322011-10-07 17:02:31 +020010176 assert(res_offset == PyUnicode_GET_LENGTH(res));
Victor Stinner4560f9c2013-04-14 18:56:46 +020010177 }
Tim Peters8ce9f162004-08-27 01:49:32 +000010178
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010179 Py_XDECREF(sep);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010180 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010181 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010182
Benjamin Peterson29060642009-01-31 22:14:21 +000010183 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010184 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +000010185 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010186 return NULL;
10187}
10188
Victor Stinnerd3f08822012-05-29 12:57:52 +020010189void
10190_PyUnicode_FastFill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10191 Py_UCS4 fill_char)
10192{
10193 const enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
Victor Stinner163403a2018-11-27 12:41:17 +010010194 void *data = PyUnicode_DATA(unicode);
Victor Stinnerd3f08822012-05-29 12:57:52 +020010195 assert(PyUnicode_IS_READY(unicode));
10196 assert(unicode_modifiable(unicode));
10197 assert(fill_char <= PyUnicode_MAX_CHAR_VALUE(unicode));
10198 assert(start >= 0);
10199 assert(start + length <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner59423e32018-11-26 13:40:01 +010010200 unicode_fill(kind, data, fill_char, start, length);
Victor Stinnerd3f08822012-05-29 12:57:52 +020010201}
10202
Victor Stinner3fe55312012-01-04 00:33:50 +010010203Py_ssize_t
10204PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10205 Py_UCS4 fill_char)
10206{
10207 Py_ssize_t maxlen;
Victor Stinner3fe55312012-01-04 00:33:50 +010010208
10209 if (!PyUnicode_Check(unicode)) {
10210 PyErr_BadInternalCall();
10211 return -1;
10212 }
10213 if (PyUnicode_READY(unicode) == -1)
10214 return -1;
10215 if (unicode_check_modifiable(unicode))
10216 return -1;
10217
Victor Stinnerd3f08822012-05-29 12:57:52 +020010218 if (start < 0) {
10219 PyErr_SetString(PyExc_IndexError, "string index out of range");
10220 return -1;
10221 }
Victor Stinner3fe55312012-01-04 00:33:50 +010010222 if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) {
10223 PyErr_SetString(PyExc_ValueError,
10224 "fill character is bigger than "
10225 "the string maximum character");
10226 return -1;
10227 }
10228
10229 maxlen = PyUnicode_GET_LENGTH(unicode) - start;
10230 length = Py_MIN(maxlen, length);
10231 if (length <= 0)
10232 return 0;
10233
Victor Stinnerd3f08822012-05-29 12:57:52 +020010234 _PyUnicode_FastFill(unicode, start, length, fill_char);
Victor Stinner3fe55312012-01-04 00:33:50 +010010235 return length;
10236}
10237
Victor Stinner9310abb2011-10-05 00:59:23 +020010238static PyObject *
10239pad(PyObject *self,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010240 Py_ssize_t left,
10241 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010242 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010243{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010244 PyObject *u;
10245 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010246 int kind;
10247 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010248
10249 if (left < 0)
10250 left = 0;
10251 if (right < 0)
10252 right = 0;
10253
Victor Stinnerc4b49542011-12-11 22:44:26 +010010254 if (left == 0 && right == 0)
10255 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010256
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010257 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
10258 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +000010259 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
10260 return NULL;
10261 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010262 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010263 maxchar = Py_MAX(maxchar, fill);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010264 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010265 if (!u)
10266 return NULL;
10267
10268 kind = PyUnicode_KIND(u);
10269 data = PyUnicode_DATA(u);
10270 if (left)
Victor Stinner59423e32018-11-26 13:40:01 +010010271 unicode_fill(kind, data, fill, 0, left);
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010272 if (right)
Victor Stinner59423e32018-11-26 13:40:01 +010010273 unicode_fill(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinnerd3f08822012-05-29 12:57:52 +020010274 _PyUnicode_FastCopyCharacters(u, left, self, 0, _PyUnicode_LENGTH(self));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010275 assert(_PyUnicode_CheckConsistency(u, 1));
10276 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010277}
10278
Alexander Belopolsky40018472011-02-26 01:02:56 +000010279PyObject *
10280PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010281{
Guido van Rossumd57fd912000-03-10 22:53:23 +000010282 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010283
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010284 if (ensure_unicode(string) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010285 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010286
Benjamin Petersonead6b532011-12-20 17:23:42 -060010287 switch (PyUnicode_KIND(string)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010288 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010289 if (PyUnicode_IS_ASCII(string))
10290 list = asciilib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010291 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010292 PyUnicode_GET_LENGTH(string), keepends);
10293 else
10294 list = ucs1lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010295 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010296 PyUnicode_GET_LENGTH(string), keepends);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010297 break;
10298 case PyUnicode_2BYTE_KIND:
10299 list = ucs2lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010300 string, PyUnicode_2BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010301 PyUnicode_GET_LENGTH(string), keepends);
10302 break;
10303 case PyUnicode_4BYTE_KIND:
10304 list = ucs4lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010305 string, PyUnicode_4BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010306 PyUnicode_GET_LENGTH(string), keepends);
10307 break;
10308 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010309 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010310 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010311 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010312}
10313
Alexander Belopolsky40018472011-02-26 01:02:56 +000010314static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010315split(PyObject *self,
10316 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010317 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010318{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010319 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010320 void *buf1, *buf2;
10321 Py_ssize_t len1, len2;
10322 PyObject* out;
10323
Guido van Rossumd57fd912000-03-10 22:53:23 +000010324 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010325 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010326
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010327 if (PyUnicode_READY(self) == -1)
10328 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010329
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010330 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010331 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010332 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010333 if (PyUnicode_IS_ASCII(self))
10334 return asciilib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010335 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010336 PyUnicode_GET_LENGTH(self), maxcount
10337 );
10338 else
10339 return ucs1lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010340 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010341 PyUnicode_GET_LENGTH(self), maxcount
10342 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010343 case PyUnicode_2BYTE_KIND:
10344 return ucs2lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010345 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010346 PyUnicode_GET_LENGTH(self), maxcount
10347 );
10348 case PyUnicode_4BYTE_KIND:
10349 return ucs4lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010350 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010351 PyUnicode_GET_LENGTH(self), maxcount
10352 );
10353 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010354 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010355 }
10356
10357 if (PyUnicode_READY(substring) == -1)
10358 return NULL;
10359
10360 kind1 = PyUnicode_KIND(self);
10361 kind2 = PyUnicode_KIND(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010362 len1 = PyUnicode_GET_LENGTH(self);
10363 len2 = PyUnicode_GET_LENGTH(substring);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010364 if (kind1 < kind2 || len1 < len2) {
10365 out = PyList_New(1);
10366 if (out == NULL)
10367 return NULL;
10368 Py_INCREF(self);
10369 PyList_SET_ITEM(out, 0, self);
10370 return out;
10371 }
10372 buf1 = PyUnicode_DATA(self);
10373 buf2 = PyUnicode_DATA(substring);
10374 if (kind2 != kind1) {
10375 buf2 = _PyUnicode_AsKind(substring, kind1);
10376 if (!buf2)
10377 return NULL;
10378 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010379
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010380 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010381 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010382 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10383 out = asciilib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010384 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010385 else
10386 out = ucs1lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010387 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010388 break;
10389 case PyUnicode_2BYTE_KIND:
10390 out = ucs2lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010391 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010392 break;
10393 case PyUnicode_4BYTE_KIND:
10394 out = ucs4lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010395 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010396 break;
10397 default:
10398 out = NULL;
10399 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010400 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010401 PyMem_Free(buf2);
10402 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010403}
10404
Alexander Belopolsky40018472011-02-26 01:02:56 +000010405static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010406rsplit(PyObject *self,
10407 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010408 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010409{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010410 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010411 void *buf1, *buf2;
10412 Py_ssize_t len1, len2;
10413 PyObject* out;
10414
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010415 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010416 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010417
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010418 if (PyUnicode_READY(self) == -1)
10419 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010420
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010421 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010422 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010423 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010424 if (PyUnicode_IS_ASCII(self))
10425 return asciilib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010426 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010427 PyUnicode_GET_LENGTH(self), maxcount
10428 );
10429 else
10430 return ucs1lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010431 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010432 PyUnicode_GET_LENGTH(self), maxcount
10433 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010434 case PyUnicode_2BYTE_KIND:
10435 return ucs2lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010436 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010437 PyUnicode_GET_LENGTH(self), maxcount
10438 );
10439 case PyUnicode_4BYTE_KIND:
10440 return ucs4lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010441 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010442 PyUnicode_GET_LENGTH(self), maxcount
10443 );
10444 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010445 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010446 }
10447
10448 if (PyUnicode_READY(substring) == -1)
10449 return NULL;
10450
10451 kind1 = PyUnicode_KIND(self);
10452 kind2 = PyUnicode_KIND(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010453 len1 = PyUnicode_GET_LENGTH(self);
10454 len2 = PyUnicode_GET_LENGTH(substring);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010455 if (kind1 < kind2 || len1 < len2) {
10456 out = PyList_New(1);
10457 if (out == NULL)
10458 return NULL;
10459 Py_INCREF(self);
10460 PyList_SET_ITEM(out, 0, self);
10461 return out;
10462 }
10463 buf1 = PyUnicode_DATA(self);
10464 buf2 = PyUnicode_DATA(substring);
10465 if (kind2 != kind1) {
10466 buf2 = _PyUnicode_AsKind(substring, kind1);
10467 if (!buf2)
10468 return NULL;
10469 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010470
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010471 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010472 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010473 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10474 out = asciilib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010475 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010476 else
10477 out = ucs1lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010478 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010479 break;
10480 case PyUnicode_2BYTE_KIND:
10481 out = ucs2lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010482 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010483 break;
10484 case PyUnicode_4BYTE_KIND:
10485 out = ucs4lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010486 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010487 break;
10488 default:
10489 out = NULL;
10490 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010491 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010492 PyMem_Free(buf2);
10493 return out;
10494}
10495
10496static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010497anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1,
10498 PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010499{
Benjamin Petersonead6b532011-12-20 17:23:42 -060010500 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010501 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010502 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
10503 return asciilib_find(buf1, len1, buf2, len2, offset);
10504 else
10505 return ucs1lib_find(buf1, len1, buf2, len2, offset);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010506 case PyUnicode_2BYTE_KIND:
10507 return ucs2lib_find(buf1, len1, buf2, len2, offset);
10508 case PyUnicode_4BYTE_KIND:
10509 return ucs4lib_find(buf1, len1, buf2, len2, offset);
10510 }
Barry Warsawb2e57942017-09-14 18:13:16 -070010511 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010512}
10513
10514static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010515anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen,
10516 PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010517{
Benjamin Petersonc0b95d12011-12-20 17:24:05 -060010518 switch (kind) {
10519 case PyUnicode_1BYTE_KIND:
10520 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
10521 return asciilib_count(sbuf, slen, buf1, len1, maxcount);
10522 else
10523 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
10524 case PyUnicode_2BYTE_KIND:
10525 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
10526 case PyUnicode_4BYTE_KIND:
10527 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
10528 }
Barry Warsawb2e57942017-09-14 18:13:16 -070010529 Py_UNREACHABLE();
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010530}
10531
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010532static void
10533replace_1char_inplace(PyObject *u, Py_ssize_t pos,
10534 Py_UCS4 u1, Py_UCS4 u2, Py_ssize_t maxcount)
10535{
10536 int kind = PyUnicode_KIND(u);
10537 void *data = PyUnicode_DATA(u);
10538 Py_ssize_t len = PyUnicode_GET_LENGTH(u);
10539 if (kind == PyUnicode_1BYTE_KIND) {
10540 ucs1lib_replace_1char_inplace((Py_UCS1 *)data + pos,
10541 (Py_UCS1 *)data + len,
10542 u1, u2, maxcount);
10543 }
10544 else if (kind == PyUnicode_2BYTE_KIND) {
10545 ucs2lib_replace_1char_inplace((Py_UCS2 *)data + pos,
10546 (Py_UCS2 *)data + len,
10547 u1, u2, maxcount);
10548 }
10549 else {
10550 assert(kind == PyUnicode_4BYTE_KIND);
10551 ucs4lib_replace_1char_inplace((Py_UCS4 *)data + pos,
10552 (Py_UCS4 *)data + len,
10553 u1, u2, maxcount);
10554 }
10555}
10556
Alexander Belopolsky40018472011-02-26 01:02:56 +000010557static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010558replace(PyObject *self, PyObject *str1,
10559 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010560{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010561 PyObject *u;
10562 char *sbuf = PyUnicode_DATA(self);
10563 char *buf1 = PyUnicode_DATA(str1);
10564 char *buf2 = PyUnicode_DATA(str2);
10565 int srelease = 0, release1 = 0, release2 = 0;
10566 int skind = PyUnicode_KIND(self);
10567 int kind1 = PyUnicode_KIND(str1);
10568 int kind2 = PyUnicode_KIND(str2);
10569 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
10570 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
10571 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010572 int mayshrink;
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010573 Py_UCS4 maxchar, maxchar_str1, maxchar_str2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010574
10575 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010576 maxcount = PY_SSIZE_T_MAX;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010577 else if (maxcount == 0 || slen == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010578 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010579
Victor Stinner59de0ee2011-10-07 10:01:28 +020010580 if (str1 == str2)
10581 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010582
Victor Stinner49a0a212011-10-12 23:46:10 +020010583 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010584 maxchar_str1 = PyUnicode_MAX_CHAR_VALUE(str1);
10585 if (maxchar < maxchar_str1)
10586 /* substring too wide to be present */
10587 goto nothing;
Victor Stinner49a0a212011-10-12 23:46:10 +020010588 maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
10589 /* Replacing str1 with str2 may cause a maxchar reduction in the
10590 result string. */
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010591 mayshrink = (maxchar_str2 < maxchar_str1) && (maxchar == maxchar_str1);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010592 maxchar = Py_MAX(maxchar, maxchar_str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010593
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010594 if (len1 == len2) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010595 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010596 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010597 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010598 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010599 /* replace characters */
Victor Stinner49a0a212011-10-12 23:46:10 +020010600 Py_UCS4 u1, u2;
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010601 Py_ssize_t pos;
Victor Stinnerf6441102011-12-18 02:43:08 +010010602
Victor Stinner69ed0f42013-04-09 21:48:24 +020010603 u1 = PyUnicode_READ(kind1, buf1, 0);
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010604 pos = findchar(sbuf, skind, slen, u1, 1);
Victor Stinnerf6441102011-12-18 02:43:08 +010010605 if (pos < 0)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010606 goto nothing;
Victor Stinner69ed0f42013-04-09 21:48:24 +020010607 u2 = PyUnicode_READ(kind2, buf2, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010608 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010609 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010610 goto error;
Victor Stinnerf6441102011-12-18 02:43:08 +010010611
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010612 _PyUnicode_FastCopyCharacters(u, 0, self, 0, slen);
10613 replace_1char_inplace(u, pos, u1, u2, maxcount);
Victor Stinner49a0a212011-10-12 23:46:10 +020010614 }
10615 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010616 int rkind = skind;
10617 char *res;
Victor Stinnerf6441102011-12-18 02:43:08 +010010618 Py_ssize_t i;
Victor Stinner25a4b292011-10-06 12:31:55 +020010619
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010620 if (kind1 < rkind) {
10621 /* widen substring */
10622 buf1 = _PyUnicode_AsKind(str1, rkind);
10623 if (!buf1) goto error;
10624 release1 = 1;
10625 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010626 i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010627 if (i < 0)
10628 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010629 if (rkind > kind2) {
10630 /* widen replacement */
10631 buf2 = _PyUnicode_AsKind(str2, rkind);
10632 if (!buf2) goto error;
10633 release2 = 1;
10634 }
10635 else if (rkind < kind2) {
10636 /* widen self and buf1 */
10637 rkind = kind2;
10638 if (release1) PyMem_Free(buf1);
Antoine Pitrou6d5ad222012-11-17 23:28:17 +010010639 release1 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010640 sbuf = _PyUnicode_AsKind(self, rkind);
10641 if (!sbuf) goto error;
10642 srelease = 1;
10643 buf1 = _PyUnicode_AsKind(str1, rkind);
10644 if (!buf1) goto error;
10645 release1 = 1;
10646 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010647 u = PyUnicode_New(slen, maxchar);
10648 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010649 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010650 assert(PyUnicode_KIND(u) == rkind);
10651 res = PyUnicode_DATA(u);
Victor Stinner25a4b292011-10-06 12:31:55 +020010652
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010653 memcpy(res, sbuf, rkind * slen);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010654 /* change everything in-place, starting with this one */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010655 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010656 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010657 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010658 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010659
10660 while ( --maxcount > 0) {
Victor Stinnerc3cec782011-10-05 21:24:08 +020010661 i = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010662 sbuf+rkind*i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010663 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010664 if (i == -1)
10665 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010666 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010667 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010668 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010669 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010670 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010671 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010672 }
10673 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010674 Py_ssize_t n, i, j, ires;
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010675 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010676 int rkind = skind;
10677 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010678
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010679 if (kind1 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010680 /* widen substring */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010681 buf1 = _PyUnicode_AsKind(str1, rkind);
10682 if (!buf1) goto error;
10683 release1 = 1;
10684 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010685 n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010686 if (n == 0)
10687 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010688 if (kind2 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010689 /* widen replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010690 buf2 = _PyUnicode_AsKind(str2, rkind);
10691 if (!buf2) goto error;
10692 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010693 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010694 else if (kind2 > rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010695 /* widen self and buf1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010696 rkind = kind2;
10697 sbuf = _PyUnicode_AsKind(self, rkind);
10698 if (!sbuf) goto error;
10699 srelease = 1;
10700 if (release1) PyMem_Free(buf1);
Antoine Pitrou6d5ad222012-11-17 23:28:17 +010010701 release1 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010702 buf1 = _PyUnicode_AsKind(str1, rkind);
10703 if (!buf1) goto error;
10704 release1 = 1;
10705 }
10706 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10707 PyUnicode_GET_LENGTH(str1))); */
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010708 if (len1 < len2 && len2 - len1 > (PY_SSIZE_T_MAX - slen) / n) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010709 PyErr_SetString(PyExc_OverflowError,
10710 "replace string is too long");
10711 goto error;
10712 }
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010713 new_size = slen + n * (len2 - len1);
Victor Stinner49a0a212011-10-12 23:46:10 +020010714 if (new_size == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020010715 _Py_INCREF_UNICODE_EMPTY();
10716 if (!unicode_empty)
10717 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010718 u = unicode_empty;
10719 goto done;
10720 }
Xiang Zhangb0541f42017-01-10 10:52:00 +080010721 if (new_size > (PY_SSIZE_T_MAX / rkind)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010722 PyErr_SetString(PyExc_OverflowError,
10723 "replace string is too long");
10724 goto error;
10725 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010726 u = PyUnicode_New(new_size, maxchar);
10727 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010728 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010729 assert(PyUnicode_KIND(u) == rkind);
10730 res = PyUnicode_DATA(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010731 ires = i = 0;
10732 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010733 while (n-- > 0) {
10734 /* look for next match */
Victor Stinnerc3cec782011-10-05 21:24:08 +020010735 j = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010736 sbuf + rkind * i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010737 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010738 if (j == -1)
10739 break;
10740 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010741 /* copy unchanged part [i:j] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010742 memcpy(res + rkind * ires,
10743 sbuf + rkind * i,
10744 rkind * (j-i));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010745 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010746 }
10747 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010748 if (len2 > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010749 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010750 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010751 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010752 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010753 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010754 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010755 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010756 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010757 /* copy tail [i:] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010758 memcpy(res + rkind * ires,
10759 sbuf + rkind * i,
10760 rkind * (slen-i));
Victor Stinner49a0a212011-10-12 23:46:10 +020010761 }
10762 else {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010763 /* interleave */
10764 while (n > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010765 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010766 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010767 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010768 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010769 if (--n <= 0)
10770 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010771 memcpy(res + rkind * ires,
10772 sbuf + rkind * i,
10773 rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010774 ires++;
10775 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010776 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010777 memcpy(res + rkind * ires,
10778 sbuf + rkind * i,
10779 rkind * (slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +000010780 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010781 }
10782
10783 if (mayshrink) {
Victor Stinner25a4b292011-10-06 12:31:55 +020010784 unicode_adjust_maxchar(&u);
10785 if (u == NULL)
10786 goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010787 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010788
10789 done:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010790 if (srelease)
10791 PyMem_FREE(sbuf);
10792 if (release1)
10793 PyMem_FREE(buf1);
10794 if (release2)
10795 PyMem_FREE(buf2);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010796 assert(_PyUnicode_CheckConsistency(u, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010797 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010798
Benjamin Peterson29060642009-01-31 22:14:21 +000010799 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +000010800 /* nothing to replace; return original string (when possible) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010801 if (srelease)
10802 PyMem_FREE(sbuf);
10803 if (release1)
10804 PyMem_FREE(buf1);
10805 if (release2)
10806 PyMem_FREE(buf2);
Victor Stinnerc4b49542011-12-11 22:44:26 +010010807 return unicode_result_unchanged(self);
10808
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010809 error:
10810 if (srelease && sbuf)
10811 PyMem_FREE(sbuf);
10812 if (release1 && buf1)
10813 PyMem_FREE(buf1);
10814 if (release2 && buf2)
10815 PyMem_FREE(buf2);
10816 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010817}
10818
10819/* --- Unicode Object Methods --------------------------------------------- */
10820
INADA Naoki3ae20562017-01-16 20:41:20 +090010821/*[clinic input]
10822str.title as unicode_title
Guido van Rossumd57fd912000-03-10 22:53:23 +000010823
INADA Naoki3ae20562017-01-16 20:41:20 +090010824Return a version of the string where each word is titlecased.
10825
10826More specifically, words start with uppercased characters and all remaining
10827cased characters have lower case.
10828[clinic start generated code]*/
10829
10830static PyObject *
10831unicode_title_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090010832/*[clinic end generated code: output=c75ae03809574902 input=fa945d669b26e683]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000010833{
Benjamin Petersoneea48462012-01-16 14:28:50 -050010834 if (PyUnicode_READY(self) == -1)
10835 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010836 return case_operation(self, do_title);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010837}
10838
INADA Naoki3ae20562017-01-16 20:41:20 +090010839/*[clinic input]
10840str.capitalize as unicode_capitalize
Guido van Rossumd57fd912000-03-10 22:53:23 +000010841
INADA Naoki3ae20562017-01-16 20:41:20 +090010842Return a capitalized version of the string.
10843
10844More specifically, make the first character have upper case and the rest lower
10845case.
10846[clinic start generated code]*/
10847
10848static PyObject *
10849unicode_capitalize_impl(PyObject *self)
10850/*[clinic end generated code: output=e49a4c333cdb7667 input=f4cbf1016938da6d]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000010851{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010852 if (PyUnicode_READY(self) == -1)
10853 return NULL;
10854 if (PyUnicode_GET_LENGTH(self) == 0)
10855 return unicode_result_unchanged(self);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010856 return case_operation(self, do_capitalize);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010857}
10858
INADA Naoki3ae20562017-01-16 20:41:20 +090010859/*[clinic input]
10860str.casefold as unicode_casefold
10861
10862Return a version of the string suitable for caseless comparisons.
10863[clinic start generated code]*/
Benjamin Petersond5890c82012-01-14 13:23:30 -050010864
10865static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090010866unicode_casefold_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090010867/*[clinic end generated code: output=0120daf657ca40af input=384d66cc2ae30daf]*/
Benjamin Petersond5890c82012-01-14 13:23:30 -050010868{
10869 if (PyUnicode_READY(self) == -1)
10870 return NULL;
10871 if (PyUnicode_IS_ASCII(self))
10872 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010873 return case_operation(self, do_casefold);
Benjamin Petersond5890c82012-01-14 13:23:30 -050010874}
10875
10876
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010877/* Argument converter. Accepts a single Unicode character. */
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010878
10879static int
10880convert_uc(PyObject *obj, void *addr)
10881{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010882 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010883
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010884 if (!PyUnicode_Check(obj)) {
10885 PyErr_Format(PyExc_TypeError,
10886 "The fill character must be a unicode character, "
Victor Stinner998b8062018-09-12 00:23:25 +020010887 "not %.100s", Py_TYPE(obj)->tp_name);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010888 return 0;
10889 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010890 if (PyUnicode_READY(obj) < 0)
10891 return 0;
10892 if (PyUnicode_GET_LENGTH(obj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010893 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010894 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010895 return 0;
10896 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010897 *fillcharloc = PyUnicode_READ_CHAR(obj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010898 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010899}
10900
INADA Naoki3ae20562017-01-16 20:41:20 +090010901/*[clinic input]
10902str.center as unicode_center
10903
10904 width: Py_ssize_t
10905 fillchar: Py_UCS4 = ' '
10906 /
10907
10908Return a centered string of length width.
10909
10910Padding is done using the specified fill character (default is a space).
10911[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000010912
10913static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090010914unicode_center_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
10915/*[clinic end generated code: output=420c8859effc7c0c input=b42b247eb26e6519]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000010916{
Martin v. Löwis18e16552006-02-15 17:27:45 +000010917 Py_ssize_t marg, left;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010918
Benjamin Petersonbac79492012-01-14 13:34:47 -050010919 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010920 return NULL;
10921
Victor Stinnerc4b49542011-12-11 22:44:26 +010010922 if (PyUnicode_GET_LENGTH(self) >= width)
10923 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010924
Victor Stinnerc4b49542011-12-11 22:44:26 +010010925 marg = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010926 left = marg / 2 + (marg & width & 1);
10927
Victor Stinner9310abb2011-10-05 00:59:23 +020010928 return pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010929}
10930
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010931/* This function assumes that str1 and str2 are readied by the caller. */
10932
Marc-André Lemburge5034372000-08-08 08:04:29 +000010933static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010934unicode_compare(PyObject *str1, PyObject *str2)
Marc-André Lemburge5034372000-08-08 08:04:29 +000010935{
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010936#define COMPARE(TYPE1, TYPE2) \
10937 do { \
10938 TYPE1* p1 = (TYPE1 *)data1; \
10939 TYPE2* p2 = (TYPE2 *)data2; \
10940 TYPE1* end = p1 + len; \
10941 Py_UCS4 c1, c2; \
10942 for (; p1 != end; p1++, p2++) { \
10943 c1 = *p1; \
10944 c2 = *p2; \
10945 if (c1 != c2) \
10946 return (c1 < c2) ? -1 : 1; \
10947 } \
10948 } \
10949 while (0)
10950
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010951 int kind1, kind2;
10952 void *data1, *data2;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010953 Py_ssize_t len1, len2, len;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010954
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010955 kind1 = PyUnicode_KIND(str1);
10956 kind2 = PyUnicode_KIND(str2);
10957 data1 = PyUnicode_DATA(str1);
10958 data2 = PyUnicode_DATA(str2);
10959 len1 = PyUnicode_GET_LENGTH(str1);
10960 len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner770e19e2012-10-04 22:59:45 +020010961 len = Py_MIN(len1, len2);
Marc-André Lemburge5034372000-08-08 08:04:29 +000010962
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010963 switch(kind1) {
10964 case PyUnicode_1BYTE_KIND:
10965 {
10966 switch(kind2) {
10967 case PyUnicode_1BYTE_KIND:
10968 {
10969 int cmp = memcmp(data1, data2, len);
10970 /* normalize result of memcmp() into the range [-1; 1] */
10971 if (cmp < 0)
10972 return -1;
10973 if (cmp > 0)
10974 return 1;
10975 break;
Victor Stinner770e19e2012-10-04 22:59:45 +020010976 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010977 case PyUnicode_2BYTE_KIND:
10978 COMPARE(Py_UCS1, Py_UCS2);
10979 break;
10980 case PyUnicode_4BYTE_KIND:
10981 COMPARE(Py_UCS1, Py_UCS4);
10982 break;
10983 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010984 Py_UNREACHABLE();
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010985 }
10986 break;
10987 }
10988 case PyUnicode_2BYTE_KIND:
10989 {
10990 switch(kind2) {
10991 case PyUnicode_1BYTE_KIND:
10992 COMPARE(Py_UCS2, Py_UCS1);
10993 break;
10994 case PyUnicode_2BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020010995 {
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010996 COMPARE(Py_UCS2, Py_UCS2);
10997 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020010998 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010999 case PyUnicode_4BYTE_KIND:
11000 COMPARE(Py_UCS2, Py_UCS4);
11001 break;
11002 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011003 Py_UNREACHABLE();
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011004 }
11005 break;
11006 }
11007 case PyUnicode_4BYTE_KIND:
11008 {
11009 switch(kind2) {
11010 case PyUnicode_1BYTE_KIND:
11011 COMPARE(Py_UCS4, Py_UCS1);
11012 break;
11013 case PyUnicode_2BYTE_KIND:
11014 COMPARE(Py_UCS4, Py_UCS2);
11015 break;
11016 case PyUnicode_4BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020011017 {
11018#if defined(HAVE_WMEMCMP) && SIZEOF_WCHAR_T == 4
11019 int cmp = wmemcmp((wchar_t *)data1, (wchar_t *)data2, len);
11020 /* normalize result of wmemcmp() into the range [-1; 1] */
11021 if (cmp < 0)
11022 return -1;
11023 if (cmp > 0)
11024 return 1;
11025#else
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011026 COMPARE(Py_UCS4, Py_UCS4);
Victor Stinnercd777ea2013-04-08 22:43:44 +020011027#endif
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011028 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020011029 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011030 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011031 Py_UNREACHABLE();
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011032 }
11033 break;
11034 }
11035 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011036 Py_UNREACHABLE();
Marc-André Lemburge5034372000-08-08 08:04:29 +000011037 }
11038
Victor Stinner770e19e2012-10-04 22:59:45 +020011039 if (len1 == len2)
11040 return 0;
11041 if (len1 < len2)
11042 return -1;
11043 else
11044 return 1;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011045
11046#undef COMPARE
Marc-André Lemburge5034372000-08-08 08:04:29 +000011047}
11048
Benjamin Peterson621b4302016-09-09 13:54:34 -070011049static int
Victor Stinnere5567ad2012-10-23 02:48:49 +020011050unicode_compare_eq(PyObject *str1, PyObject *str2)
11051{
11052 int kind;
11053 void *data1, *data2;
11054 Py_ssize_t len;
11055 int cmp;
11056
Victor Stinnere5567ad2012-10-23 02:48:49 +020011057 len = PyUnicode_GET_LENGTH(str1);
11058 if (PyUnicode_GET_LENGTH(str2) != len)
11059 return 0;
11060 kind = PyUnicode_KIND(str1);
11061 if (PyUnicode_KIND(str2) != kind)
11062 return 0;
11063 data1 = PyUnicode_DATA(str1);
11064 data2 = PyUnicode_DATA(str2);
11065
11066 cmp = memcmp(data1, data2, len * kind);
11067 return (cmp == 0);
11068}
11069
11070
Alexander Belopolsky40018472011-02-26 01:02:56 +000011071int
11072PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011073{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011074 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
11075 if (PyUnicode_READY(left) == -1 ||
11076 PyUnicode_READY(right) == -1)
11077 return -1;
Victor Stinnerf0c7b2a2013-11-04 11:27:14 +010011078
11079 /* a string is equal to itself */
11080 if (left == right)
11081 return 0;
11082
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011083 return unicode_compare(left, right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011084 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +000011085 PyErr_Format(PyExc_TypeError,
11086 "Can't compare %.100s and %.100s",
11087 left->ob_type->tp_name,
11088 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011089 return -1;
11090}
11091
Martin v. Löwis5b222132007-06-10 09:51:05 +000011092int
11093PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
11094{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011095 Py_ssize_t i;
11096 int kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011097 Py_UCS4 chr;
Serhiy Storchaka419967b2016-12-06 00:13:34 +020011098 const unsigned char *ustr = (const unsigned char *)str;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011099
Victor Stinner910337b2011-10-03 03:20:16 +020011100 assert(_PyUnicode_CHECK(uni));
Serhiy Storchaka419967b2016-12-06 00:13:34 +020011101 if (!PyUnicode_IS_READY(uni)) {
11102 const wchar_t *ws = _PyUnicode_WSTR(uni);
11103 /* Compare Unicode string and source character set string */
11104 for (i = 0; (chr = ws[i]) && ustr[i]; i++) {
11105 if (chr != ustr[i])
11106 return (chr < ustr[i]) ? -1 : 1;
11107 }
11108 /* This check keeps Python strings that end in '\0' from comparing equal
11109 to C strings identical up to that point. */
11110 if (_PyUnicode_WSTR_LENGTH(uni) != i || chr)
11111 return 1; /* uni is longer */
11112 if (ustr[i])
11113 return -1; /* str is longer */
11114 return 0;
11115 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011116 kind = PyUnicode_KIND(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010011117 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnera6b9b072013-10-30 18:27:13 +010011118 const void *data = PyUnicode_1BYTE_DATA(uni);
Victor Stinnere1b15922013-11-03 13:53:12 +010011119 size_t len1 = (size_t)PyUnicode_GET_LENGTH(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010011120 size_t len, len2 = strlen(str);
11121 int cmp;
11122
11123 len = Py_MIN(len1, len2);
11124 cmp = memcmp(data, str, len);
Victor Stinner21ea21e2013-11-04 11:28:26 +010011125 if (cmp != 0) {
11126 if (cmp < 0)
11127 return -1;
11128 else
11129 return 1;
11130 }
Victor Stinner602f7cf2013-10-29 23:31:50 +010011131 if (len1 > len2)
11132 return 1; /* uni is longer */
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011133 if (len1 < len2)
Victor Stinner602f7cf2013-10-29 23:31:50 +010011134 return -1; /* str is longer */
11135 return 0;
11136 }
11137 else {
11138 void *data = PyUnicode_DATA(uni);
11139 /* Compare Unicode string and source character set string */
11140 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
Victor Stinner12174a52014-08-15 23:17:38 +020011141 if (chr != (unsigned char)str[i])
Victor Stinner602f7cf2013-10-29 23:31:50 +010011142 return (chr < (unsigned char)(str[i])) ? -1 : 1;
11143 /* This check keeps Python strings that end in '\0' from comparing equal
11144 to C strings identical up to that point. */
11145 if (PyUnicode_GET_LENGTH(uni) != i || chr)
11146 return 1; /* uni is longer */
11147 if (str[i])
11148 return -1; /* str is longer */
11149 return 0;
11150 }
Martin v. Löwis5b222132007-06-10 09:51:05 +000011151}
11152
Serhiy Storchakaf4934ea2016-11-16 10:17:58 +020011153static int
11154non_ready_unicode_equal_to_ascii_string(PyObject *unicode, const char *str)
11155{
11156 size_t i, len;
11157 const wchar_t *p;
11158 len = (size_t)_PyUnicode_WSTR_LENGTH(unicode);
11159 if (strlen(str) != len)
11160 return 0;
11161 p = _PyUnicode_WSTR(unicode);
11162 assert(p);
11163 for (i = 0; i < len; i++) {
11164 unsigned char c = (unsigned char)str[i];
Serhiy Storchaka292dd1b2016-11-16 16:12:34 +020011165 if (c >= 128 || p[i] != (wchar_t)c)
Serhiy Storchakaf4934ea2016-11-16 10:17:58 +020011166 return 0;
11167 }
11168 return 1;
11169}
11170
11171int
11172_PyUnicode_EqualToASCIIString(PyObject *unicode, const char *str)
11173{
11174 size_t len;
11175 assert(_PyUnicode_CHECK(unicode));
Serhiy Storchakaa83a6a32016-11-16 20:02:44 +020011176 assert(str);
11177#ifndef NDEBUG
11178 for (const char *p = str; *p; p++) {
11179 assert((unsigned char)*p < 128);
11180 }
11181#endif
Serhiy Storchakaf4934ea2016-11-16 10:17:58 +020011182 if (PyUnicode_READY(unicode) == -1) {
11183 /* Memory error or bad data */
11184 PyErr_Clear();
11185 return non_ready_unicode_equal_to_ascii_string(unicode, str);
11186 }
11187 if (!PyUnicode_IS_ASCII(unicode))
11188 return 0;
11189 len = (size_t)PyUnicode_GET_LENGTH(unicode);
11190 return strlen(str) == len &&
11191 memcmp(PyUnicode_1BYTE_DATA(unicode), str, len) == 0;
11192}
11193
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011194int
11195_PyUnicode_EqualToASCIIId(PyObject *left, _Py_Identifier *right)
11196{
11197 PyObject *right_uni;
11198 Py_hash_t hash;
11199
11200 assert(_PyUnicode_CHECK(left));
11201 assert(right->string);
Serhiy Storchakaa83a6a32016-11-16 20:02:44 +020011202#ifndef NDEBUG
11203 for (const char *p = right->string; *p; p++) {
11204 assert((unsigned char)*p < 128);
11205 }
11206#endif
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011207
11208 if (PyUnicode_READY(left) == -1) {
11209 /* memory error or bad data */
11210 PyErr_Clear();
11211 return non_ready_unicode_equal_to_ascii_string(left, right->string);
11212 }
11213
11214 if (!PyUnicode_IS_ASCII(left))
11215 return 0;
11216
11217 right_uni = _PyUnicode_FromId(right); /* borrowed */
11218 if (right_uni == NULL) {
11219 /* memory error or bad data */
11220 PyErr_Clear();
11221 return _PyUnicode_EqualToASCIIString(left, right->string);
11222 }
11223
11224 if (left == right_uni)
11225 return 1;
11226
11227 if (PyUnicode_CHECK_INTERNED(left))
11228 return 0;
11229
INADA Naoki7cc95f52018-01-28 02:07:09 +090011230 assert(_PyUnicode_HASH(right_uni) != -1);
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011231 hash = _PyUnicode_HASH(left);
11232 if (hash != -1 && hash != _PyUnicode_HASH(right_uni))
11233 return 0;
11234
11235 return unicode_compare_eq(left, right_uni);
11236}
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000011237
Alexander Belopolsky40018472011-02-26 01:02:56 +000011238PyObject *
11239PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011240{
11241 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011242
Victor Stinnere5567ad2012-10-23 02:48:49 +020011243 if (!PyUnicode_Check(left) || !PyUnicode_Check(right))
11244 Py_RETURN_NOTIMPLEMENTED;
11245
11246 if (PyUnicode_READY(left) == -1 ||
11247 PyUnicode_READY(right) == -1)
11248 return NULL;
11249
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010011250 if (left == right) {
11251 switch (op) {
11252 case Py_EQ:
11253 case Py_LE:
11254 case Py_GE:
11255 /* a string is equal to itself */
stratakise8b19652017-11-02 11:32:54 +010011256 Py_RETURN_TRUE;
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010011257 case Py_NE:
11258 case Py_LT:
11259 case Py_GT:
stratakise8b19652017-11-02 11:32:54 +010011260 Py_RETURN_FALSE;
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010011261 default:
11262 PyErr_BadArgument();
11263 return NULL;
11264 }
11265 }
11266 else if (op == Py_EQ || op == Py_NE) {
Victor Stinnere5567ad2012-10-23 02:48:49 +020011267 result = unicode_compare_eq(left, right);
Victor Stinnerc8bc5372013-11-04 11:08:10 +010011268 result ^= (op == Py_NE);
stratakise8b19652017-11-02 11:32:54 +010011269 return PyBool_FromLong(result);
Victor Stinnere5567ad2012-10-23 02:48:49 +020011270 }
11271 else {
Victor Stinner90db9c42012-10-04 21:53:50 +020011272 result = unicode_compare(left, right);
stratakise8b19652017-11-02 11:32:54 +010011273 Py_RETURN_RICHCOMPARE(result, 0, op);
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011274 }
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011275}
11276
Alexander Belopolsky40018472011-02-26 01:02:56 +000011277int
Raymond Hettingerac2ef652015-07-04 16:04:44 -070011278_PyUnicode_EQ(PyObject *aa, PyObject *bb)
11279{
11280 return unicode_eq(aa, bb);
11281}
11282
11283int
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011284PyUnicode_Contains(PyObject *str, PyObject *substr)
Guido van Rossum403d68b2000-03-13 15:55:09 +000011285{
Victor Stinner77282cb2013-04-14 19:22:47 +020011286 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011287 void *buf1, *buf2;
11288 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011289 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011290
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011291 if (!PyUnicode_Check(substr)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011292 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020011293 "'in <string>' requires string as left operand, not %.100s",
11294 Py_TYPE(substr)->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011295 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011296 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011297 if (PyUnicode_READY(substr) == -1)
Thomas Wouters477c8d52006-05-27 19:21:47 +000011298 return -1;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011299 if (ensure_unicode(str) < 0)
11300 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011301
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011302 kind1 = PyUnicode_KIND(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011303 kind2 = PyUnicode_KIND(substr);
11304 if (kind1 < kind2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011305 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011306 len1 = PyUnicode_GET_LENGTH(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011307 len2 = PyUnicode_GET_LENGTH(substr);
11308 if (len1 < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011309 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011310 buf1 = PyUnicode_DATA(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011311 buf2 = PyUnicode_DATA(substr);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011312 if (len2 == 1) {
11313 Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
11314 result = findchar((const char *)buf1, kind1, len1, ch, 1) != -1;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011315 return result;
11316 }
11317 if (kind2 != kind1) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011318 buf2 = _PyUnicode_AsKind(substr, kind1);
11319 if (!buf2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011320 return -1;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011321 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011322
Victor Stinner77282cb2013-04-14 19:22:47 +020011323 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011324 case PyUnicode_1BYTE_KIND:
11325 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
11326 break;
11327 case PyUnicode_2BYTE_KIND:
11328 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
11329 break;
11330 case PyUnicode_4BYTE_KIND:
11331 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
11332 break;
11333 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011334 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011335 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011336
Victor Stinner77282cb2013-04-14 19:22:47 +020011337 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011338 PyMem_Free(buf2);
11339
Guido van Rossum403d68b2000-03-13 15:55:09 +000011340 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011341}
11342
Guido van Rossumd57fd912000-03-10 22:53:23 +000011343/* Concat to string or Unicode object giving a new Unicode object. */
11344
Alexander Belopolsky40018472011-02-26 01:02:56 +000011345PyObject *
11346PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011347{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011348 PyObject *result;
Victor Stinner127226b2011-10-13 01:12:34 +020011349 Py_UCS4 maxchar, maxchar2;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011350 Py_ssize_t left_len, right_len, new_len;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011351
Serhiy Storchaka004e03f2017-03-19 19:38:42 +020011352 if (ensure_unicode(left) < 0)
11353 return NULL;
11354
11355 if (!PyUnicode_Check(right)) {
11356 PyErr_Format(PyExc_TypeError,
11357 "can only concatenate str (not \"%.200s\") to str",
11358 right->ob_type->tp_name);
11359 return NULL;
11360 }
11361 if (PyUnicode_READY(right) < 0)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011362 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011363
11364 /* Shortcuts */
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011365 if (left == unicode_empty)
11366 return PyUnicode_FromObject(right);
11367 if (right == unicode_empty)
11368 return PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011369
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011370 left_len = PyUnicode_GET_LENGTH(left);
11371 right_len = PyUnicode_GET_LENGTH(right);
11372 if (left_len > PY_SSIZE_T_MAX - right_len) {
Victor Stinner488fa492011-12-12 00:01:39 +010011373 PyErr_SetString(PyExc_OverflowError,
11374 "strings are too large to concat");
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011375 return NULL;
Victor Stinner488fa492011-12-12 00:01:39 +010011376 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011377 new_len = left_len + right_len;
Victor Stinner488fa492011-12-12 00:01:39 +010011378
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011379 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11380 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Benjamin Peterson7e303732013-06-10 09:19:46 -070011381 maxchar = Py_MAX(maxchar, maxchar2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011382
Guido van Rossumd57fd912000-03-10 22:53:23 +000011383 /* Concat the two Unicode strings */
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011384 result = PyUnicode_New(new_len, maxchar);
11385 if (result == NULL)
11386 return NULL;
11387 _PyUnicode_FastCopyCharacters(result, 0, left, 0, left_len);
11388 _PyUnicode_FastCopyCharacters(result, left_len, right, 0, right_len);
11389 assert(_PyUnicode_CheckConsistency(result, 1));
11390 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011391}
11392
Walter Dörwald1ab83302007-05-18 17:15:44 +000011393void
Victor Stinner23e56682011-10-03 03:54:37 +020011394PyUnicode_Append(PyObject **p_left, PyObject *right)
Walter Dörwald1ab83302007-05-18 17:15:44 +000011395{
Victor Stinner23e56682011-10-03 03:54:37 +020011396 PyObject *left, *res;
Victor Stinner488fa492011-12-12 00:01:39 +010011397 Py_UCS4 maxchar, maxchar2;
11398 Py_ssize_t left_len, right_len, new_len;
Victor Stinner23e56682011-10-03 03:54:37 +020011399
11400 if (p_left == NULL) {
11401 if (!PyErr_Occurred())
11402 PyErr_BadInternalCall();
Benjamin Peterson14339b62009-01-31 16:36:08 +000011403 return;
11404 }
Victor Stinner23e56682011-10-03 03:54:37 +020011405 left = *p_left;
Victor Stinnerf0335102013-04-14 19:13:03 +020011406 if (right == NULL || left == NULL
11407 || !PyUnicode_Check(left) || !PyUnicode_Check(right)) {
Victor Stinner23e56682011-10-03 03:54:37 +020011408 if (!PyErr_Occurred())
11409 PyErr_BadInternalCall();
11410 goto error;
11411 }
11412
Benjamin Petersonbac79492012-01-14 13:34:47 -050011413 if (PyUnicode_READY(left) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011414 goto error;
Benjamin Petersonbac79492012-01-14 13:34:47 -050011415 if (PyUnicode_READY(right) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011416 goto error;
11417
Victor Stinner488fa492011-12-12 00:01:39 +010011418 /* Shortcuts */
11419 if (left == unicode_empty) {
11420 Py_DECREF(left);
11421 Py_INCREF(right);
11422 *p_left = right;
11423 return;
11424 }
11425 if (right == unicode_empty)
11426 return;
11427
11428 left_len = PyUnicode_GET_LENGTH(left);
11429 right_len = PyUnicode_GET_LENGTH(right);
11430 if (left_len > PY_SSIZE_T_MAX - right_len) {
11431 PyErr_SetString(PyExc_OverflowError,
11432 "strings are too large to concat");
11433 goto error;
11434 }
11435 new_len = left_len + right_len;
11436
11437 if (unicode_modifiable(left)
11438 && PyUnicode_CheckExact(right)
11439 && PyUnicode_KIND(right) <= PyUnicode_KIND(left)
Victor Stinnerb0923652011-10-04 01:17:31 +020011440 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
11441 to change the structure size, but characters are stored just after
Georg Brandl7597add2011-10-05 16:36:47 +020011442 the structure, and so it requires to move all characters which is
Victor Stinnerb0923652011-10-04 01:17:31 +020011443 not so different than duplicating the string. */
Victor Stinner488fa492011-12-12 00:01:39 +010011444 && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
11445 {
11446 /* append inplace */
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011447 if (unicode_resize(p_left, new_len) != 0)
Victor Stinner488fa492011-12-12 00:01:39 +010011448 goto error;
Victor Stinnerf0335102013-04-14 19:13:03 +020011449
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011450 /* copy 'right' into the newly allocated area of 'left' */
11451 _PyUnicode_FastCopyCharacters(*p_left, left_len, right, 0, right_len);
Victor Stinner23e56682011-10-03 03:54:37 +020011452 }
Victor Stinner488fa492011-12-12 00:01:39 +010011453 else {
11454 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11455 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Benjamin Peterson7e303732013-06-10 09:19:46 -070011456 maxchar = Py_MAX(maxchar, maxchar2);
Victor Stinner23e56682011-10-03 03:54:37 +020011457
Victor Stinner488fa492011-12-12 00:01:39 +010011458 /* Concat the two Unicode strings */
11459 res = PyUnicode_New(new_len, maxchar);
11460 if (res == NULL)
11461 goto error;
Victor Stinnerd3f08822012-05-29 12:57:52 +020011462 _PyUnicode_FastCopyCharacters(res, 0, left, 0, left_len);
11463 _PyUnicode_FastCopyCharacters(res, left_len, right, 0, right_len);
Victor Stinner488fa492011-12-12 00:01:39 +010011464 Py_DECREF(left);
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011465 *p_left = res;
Victor Stinner488fa492011-12-12 00:01:39 +010011466 }
11467 assert(_PyUnicode_CheckConsistency(*p_left, 1));
Victor Stinner23e56682011-10-03 03:54:37 +020011468 return;
11469
11470error:
Victor Stinner488fa492011-12-12 00:01:39 +010011471 Py_CLEAR(*p_left);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011472}
11473
11474void
11475PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
11476{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011477 PyUnicode_Append(pleft, right);
11478 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011479}
11480
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011481/*
11482Wraps stringlib_parse_args_finds() and additionally ensures that the
11483first argument is a unicode object.
11484*/
11485
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070011486static inline int
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011487parse_args_finds_unicode(const char * function_name, PyObject *args,
11488 PyObject **substring,
11489 Py_ssize_t *start, Py_ssize_t *end)
11490{
11491 if(stringlib_parse_args_finds(function_name, args, substring,
11492 start, end)) {
11493 if (ensure_unicode(*substring) < 0)
11494 return 0;
11495 return 1;
11496 }
11497 return 0;
11498}
11499
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011500PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011501 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011502\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011503Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011504string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011505interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011506
11507static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011508unicode_count(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011509{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011510 PyObject *substring = NULL; /* initialize to fix a compiler warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +000011511 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011512 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011513 PyObject *result;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011514 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011515 void *buf1, *buf2;
11516 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011517
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011518 if (!parse_args_finds_unicode("count", args, &substring, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000011519 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000011520
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011521 kind1 = PyUnicode_KIND(self);
11522 kind2 = PyUnicode_KIND(substring);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011523 if (kind1 < kind2)
Benjamin Petersonb63f49f2012-05-03 18:31:07 -040011524 return PyLong_FromLong(0);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011525
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011526 len1 = PyUnicode_GET_LENGTH(self);
11527 len2 = PyUnicode_GET_LENGTH(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011528 ADJUST_INDICES(start, end, len1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011529 if (end - start < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011530 return PyLong_FromLong(0);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011531
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011532 buf1 = PyUnicode_DATA(self);
11533 buf2 = PyUnicode_DATA(substring);
11534 if (kind2 != kind1) {
11535 buf2 = _PyUnicode_AsKind(substring, kind1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011536 if (!buf2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011537 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011538 }
11539 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011540 case PyUnicode_1BYTE_KIND:
11541 iresult = ucs1lib_count(
11542 ((Py_UCS1*)buf1) + start, end - start,
11543 buf2, len2, PY_SSIZE_T_MAX
11544 );
11545 break;
11546 case PyUnicode_2BYTE_KIND:
11547 iresult = ucs2lib_count(
11548 ((Py_UCS2*)buf1) + start, end - start,
11549 buf2, len2, PY_SSIZE_T_MAX
11550 );
11551 break;
11552 case PyUnicode_4BYTE_KIND:
11553 iresult = ucs4lib_count(
11554 ((Py_UCS4*)buf1) + start, end - start,
11555 buf2, len2, PY_SSIZE_T_MAX
11556 );
11557 break;
11558 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011559 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011560 }
11561
11562 result = PyLong_FromSsize_t(iresult);
11563
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011564 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011565 PyMem_Free(buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011566
Guido van Rossumd57fd912000-03-10 22:53:23 +000011567 return result;
11568}
11569
INADA Naoki3ae20562017-01-16 20:41:20 +090011570/*[clinic input]
11571str.encode as unicode_encode
11572
11573 encoding: str(c_default="NULL") = 'utf-8'
11574 The encoding in which to encode the string.
11575 errors: str(c_default="NULL") = 'strict'
11576 The error handling scheme to use for encoding errors.
11577 The default is 'strict' meaning that encoding errors raise a
11578 UnicodeEncodeError. Other possible values are 'ignore', 'replace' and
11579 'xmlcharrefreplace' as well as any other name registered with
11580 codecs.register_error that can handle UnicodeEncodeErrors.
11581
11582Encode the string using the codec registered for encoding.
11583[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011584
11585static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090011586unicode_encode_impl(PyObject *self, const char *encoding, const char *errors)
INADA Naoki15f94592017-01-16 21:49:13 +090011587/*[clinic end generated code: output=bf78b6e2a9470e3c input=f0a9eb293d08fe02]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011588{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011589 return PyUnicode_AsEncodedString(self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +000011590}
11591
INADA Naoki3ae20562017-01-16 20:41:20 +090011592/*[clinic input]
11593str.expandtabs as unicode_expandtabs
Guido van Rossumd57fd912000-03-10 22:53:23 +000011594
INADA Naoki3ae20562017-01-16 20:41:20 +090011595 tabsize: int = 8
11596
11597Return a copy where all tab characters are expanded using spaces.
11598
11599If tabsize is not given, a tab size of 8 characters is assumed.
11600[clinic start generated code]*/
11601
11602static PyObject *
11603unicode_expandtabs_impl(PyObject *self, int tabsize)
11604/*[clinic end generated code: output=3457c5dcee26928f input=8a01914034af4c85]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011605{
Antoine Pitroue71d5742011-10-04 15:55:09 +020011606 Py_ssize_t i, j, line_pos, src_len, incr;
11607 Py_UCS4 ch;
11608 PyObject *u;
11609 void *src_data, *dest_data;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011610 int kind;
Antoine Pitroue19aa382011-10-04 16:04:01 +020011611 int found;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011612
Antoine Pitrou22425222011-10-04 19:10:51 +020011613 if (PyUnicode_READY(self) == -1)
11614 return NULL;
11615
Thomas Wouters7e474022000-07-16 12:04:32 +000011616 /* First pass: determine size of output string */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011617 src_len = PyUnicode_GET_LENGTH(self);
11618 i = j = line_pos = 0;
11619 kind = PyUnicode_KIND(self);
11620 src_data = PyUnicode_DATA(self);
Antoine Pitroue19aa382011-10-04 16:04:01 +020011621 found = 0;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011622 for (; i < src_len; i++) {
11623 ch = PyUnicode_READ(kind, src_data, i);
11624 if (ch == '\t') {
Antoine Pitroue19aa382011-10-04 16:04:01 +020011625 found = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000011626 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011627 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
Benjamin Peterson29060642009-01-31 22:14:21 +000011628 if (j > PY_SSIZE_T_MAX - incr)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011629 goto overflow;
11630 line_pos += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011631 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011632 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011633 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011634 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000011635 if (j > PY_SSIZE_T_MAX - 1)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011636 goto overflow;
11637 line_pos++;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011638 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011639 if (ch == '\n' || ch == '\r')
11640 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011641 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011642 }
Victor Stinnerc4b49542011-12-11 22:44:26 +010011643 if (!found)
11644 return unicode_result_unchanged(self);
Guido van Rossumcd16bf62007-06-13 18:07:49 +000011645
Guido van Rossumd57fd912000-03-10 22:53:23 +000011646 /* Second pass: create output string and fill it */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011647 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011648 if (!u)
11649 return NULL;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011650 dest_data = PyUnicode_DATA(u);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011651
Antoine Pitroue71d5742011-10-04 15:55:09 +020011652 i = j = line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011653
Antoine Pitroue71d5742011-10-04 15:55:09 +020011654 for (; i < src_len; i++) {
11655 ch = PyUnicode_READ(kind, src_data, i);
11656 if (ch == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000011657 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011658 incr = tabsize - (line_pos % tabsize);
11659 line_pos += incr;
Victor Stinner59423e32018-11-26 13:40:01 +010011660 unicode_fill(kind, dest_data, ' ', j, incr);
Victor Stinnerda79e632012-02-22 13:37:04 +010011661 j += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011662 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011663 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011664 else {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011665 line_pos++;
11666 PyUnicode_WRITE(kind, dest_data, j, ch);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011667 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011668 if (ch == '\n' || ch == '\r')
11669 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011670 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011671 }
11672 assert (j == PyUnicode_GET_LENGTH(u));
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010011673 return unicode_result(u);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011674
Antoine Pitroue71d5742011-10-04 15:55:09 +020011675 overflow:
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011676 PyErr_SetString(PyExc_OverflowError, "new string is too long");
11677 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011678}
11679
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011680PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011681 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011682\n\
11683Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080011684such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011685arguments start and end are interpreted as in slice notation.\n\
11686\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011687Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011688
11689static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011690unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011691{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011692 /* initialize variables to prevent gcc warning */
11693 PyObject *substring = NULL;
11694 Py_ssize_t start = 0;
11695 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011696 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011697
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011698 if (!parse_args_finds_unicode("find", args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011699 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011700
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011701 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011702 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011703
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011704 result = any_find_slice(self, substring, start, end, 1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011705
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011706 if (result == -2)
11707 return NULL;
11708
Christian Heimes217cfd12007-12-02 14:31:20 +000011709 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011710}
11711
11712static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011713unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011714{
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011715 void *data;
11716 enum PyUnicode_Kind kind;
11717 Py_UCS4 ch;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011718
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +030011719 if (!PyUnicode_Check(self)) {
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011720 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011721 return NULL;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011722 }
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +030011723 if (PyUnicode_READY(self) == -1) {
11724 return NULL;
11725 }
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011726 if (index < 0 || index >= PyUnicode_GET_LENGTH(self)) {
11727 PyErr_SetString(PyExc_IndexError, "string index out of range");
11728 return NULL;
11729 }
11730 kind = PyUnicode_KIND(self);
11731 data = PyUnicode_DATA(self);
11732 ch = PyUnicode_READ(kind, data, index);
Victor Stinner985a82a2014-01-03 12:53:47 +010011733 return unicode_char(ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011734}
11735
Guido van Rossumc2504932007-09-18 19:42:40 +000011736/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +010011737 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +000011738static Py_hash_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011739unicode_hash(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011740{
Gregory P. Smith27cbcd62012-12-10 18:15:46 -080011741 Py_uhash_t x; /* Unsigned for defined overflow behavior. */
Guido van Rossumc2504932007-09-18 19:42:40 +000011742
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011743#ifdef Py_DEBUG
Benjamin Peterson69e97272012-02-21 11:08:50 -050011744 assert(_Py_HashSecret_Initialized);
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011745#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011746 if (_PyUnicode_HASH(self) != -1)
11747 return _PyUnicode_HASH(self);
11748 if (PyUnicode_READY(self) == -1)
11749 return -1;
animalizea1d14252019-01-02 20:16:06 +080011750
Christian Heimes985ecdc2013-11-20 11:46:18 +010011751 x = _Py_HashBytes(PyUnicode_DATA(self),
11752 PyUnicode_GET_LENGTH(self) * PyUnicode_KIND(self));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011753 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +000011754 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011755}
11756
/* Docstring text for S.index(); presumably attached to unicode_index()
   through the type's method table, which is not visible here (confirm). */
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011757PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011758 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011759\n\
oldkaa0735f2018-02-02 16:52:55 +080011760Return the lowest index in S where substring sub is found,\n\
Lisa Roach43ba8862017-04-04 22:36:22 -070011761such that sub is contained within S[start:end]. Optional\n\
11762arguments start and end are interpreted as in slice notation.\n\
11763\n\
11764Raises ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011765
11766static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011767unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011768{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011769 /* initialize variables to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +000011770 Py_ssize_t result;
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011771 PyObject *substring = NULL;
11772 Py_ssize_t start = 0;
11773 Py_ssize_t end = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011774
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011775 if (!parse_args_finds_unicode("index", args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011776 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011777
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011778 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011779 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011780
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011781 result = any_find_slice(self, substring, start, end, 1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011782
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011783 if (result == -2)
11784 return NULL;
11785
Guido van Rossumd57fd912000-03-10 22:53:23 +000011786 if (result < 0) {
11787 PyErr_SetString(PyExc_ValueError, "substring not found");
11788 return NULL;
11789 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011790
Christian Heimes217cfd12007-12-02 14:31:20 +000011791 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011792}
11793
INADA Naoki3ae20562017-01-16 20:41:20 +090011794/*[clinic input]
INADA Naokia49ac992018-01-27 14:06:21 +090011795str.isascii as unicode_isascii
11796
11797Return True if all characters in the string are ASCII, False otherwise.
11798
11799ASCII characters have code points in the range U+0000-U+007F.
11800Empty string is ASCII too.
11801[clinic start generated code]*/
11802
11803static PyObject *
11804unicode_isascii_impl(PyObject *self)
11805/*[clinic end generated code: output=c5910d64b5a8003f input=5a43cbc6399621d5]*/
11806{
11807 if (PyUnicode_READY(self) == -1) {
11808 return NULL;
11809 }
11810 return PyBool_FromLong(PyUnicode_IS_ASCII(self));
11811}
11812
11813/*[clinic input]
INADA Naoki3ae20562017-01-16 20:41:20 +090011814str.islower as unicode_islower
Guido van Rossumd57fd912000-03-10 22:53:23 +000011815
INADA Naoki3ae20562017-01-16 20:41:20 +090011816Return True if the string is a lowercase string, False otherwise.
11817
11818A string is lowercase if all cased characters in the string are lowercase and
11819there is at least one cased character in the string.
11820[clinic start generated code]*/
11821
11822static PyObject *
11823unicode_islower_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011824/*[clinic end generated code: output=dbd41995bd005b81 input=acec65ac6821ae47]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011825{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011826 Py_ssize_t i, length;
11827 int kind;
11828 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011829 int cased;
11830
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011831 if (PyUnicode_READY(self) == -1)
11832 return NULL;
11833 length = PyUnicode_GET_LENGTH(self);
11834 kind = PyUnicode_KIND(self);
11835 data = PyUnicode_DATA(self);
11836
Guido van Rossumd57fd912000-03-10 22:53:23 +000011837 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011838 if (length == 1)
11839 return PyBool_FromLong(
11840 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011841
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011842 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011843 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011844 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011845
Guido van Rossumd57fd912000-03-10 22:53:23 +000011846 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011847 for (i = 0; i < length; i++) {
11848 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011849
Benjamin Peterson29060642009-01-31 22:14:21 +000011850 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011851 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000011852 else if (!cased && Py_UNICODE_ISLOWER(ch))
11853 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011854 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011855 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011856}
11857
INADA Naoki3ae20562017-01-16 20:41:20 +090011858/*[clinic input]
11859str.isupper as unicode_isupper
Guido van Rossumd57fd912000-03-10 22:53:23 +000011860
INADA Naoki3ae20562017-01-16 20:41:20 +090011861Return True if the string is an uppercase string, False otherwise.
11862
11863A string is uppercase if all cased characters in the string are uppercase and
11864there is at least one cased character in the string.
11865[clinic start generated code]*/
11866
11867static PyObject *
11868unicode_isupper_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011869/*[clinic end generated code: output=049209c8e7f15f59 input=e9b1feda5d17f2d3]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011870{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011871 Py_ssize_t i, length;
11872 int kind;
11873 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011874 int cased;
11875
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011876 if (PyUnicode_READY(self) == -1)
11877 return NULL;
11878 length = PyUnicode_GET_LENGTH(self);
11879 kind = PyUnicode_KIND(self);
11880 data = PyUnicode_DATA(self);
11881
Guido van Rossumd57fd912000-03-10 22:53:23 +000011882 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011883 if (length == 1)
11884 return PyBool_FromLong(
11885 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011886
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011887 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011888 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011889 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011890
Guido van Rossumd57fd912000-03-10 22:53:23 +000011891 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011892 for (i = 0; i < length; i++) {
11893 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011894
Benjamin Peterson29060642009-01-31 22:14:21 +000011895 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011896 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000011897 else if (!cased && Py_UNICODE_ISUPPER(ch))
11898 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011899 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011900 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011901}
11902
INADA Naoki3ae20562017-01-16 20:41:20 +090011903/*[clinic input]
11904str.istitle as unicode_istitle
Guido van Rossumd57fd912000-03-10 22:53:23 +000011905
INADA Naoki3ae20562017-01-16 20:41:20 +090011906Return True if the string is a title-cased string, False otherwise.
11907
11908In a title-cased string, upper- and title-case characters may only
11909follow uncased characters and lowercase characters only cased ones.
11910[clinic start generated code]*/
11911
11912static PyObject *
11913unicode_istitle_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011914/*[clinic end generated code: output=e9bf6eb91f5d3f0e input=98d32bd2e1f06f8c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011915{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011916 Py_ssize_t i, length;
11917 int kind;
11918 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011919 int cased, previous_is_cased;
11920
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011921 if (PyUnicode_READY(self) == -1)
11922 return NULL;
11923 length = PyUnicode_GET_LENGTH(self);
11924 kind = PyUnicode_KIND(self);
11925 data = PyUnicode_DATA(self);
11926
Guido van Rossumd57fd912000-03-10 22:53:23 +000011927 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011928 if (length == 1) {
11929 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11930 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
11931 (Py_UNICODE_ISUPPER(ch) != 0));
11932 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011933
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011934 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011935 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011936 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011937
Guido van Rossumd57fd912000-03-10 22:53:23 +000011938 cased = 0;
11939 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011940 for (i = 0; i < length; i++) {
11941 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011942
Benjamin Peterson29060642009-01-31 22:14:21 +000011943 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
11944 if (previous_is_cased)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011945 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000011946 previous_is_cased = 1;
11947 cased = 1;
11948 }
11949 else if (Py_UNICODE_ISLOWER(ch)) {
11950 if (!previous_is_cased)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011951 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000011952 previous_is_cased = 1;
11953 cased = 1;
11954 }
11955 else
11956 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011957 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011958 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011959}
11960
INADA Naoki3ae20562017-01-16 20:41:20 +090011961/*[clinic input]
11962str.isspace as unicode_isspace
Guido van Rossumd57fd912000-03-10 22:53:23 +000011963
INADA Naoki3ae20562017-01-16 20:41:20 +090011964Return True if the string is a whitespace string, False otherwise.
11965
11966A string is whitespace if all characters in the string are whitespace and there
11967is at least one character in the string.
11968[clinic start generated code]*/
11969
11970static PyObject *
11971unicode_isspace_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011972/*[clinic end generated code: output=163a63bfa08ac2b9 input=fe462cb74f8437d8]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011973{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011974 Py_ssize_t i, length;
11975 int kind;
11976 void *data;
11977
11978 if (PyUnicode_READY(self) == -1)
11979 return NULL;
11980 length = PyUnicode_GET_LENGTH(self);
11981 kind = PyUnicode_KIND(self);
11982 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011983
Guido van Rossumd57fd912000-03-10 22:53:23 +000011984 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011985 if (length == 1)
11986 return PyBool_FromLong(
11987 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011988
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011989 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011990 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011991 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011992
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011993 for (i = 0; i < length; i++) {
11994 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011995 if (!Py_UNICODE_ISSPACE(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011996 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011997 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011998 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011999}
12000
INADA Naoki3ae20562017-01-16 20:41:20 +090012001/*[clinic input]
12002str.isalpha as unicode_isalpha
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012003
INADA Naoki3ae20562017-01-16 20:41:20 +090012004Return True if the string is an alphabetic string, False otherwise.
12005
12006A string is alphabetic if all characters in the string are alphabetic and there
12007is at least one character in the string.
12008[clinic start generated code]*/
12009
12010static PyObject *
12011unicode_isalpha_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012012/*[clinic end generated code: output=cc81b9ac3883ec4f input=d0fd18a96cbca5eb]*/
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012013{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012014 Py_ssize_t i, length;
12015 int kind;
12016 void *data;
12017
12018 if (PyUnicode_READY(self) == -1)
12019 return NULL;
12020 length = PyUnicode_GET_LENGTH(self);
12021 kind = PyUnicode_KIND(self);
12022 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012023
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012024 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012025 if (length == 1)
12026 return PyBool_FromLong(
12027 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012028
12029 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012030 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012031 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012032
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012033 for (i = 0; i < length; i++) {
12034 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012035 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012036 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012037 Py_RETURN_TRUE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012038}
12039
INADA Naoki3ae20562017-01-16 20:41:20 +090012040/*[clinic input]
12041str.isalnum as unicode_isalnum
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012042
INADA Naoki3ae20562017-01-16 20:41:20 +090012043Return True if the string is an alpha-numeric string, False otherwise.
12044
12045A string is alpha-numeric if all characters in the string are alpha-numeric and
12046there is at least one character in the string.
12047[clinic start generated code]*/
12048
12049static PyObject *
12050unicode_isalnum_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012051/*[clinic end generated code: output=a5a23490ffc3660c input=5c6579bf2e04758c]*/
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012052{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012053 int kind;
12054 void *data;
12055 Py_ssize_t len, i;
12056
12057 if (PyUnicode_READY(self) == -1)
12058 return NULL;
12059
12060 kind = PyUnicode_KIND(self);
12061 data = PyUnicode_DATA(self);
12062 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012063
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012064 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012065 if (len == 1) {
12066 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
12067 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
12068 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012069
12070 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012071 if (len == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012072 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012073
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012074 for (i = 0; i < len; i++) {
12075 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030012076 if (!Py_UNICODE_ISALNUM(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012077 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012078 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012079 Py_RETURN_TRUE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012080}
12081
INADA Naoki3ae20562017-01-16 20:41:20 +090012082/*[clinic input]
12083str.isdecimal as unicode_isdecimal
Guido van Rossumd57fd912000-03-10 22:53:23 +000012084
INADA Naoki3ae20562017-01-16 20:41:20 +090012085Return True if the string is a decimal string, False otherwise.
12086
12087A string is a decimal string if all characters in the string are decimal and
12088there is at least one character in the string.
12089[clinic start generated code]*/
12090
12091static PyObject *
12092unicode_isdecimal_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012093/*[clinic end generated code: output=fb2dcdb62d3fc548 input=336bc97ab4c8268f]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012094{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012095 Py_ssize_t i, length;
12096 int kind;
12097 void *data;
12098
12099 if (PyUnicode_READY(self) == -1)
12100 return NULL;
12101 length = PyUnicode_GET_LENGTH(self);
12102 kind = PyUnicode_KIND(self);
12103 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012104
Guido van Rossumd57fd912000-03-10 22:53:23 +000012105 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012106 if (length == 1)
12107 return PyBool_FromLong(
12108 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012109
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012110 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012111 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012112 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012113
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012114 for (i = 0; i < length; i++) {
12115 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012116 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012117 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012118 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012119}
12120
INADA Naoki3ae20562017-01-16 20:41:20 +090012121/*[clinic input]
12122str.isdigit as unicode_isdigit
Guido van Rossumd57fd912000-03-10 22:53:23 +000012123
INADA Naoki3ae20562017-01-16 20:41:20 +090012124Return True if the string is a digit string, False otherwise.
12125
12126A string is a digit string if all characters in the string are digits and there
12127is at least one character in the string.
12128[clinic start generated code]*/
12129
12130static PyObject *
12131unicode_isdigit_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012132/*[clinic end generated code: output=10a6985311da6858 input=901116c31deeea4c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012133{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012134 Py_ssize_t i, length;
12135 int kind;
12136 void *data;
12137
12138 if (PyUnicode_READY(self) == -1)
12139 return NULL;
12140 length = PyUnicode_GET_LENGTH(self);
12141 kind = PyUnicode_KIND(self);
12142 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012143
Guido van Rossumd57fd912000-03-10 22:53:23 +000012144 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012145 if (length == 1) {
12146 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
12147 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
12148 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012149
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012150 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012151 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012152 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012153
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012154 for (i = 0; i < length; i++) {
12155 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012156 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012157 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012158 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012159}
12160
INADA Naoki3ae20562017-01-16 20:41:20 +090012161/*[clinic input]
12162str.isnumeric as unicode_isnumeric
Guido van Rossumd57fd912000-03-10 22:53:23 +000012163
INADA Naoki3ae20562017-01-16 20:41:20 +090012164Return True if the string is a numeric string, False otherwise.
12165
12166A string is numeric if all characters in the string are numeric and there is at
12167least one character in the string.
12168[clinic start generated code]*/
12169
12170static PyObject *
12171unicode_isnumeric_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012172/*[clinic end generated code: output=9172a32d9013051a input=722507db976f826c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012173{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012174 Py_ssize_t i, length;
12175 int kind;
12176 void *data;
12177
12178 if (PyUnicode_READY(self) == -1)
12179 return NULL;
12180 length = PyUnicode_GET_LENGTH(self);
12181 kind = PyUnicode_KIND(self);
12182 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012183
Guido van Rossumd57fd912000-03-10 22:53:23 +000012184 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012185 if (length == 1)
12186 return PyBool_FromLong(
12187 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012188
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012189 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012190 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012191 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012192
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012193 for (i = 0; i < length; i++) {
12194 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012195 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012196 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012197 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012198}
12199
Martin v. Löwis47383402007-08-15 07:32:56 +000012200int
12201PyUnicode_IsIdentifier(PyObject *self)
12202{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012203 int kind;
12204 void *data;
12205 Py_ssize_t i;
Ezio Melotti93e7afc2011-08-22 14:08:38 +030012206 Py_UCS4 first;
Martin v. Löwis47383402007-08-15 07:32:56 +000012207
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012208 if (PyUnicode_READY(self) == -1) {
12209 Py_FatalError("identifier not ready");
Benjamin Peterson29060642009-01-31 22:14:21 +000012210 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012211 }
12212
12213 /* Special case for empty strings */
12214 if (PyUnicode_GET_LENGTH(self) == 0)
12215 return 0;
12216 kind = PyUnicode_KIND(self);
12217 data = PyUnicode_DATA(self);
Martin v. Löwis47383402007-08-15 07:32:56 +000012218
12219 /* PEP 3131 says that the first character must be in
12220 XID_Start and subsequent characters in XID_Continue,
12221 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000012222 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000012223 letters, digits, underscore). However, given the current
12224 definition of XID_Start and XID_Continue, it is sufficient
12225 to check just for these, except that _ must be allowed
12226 as starting an identifier. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012227 first = PyUnicode_READ(kind, data, 0);
Benjamin Petersonf413b802011-08-12 22:17:18 -050012228 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
Martin v. Löwis47383402007-08-15 07:32:56 +000012229 return 0;
12230
Benjamin Peterson9c6e6a02011-09-28 08:09:05 -040012231 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012232 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000012233 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +000012234 return 1;
12235}
12236
INADA Naoki3ae20562017-01-16 20:41:20 +090012237/*[clinic input]
12238str.isidentifier as unicode_isidentifier
Martin v. Löwis47383402007-08-15 07:32:56 +000012239
INADA Naoki3ae20562017-01-16 20:41:20 +090012240Return True if the string is a valid Python identifier, False otherwise.
12241
Sanyam Khuranaffc5a142018-10-08 12:23:32 +053012242Call keyword.iskeyword(s) to test whether string s is a reserved identifier,
Emanuele Gaifasfc8205c2018-10-08 12:44:47 +020012243such as "def" or "class".
INADA Naoki3ae20562017-01-16 20:41:20 +090012244[clinic start generated code]*/
12245
12246static PyObject *
12247unicode_isidentifier_impl(PyObject *self)
Emanuele Gaifasfc8205c2018-10-08 12:44:47 +020012248/*[clinic end generated code: output=fe585a9666572905 input=2d807a104f21c0c5]*/
Martin v. Löwis47383402007-08-15 07:32:56 +000012249{
12250 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
12251}
12252
INADA Naoki3ae20562017-01-16 20:41:20 +090012253/*[clinic input]
12254str.isprintable as unicode_isprintable
Georg Brandl559e5d72008-06-11 18:37:52 +000012255
INADA Naoki3ae20562017-01-16 20:41:20 +090012256Return True if the string is printable, False otherwise.
12257
12258A string is printable if all of its characters are considered printable in
12259repr() or if it is empty.
12260[clinic start generated code]*/
12261
12262static PyObject *
12263unicode_isprintable_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012264/*[clinic end generated code: output=3ab9626cd32dd1a0 input=98a0e1c2c1813209]*/
Georg Brandl559e5d72008-06-11 18:37:52 +000012265{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012266 Py_ssize_t i, length;
12267 int kind;
12268 void *data;
12269
12270 if (PyUnicode_READY(self) == -1)
12271 return NULL;
12272 length = PyUnicode_GET_LENGTH(self);
12273 kind = PyUnicode_KIND(self);
12274 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000012275
12276 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012277 if (length == 1)
12278 return PyBool_FromLong(
12279 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000012280
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012281 for (i = 0; i < length; i++) {
12282 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000012283 Py_RETURN_FALSE;
12284 }
12285 }
12286 Py_RETURN_TRUE;
12287}
12288
INADA Naoki3ae20562017-01-16 20:41:20 +090012289/*[clinic input]
12290str.join as unicode_join
Guido van Rossumd57fd912000-03-10 22:53:23 +000012291
INADA Naoki3ae20562017-01-16 20:41:20 +090012292 iterable: object
12293 /
12294
12295Concatenate any number of strings.
12296
Martin Panter91a88662017-01-24 00:30:06 +000012297The string whose method is called is inserted in between each given string.
INADA Naoki3ae20562017-01-16 20:41:20 +090012298The result is returned as a new string.
12299
12300Example: '.'.join(['ab', 'pq', 'rs']) -> 'ab.pq.rs'
12301[clinic start generated code]*/
12302
12303static PyObject *
12304unicode_join(PyObject *self, PyObject *iterable)
Martin Panter91a88662017-01-24 00:30:06 +000012305/*[clinic end generated code: output=6857e7cecfe7bf98 input=2f70422bfb8fa189]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012306{
INADA Naoki3ae20562017-01-16 20:41:20 +090012307 return PyUnicode_Join(self, iterable);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012308}
12309
Martin v. Löwis18e16552006-02-15 17:27:45 +000012310static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012311unicode_length(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012312{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012313 if (PyUnicode_READY(self) == -1)
12314 return -1;
12315 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012316}
12317
INADA Naoki3ae20562017-01-16 20:41:20 +090012318/*[clinic input]
12319str.ljust as unicode_ljust
12320
12321 width: Py_ssize_t
12322 fillchar: Py_UCS4 = ' '
12323 /
12324
12325Return a left-justified string of length width.
12326
12327Padding is done using the specified fill character (default is a space).
12328[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012329
12330static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012331unicode_ljust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
12332/*[clinic end generated code: output=1cce0e0e0a0b84b3 input=3ab599e335e60a32]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012333{
Benjamin Petersonbac79492012-01-14 13:34:47 -050012334 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012335 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012336
Victor Stinnerc4b49542011-12-11 22:44:26 +010012337 if (PyUnicode_GET_LENGTH(self) >= width)
12338 return unicode_result_unchanged(self);
12339
12340 return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012341}
12342
INADA Naoki3ae20562017-01-16 20:41:20 +090012343/*[clinic input]
12344str.lower as unicode_lower
Guido van Rossumd57fd912000-03-10 22:53:23 +000012345
INADA Naoki3ae20562017-01-16 20:41:20 +090012346Return a copy of the string converted to lowercase.
12347[clinic start generated code]*/
12348
12349static PyObject *
12350unicode_lower_impl(PyObject *self)
12351/*[clinic end generated code: output=84ef9ed42efad663 input=60a2984b8beff23a]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012352{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050012353 if (PyUnicode_READY(self) == -1)
12354 return NULL;
12355 if (PyUnicode_IS_ASCII(self))
12356 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010012357 return case_operation(self, do_lower);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012358}
12359
/* Selectors telling the strip helpers which end(s) of the string to trim.
   The values double as indexes into stripfuncnames[]. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Arrays indexed by above */
/* Both the strings and the pointers are const, so the whole table is
   read-only. */
static const char * const stripfuncnames[] = {"lstrip", "rstrip", "strip"};

/* Method name matching a strip selector; used when building error messages. */
#define STRIPNAME(i) (stripfuncnames[i])
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012368
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012369/* externally visible for str.strip(unicode) */
12370PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012371_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012372{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012373 void *data;
12374 int kind;
12375 Py_ssize_t i, j, len;
12376 BLOOM_MASK sepmask;
Victor Stinnerb3a60142013-04-09 22:19:21 +020012377 Py_ssize_t seplen;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012378
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012379 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
12380 return NULL;
12381
12382 kind = PyUnicode_KIND(self);
12383 data = PyUnicode_DATA(self);
12384 len = PyUnicode_GET_LENGTH(self);
Victor Stinnerb3a60142013-04-09 22:19:21 +020012385 seplen = PyUnicode_GET_LENGTH(sepobj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012386 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
12387 PyUnicode_DATA(sepobj),
Victor Stinnerb3a60142013-04-09 22:19:21 +020012388 seplen);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012389
Benjamin Peterson14339b62009-01-31 16:36:08 +000012390 i = 0;
12391 if (striptype != RIGHTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020012392 while (i < len) {
12393 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12394 if (!BLOOM(sepmask, ch))
12395 break;
12396 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12397 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000012398 i++;
12399 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012400 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012401
Benjamin Peterson14339b62009-01-31 16:36:08 +000012402 j = len;
12403 if (striptype != LEFTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020012404 j--;
12405 while (j >= i) {
12406 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12407 if (!BLOOM(sepmask, ch))
12408 break;
12409 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12410 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000012411 j--;
Victor Stinnerb3a60142013-04-09 22:19:21 +020012412 }
12413
Benjamin Peterson29060642009-01-31 22:14:21 +000012414 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012415 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012416
Victor Stinner7931d9a2011-11-04 00:22:48 +010012417 return PyUnicode_Substring(self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012418}
12419
12420PyObject*
12421PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
12422{
12423 unsigned char *data;
12424 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020012425 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012426
Victor Stinnerde636f32011-10-01 03:55:54 +020012427 if (PyUnicode_READY(self) == -1)
12428 return NULL;
12429
Victor Stinner684d5fd2012-05-03 02:32:34 +020012430 length = PyUnicode_GET_LENGTH(self);
12431 end = Py_MIN(end, length);
Victor Stinnerde636f32011-10-01 03:55:54 +020012432
Victor Stinner684d5fd2012-05-03 02:32:34 +020012433 if (start == 0 && end == length)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012434 return unicode_result_unchanged(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012435
Victor Stinnerde636f32011-10-01 03:55:54 +020012436 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020012437 PyErr_SetString(PyExc_IndexError, "string index out of range");
12438 return NULL;
12439 }
Serhiy Storchaka678db842013-01-26 12:16:36 +020012440 if (start >= length || end < start)
12441 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner12bab6d2011-10-01 01:53:49 +020012442
Victor Stinner684d5fd2012-05-03 02:32:34 +020012443 length = end - start;
Victor Stinnerb9275c12011-10-05 14:01:42 +020012444 if (PyUnicode_IS_ASCII(self)) {
Victor Stinnerb9275c12011-10-05 14:01:42 +020012445 data = PyUnicode_1BYTE_DATA(self);
Victor Stinnerd3f08822012-05-29 12:57:52 +020012446 return _PyUnicode_FromASCII((char*)(data + start), length);
Victor Stinnerb9275c12011-10-05 14:01:42 +020012447 }
12448 else {
12449 kind = PyUnicode_KIND(self);
12450 data = PyUnicode_1BYTE_DATA(self);
12451 return PyUnicode_FromKindAndData(kind,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012452 data + kind * start,
Victor Stinnerb9275c12011-10-05 14:01:42 +020012453 length);
12454 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012455}
Guido van Rossumd57fd912000-03-10 22:53:23 +000012456
12457static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012458do_strip(PyObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012459{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012460 Py_ssize_t len, i, j;
12461
12462 if (PyUnicode_READY(self) == -1)
12463 return NULL;
12464
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012465 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012466
Victor Stinnercc7af722013-04-09 22:39:24 +020012467 if (PyUnicode_IS_ASCII(self)) {
12468 Py_UCS1 *data = PyUnicode_1BYTE_DATA(self);
12469
12470 i = 0;
12471 if (striptype != RIGHTSTRIP) {
12472 while (i < len) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020012473 Py_UCS1 ch = data[i];
Victor Stinnercc7af722013-04-09 22:39:24 +020012474 if (!_Py_ascii_whitespace[ch])
12475 break;
12476 i++;
12477 }
12478 }
12479
12480 j = len;
12481 if (striptype != LEFTSTRIP) {
12482 j--;
12483 while (j >= i) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020012484 Py_UCS1 ch = data[j];
Victor Stinnercc7af722013-04-09 22:39:24 +020012485 if (!_Py_ascii_whitespace[ch])
12486 break;
12487 j--;
12488 }
12489 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012490 }
12491 }
Victor Stinnercc7af722013-04-09 22:39:24 +020012492 else {
12493 int kind = PyUnicode_KIND(self);
12494 void *data = PyUnicode_DATA(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012495
Victor Stinnercc7af722013-04-09 22:39:24 +020012496 i = 0;
12497 if (striptype != RIGHTSTRIP) {
12498 while (i < len) {
12499 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12500 if (!Py_UNICODE_ISSPACE(ch))
12501 break;
12502 i++;
12503 }
Victor Stinner9c79e412013-04-09 22:21:08 +020012504 }
Victor Stinnercc7af722013-04-09 22:39:24 +020012505
12506 j = len;
12507 if (striptype != LEFTSTRIP) {
12508 j--;
12509 while (j >= i) {
12510 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12511 if (!Py_UNICODE_ISSPACE(ch))
12512 break;
12513 j--;
12514 }
12515 j++;
12516 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012517 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012518
Victor Stinner7931d9a2011-11-04 00:22:48 +010012519 return PyUnicode_Substring(self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012520}
12521
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012522
12523static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012524do_argstrip(PyObject *self, int striptype, PyObject *sep)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012525{
Serhiy Storchaka279f4462019-09-14 12:24:05 +030012526 if (sep != Py_None) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012527 if (PyUnicode_Check(sep))
12528 return _PyUnicode_XStrip(self, striptype, sep);
12529 else {
12530 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000012531 "%s arg must be None or str",
12532 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000012533 return NULL;
12534 }
12535 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012536
Benjamin Peterson14339b62009-01-31 16:36:08 +000012537 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012538}
12539
12540
INADA Naoki3ae20562017-01-16 20:41:20 +090012541/*[clinic input]
12542str.strip as unicode_strip
12543
12544 chars: object = None
12545 /
12546
Zachary Ware09895c22019-10-09 16:09:00 -050012547Return a copy of the string with leading and trailing whitespace removed.
INADA Naoki3ae20562017-01-16 20:41:20 +090012548
12549If chars is given and not None, remove characters in chars instead.
12550[clinic start generated code]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012551
12552static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012553unicode_strip_impl(PyObject *self, PyObject *chars)
Zachary Ware09895c22019-10-09 16:09:00 -050012554/*[clinic end generated code: output=ca19018454345d57 input=385289c6f423b954]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012555{
INADA Naoki3ae20562017-01-16 20:41:20 +090012556 return do_argstrip(self, BOTHSTRIP, chars);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012557}
12558
12559
INADA Naoki3ae20562017-01-16 20:41:20 +090012560/*[clinic input]
12561str.lstrip as unicode_lstrip
12562
Serhiy Storchaka279f4462019-09-14 12:24:05 +030012563 chars: object = None
INADA Naoki3ae20562017-01-16 20:41:20 +090012564 /
12565
12566Return a copy of the string with leading whitespace removed.
12567
12568If chars is given and not None, remove characters in chars instead.
12569[clinic start generated code]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012570
12571static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012572unicode_lstrip_impl(PyObject *self, PyObject *chars)
Serhiy Storchaka279f4462019-09-14 12:24:05 +030012573/*[clinic end generated code: output=3b43683251f79ca7 input=529f9f3834448671]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012574{
INADA Naoki3ae20562017-01-16 20:41:20 +090012575 return do_argstrip(self, LEFTSTRIP, chars);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012576}
12577
12578
INADA Naoki3ae20562017-01-16 20:41:20 +090012579/*[clinic input]
12580str.rstrip as unicode_rstrip
12581
Serhiy Storchaka279f4462019-09-14 12:24:05 +030012582 chars: object = None
INADA Naoki3ae20562017-01-16 20:41:20 +090012583 /
12584
12585Return a copy of the string with trailing whitespace removed.
12586
12587If chars is given and not None, remove characters in chars instead.
12588[clinic start generated code]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012589
12590static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012591unicode_rstrip_impl(PyObject *self, PyObject *chars)
Serhiy Storchaka279f4462019-09-14 12:24:05 +030012592/*[clinic end generated code: output=4a59230017cc3b7a input=62566c627916557f]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012593{
INADA Naoki3ae20562017-01-16 20:41:20 +090012594 return do_argstrip(self, RIGHTSTRIP, chars);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012595}
12596
12597
Guido van Rossumd57fd912000-03-10 22:53:23 +000012598static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012599unicode_repeat(PyObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012600{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012601 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012602 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012603
Serhiy Storchaka05997252013-01-26 12:14:02 +020012604 if (len < 1)
12605 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +000012606
Victor Stinnerc4b49542011-12-11 22:44:26 +010012607 /* no repeat, return original string */
12608 if (len == 1)
12609 return unicode_result_unchanged(str);
Tim Peters8f422462000-09-09 06:13:41 +000012610
Benjamin Petersonbac79492012-01-14 13:34:47 -050012611 if (PyUnicode_READY(str) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012612 return NULL;
12613
Victor Stinnerc759f3e2011-10-01 03:09:58 +020012614 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020012615 PyErr_SetString(PyExc_OverflowError,
12616 "repeated string is too long");
12617 return NULL;
12618 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012619 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012620
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012621 u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012622 if (!u)
12623 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012624 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012625
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012626 if (PyUnicode_GET_LENGTH(str) == 1) {
12627 const int kind = PyUnicode_KIND(str);
12628 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
Victor Stinner73f53b52011-12-18 03:26:31 +010012629 if (kind == PyUnicode_1BYTE_KIND) {
12630 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012631 memset(to, (unsigned char)fill_char, len);
Victor Stinner73f53b52011-12-18 03:26:31 +010012632 }
12633 else if (kind == PyUnicode_2BYTE_KIND) {
12634 Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012635 for (n = 0; n < len; ++n)
Victor Stinner73f53b52011-12-18 03:26:31 +010012636 ucs2[n] = fill_char;
12637 } else {
12638 Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u);
12639 assert(kind == PyUnicode_4BYTE_KIND);
12640 for (n = 0; n < len; ++n)
12641 ucs4[n] = fill_char;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012642 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012643 }
12644 else {
12645 /* number of characters copied this far */
12646 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012647 const Py_ssize_t char_size = PyUnicode_KIND(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012648 char *to = (char *) PyUnicode_DATA(u);
Christian Heimesf051e432016-09-13 20:22:02 +020012649 memcpy(to, PyUnicode_DATA(str),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012650 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000012651 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012652 n = (done <= nchars-done) ? done : nchars-done;
Christian Heimesf051e432016-09-13 20:22:02 +020012653 memcpy(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012654 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000012655 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012656 }
12657
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012658 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012659 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012660}
12661
Alexander Belopolsky40018472011-02-26 01:02:56 +000012662PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012663PyUnicode_Replace(PyObject *str,
12664 PyObject *substr,
12665 PyObject *replstr,
Alexander Belopolsky40018472011-02-26 01:02:56 +000012666 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012667{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012668 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0 ||
12669 ensure_unicode(replstr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012670 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012671 return replace(str, substr, replstr, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012672}
12673
INADA Naoki3ae20562017-01-16 20:41:20 +090012674/*[clinic input]
12675str.replace as unicode_replace
Guido van Rossumd57fd912000-03-10 22:53:23 +000012676
INADA Naoki3ae20562017-01-16 20:41:20 +090012677 old: unicode
12678 new: unicode
12679 count: Py_ssize_t = -1
12680 Maximum number of occurrences to replace.
12681 -1 (the default value) means replace all occurrences.
12682 /
12683
12684Return a copy with all occurrences of substring old replaced by new.
12685
12686If the optional argument count is given, only the first count occurrences are
12687replaced.
12688[clinic start generated code]*/
12689
12690static PyObject *
12691unicode_replace_impl(PyObject *self, PyObject *old, PyObject *new,
12692 Py_ssize_t count)
12693/*[clinic end generated code: output=b63f1a8b5eebf448 input=147d12206276ebeb]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012694{
Benjamin Peterson22a29702012-01-02 09:00:30 -060012695 if (PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000012696 return NULL;
INADA Naoki3ae20562017-01-16 20:41:20 +090012697 return replace(self, old, new, count);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012698}
12699
Alexander Belopolsky40018472011-02-26 01:02:56 +000012700static PyObject *
12701unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012702{
Walter Dörwald79e913e2007-05-12 11:08:06 +000012703 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012704 Py_ssize_t isize;
12705 Py_ssize_t osize, squote, dquote, i, o;
12706 Py_UCS4 max, quote;
Victor Stinner55c08782013-04-14 18:45:39 +020012707 int ikind, okind, unchanged;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012708 void *idata, *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000012709
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012710 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000012711 return NULL;
12712
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012713 isize = PyUnicode_GET_LENGTH(unicode);
12714 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012715
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012716 /* Compute length of output, quote characters, and
12717 maximum character */
Victor Stinner55c08782013-04-14 18:45:39 +020012718 osize = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012719 max = 127;
12720 squote = dquote = 0;
12721 ikind = PyUnicode_KIND(unicode);
12722 for (i = 0; i < isize; i++) {
12723 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Benjamin Peterson736b8012014-09-29 23:02:15 -040012724 Py_ssize_t incr = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012725 switch (ch) {
Benjamin Peterson736b8012014-09-29 23:02:15 -040012726 case '\'': squote++; break;
12727 case '"': dquote++; break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012728 case '\\': case '\t': case '\r': case '\n':
Benjamin Peterson736b8012014-09-29 23:02:15 -040012729 incr = 2;
12730 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012731 default:
12732 /* Fast-path ASCII */
12733 if (ch < ' ' || ch == 0x7f)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012734 incr = 4; /* \xHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012735 else if (ch < 0x7f)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012736 ;
12737 else if (Py_UNICODE_ISPRINTABLE(ch))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012738 max = ch > max ? ch : max;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012739 else if (ch < 0x100)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012740 incr = 4; /* \xHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012741 else if (ch < 0x10000)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012742 incr = 6; /* \uHHHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012743 else
Benjamin Peterson736b8012014-09-29 23:02:15 -040012744 incr = 10; /* \uHHHHHHHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012745 }
Benjamin Peterson736b8012014-09-29 23:02:15 -040012746 if (osize > PY_SSIZE_T_MAX - incr) {
12747 PyErr_SetString(PyExc_OverflowError,
12748 "string is too long to generate repr");
12749 return NULL;
12750 }
12751 osize += incr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012752 }
12753
12754 quote = '\'';
Victor Stinner55c08782013-04-14 18:45:39 +020012755 unchanged = (osize == isize);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012756 if (squote) {
Victor Stinner55c08782013-04-14 18:45:39 +020012757 unchanged = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012758 if (dquote)
12759 /* Both squote and dquote present. Use squote,
12760 and escape them */
12761 osize += squote;
12762 else
12763 quote = '"';
12764 }
Victor Stinner55c08782013-04-14 18:45:39 +020012765 osize += 2; /* quotes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012766
12767 repr = PyUnicode_New(osize, max);
12768 if (repr == NULL)
12769 return NULL;
12770 okind = PyUnicode_KIND(repr);
12771 odata = PyUnicode_DATA(repr);
12772
12773 PyUnicode_WRITE(okind, odata, 0, quote);
12774 PyUnicode_WRITE(okind, odata, osize-1, quote);
Victor Stinner55c08782013-04-14 18:45:39 +020012775 if (unchanged) {
12776 _PyUnicode_FastCopyCharacters(repr, 1,
12777 unicode, 0,
12778 isize);
12779 }
12780 else {
12781 for (i = 0, o = 1; i < isize; i++) {
12782 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012783
Victor Stinner55c08782013-04-14 18:45:39 +020012784 /* Escape quotes and backslashes */
12785 if ((ch == quote) || (ch == '\\')) {
Kristján Valur Jónsson55e5dc82012-06-06 21:58:08 +000012786 PyUnicode_WRITE(okind, odata, o++, '\\');
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012787 PyUnicode_WRITE(okind, odata, o++, ch);
Victor Stinner55c08782013-04-14 18:45:39 +020012788 continue;
12789 }
12790
12791 /* Map special whitespace to '\t', \n', '\r' */
12792 if (ch == '\t') {
12793 PyUnicode_WRITE(okind, odata, o++, '\\');
12794 PyUnicode_WRITE(okind, odata, o++, 't');
12795 }
12796 else if (ch == '\n') {
12797 PyUnicode_WRITE(okind, odata, o++, '\\');
12798 PyUnicode_WRITE(okind, odata, o++, 'n');
12799 }
12800 else if (ch == '\r') {
12801 PyUnicode_WRITE(okind, odata, o++, '\\');
12802 PyUnicode_WRITE(okind, odata, o++, 'r');
12803 }
12804
12805 /* Map non-printable US ASCII to '\xhh' */
12806 else if (ch < ' ' || ch == 0x7F) {
12807 PyUnicode_WRITE(okind, odata, o++, '\\');
12808 PyUnicode_WRITE(okind, odata, o++, 'x');
12809 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12810 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12811 }
12812
12813 /* Copy ASCII characters as-is */
12814 else if (ch < 0x7F) {
12815 PyUnicode_WRITE(okind, odata, o++, ch);
12816 }
12817
12818 /* Non-ASCII characters */
12819 else {
12820 /* Map Unicode whitespace and control characters
12821 (categories Z* and C* except ASCII space)
12822 */
12823 if (!Py_UNICODE_ISPRINTABLE(ch)) {
12824 PyUnicode_WRITE(okind, odata, o++, '\\');
12825 /* Map 8-bit characters to '\xhh' */
12826 if (ch <= 0xff) {
12827 PyUnicode_WRITE(okind, odata, o++, 'x');
12828 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12829 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12830 }
12831 /* Map 16-bit characters to '\uxxxx' */
12832 else if (ch <= 0xffff) {
12833 PyUnicode_WRITE(okind, odata, o++, 'u');
12834 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12835 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12836 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12837 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12838 }
12839 /* Map 21-bit characters to '\U00xxxxxx' */
12840 else {
12841 PyUnicode_WRITE(okind, odata, o++, 'U');
12842 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
12843 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
12844 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
12845 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
12846 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12847 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12848 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12849 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12850 }
12851 }
12852 /* Copy characters as-is */
12853 else {
12854 PyUnicode_WRITE(okind, odata, o++, ch);
12855 }
Georg Brandl559e5d72008-06-11 18:37:52 +000012856 }
12857 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000012858 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012859 /* Closing quote already added at the beginning */
Victor Stinner05d11892011-10-06 01:13:58 +020012860 assert(_PyUnicode_CheckConsistency(repr, 1));
Walter Dörwald79e913e2007-05-12 11:08:06 +000012861 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012862}
12863
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012864PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012865 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012866\n\
12867Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080012868such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012869arguments start and end are interpreted as in slice notation.\n\
12870\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012871Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012872
12873static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012874unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012875{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010012876 /* initialize variables to prevent gcc warning */
12877 PyObject *substring = NULL;
12878 Py_ssize_t start = 0;
12879 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012880 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012881
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030012882 if (!parse_args_finds_unicode("rfind", args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012883 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012884
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012885 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012886 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012887
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012888 result = any_find_slice(self, substring, start, end, -1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012889
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012890 if (result == -2)
12891 return NULL;
12892
Christian Heimes217cfd12007-12-02 14:31:20 +000012893 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012894}
12895
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012896PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012897 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012898\n\
Lisa Roach43ba8862017-04-04 22:36:22 -070012899Return the highest index in S where substring sub is found,\n\
12900such that sub is contained within S[start:end]. Optional\n\
12901arguments start and end are interpreted as in slice notation.\n\
12902\n\
12903Raises ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012904
12905static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012906unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012907{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010012908 /* initialize variables to prevent gcc warning */
12909 PyObject *substring = NULL;
12910 Py_ssize_t start = 0;
12911 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012912 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012913
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030012914 if (!parse_args_finds_unicode("rindex", args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012915 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012916
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012917 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012918 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012919
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012920 result = any_find_slice(self, substring, start, end, -1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012921
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012922 if (result == -2)
12923 return NULL;
12924
Guido van Rossumd57fd912000-03-10 22:53:23 +000012925 if (result < 0) {
12926 PyErr_SetString(PyExc_ValueError, "substring not found");
12927 return NULL;
12928 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012929
Christian Heimes217cfd12007-12-02 14:31:20 +000012930 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012931}
12932
INADA Naoki3ae20562017-01-16 20:41:20 +090012933/*[clinic input]
12934str.rjust as unicode_rjust
12935
12936 width: Py_ssize_t
12937 fillchar: Py_UCS4 = ' '
12938 /
12939
12940Return a right-justified string of length width.
12941
12942Padding is done using the specified fill character (default is a space).
12943[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012944
12945static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012946unicode_rjust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
12947/*[clinic end generated code: output=804a1a57fbe8d5cf input=d05f550b5beb1f72]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012948{
Benjamin Petersonbac79492012-01-14 13:34:47 -050012949 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012950 return NULL;
12951
Victor Stinnerc4b49542011-12-11 22:44:26 +010012952 if (PyUnicode_GET_LENGTH(self) >= width)
12953 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012954
Victor Stinnerc4b49542011-12-11 22:44:26 +010012955 return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012956}
12957
Alexander Belopolsky40018472011-02-26 01:02:56 +000012958PyObject *
12959PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012960{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012961 if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012962 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012963
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012964 return split(s, sep, maxsplit);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012965}
12966
INADA Naoki3ae20562017-01-16 20:41:20 +090012967/*[clinic input]
12968str.split as unicode_split
Guido van Rossumd57fd912000-03-10 22:53:23 +000012969
INADA Naoki3ae20562017-01-16 20:41:20 +090012970 sep: object = None
12971 The delimiter according which to split the string.
12972 None (the default value) means split according to any whitespace,
12973 and discard empty strings from the result.
12974 maxsplit: Py_ssize_t = -1
12975 Maximum number of splits to do.
12976 -1 (the default value) means no limit.
12977
12978Return a list of the words in the string, using sep as the delimiter string.
12979[clinic start generated code]*/
12980
12981static PyObject *
12982unicode_split_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
12983/*[clinic end generated code: output=3a65b1db356948dc input=606e750488a82359]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012984{
INADA Naoki3ae20562017-01-16 20:41:20 +090012985 if (sep == Py_None)
12986 return split(self, NULL, maxsplit);
12987 if (PyUnicode_Check(sep))
12988 return split(self, sep, maxsplit);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012989
Victor Stinner998b8062018-09-12 00:23:25 +020012990 PyErr_Format(PyExc_TypeError,
12991 "must be str or None, not %.100s",
12992 Py_TYPE(sep)->tp_name);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012993 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012994}
12995
Thomas Wouters477c8d52006-05-27 19:21:47 +000012996PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012997PyUnicode_Partition(PyObject *str_obj, PyObject *sep_obj)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012998{
Thomas Wouters477c8d52006-05-27 19:21:47 +000012999 PyObject* out;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013000 int kind1, kind2;
13001 void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013002 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000013003
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013004 if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013005 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000013006
Victor Stinner14f8f022011-10-05 20:58:25 +020013007 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013008 kind2 = PyUnicode_KIND(sep_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013009 len1 = PyUnicode_GET_LENGTH(str_obj);
13010 len2 = PyUnicode_GET_LENGTH(sep_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013011 if (kind1 < kind2 || len1 < len2) {
13012 _Py_INCREF_UNICODE_EMPTY();
13013 if (!unicode_empty)
13014 out = NULL;
13015 else {
13016 out = PyTuple_Pack(3, str_obj, unicode_empty, unicode_empty);
13017 Py_DECREF(unicode_empty);
13018 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013019 return out;
13020 }
13021 buf1 = PyUnicode_DATA(str_obj);
13022 buf2 = PyUnicode_DATA(sep_obj);
13023 if (kind2 != kind1) {
13024 buf2 = _PyUnicode_AsKind(sep_obj, kind1);
13025 if (!buf2)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013026 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013027 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013028
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013029 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013030 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020013031 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
13032 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
13033 else
13034 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013035 break;
13036 case PyUnicode_2BYTE_KIND:
13037 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
13038 break;
13039 case PyUnicode_4BYTE_KIND:
13040 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
13041 break;
13042 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070013043 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013044 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000013045
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013046 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013047 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013048
13049 return out;
13050}
13051
13052
13053PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013054PyUnicode_RPartition(PyObject *str_obj, PyObject *sep_obj)
Thomas Wouters477c8d52006-05-27 19:21:47 +000013055{
Thomas Wouters477c8d52006-05-27 19:21:47 +000013056 PyObject* out;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013057 int kind1, kind2;
13058 void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013059 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000013060
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013061 if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013062 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000013063
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013064 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013065 kind2 = PyUnicode_KIND(sep_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013066 len1 = PyUnicode_GET_LENGTH(str_obj);
13067 len2 = PyUnicode_GET_LENGTH(sep_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013068 if (kind1 < kind2 || len1 < len2) {
13069 _Py_INCREF_UNICODE_EMPTY();
13070 if (!unicode_empty)
13071 out = NULL;
13072 else {
13073 out = PyTuple_Pack(3, unicode_empty, unicode_empty, str_obj);
13074 Py_DECREF(unicode_empty);
13075 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013076 return out;
13077 }
13078 buf1 = PyUnicode_DATA(str_obj);
13079 buf2 = PyUnicode_DATA(sep_obj);
13080 if (kind2 != kind1) {
13081 buf2 = _PyUnicode_AsKind(sep_obj, kind1);
13082 if (!buf2)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013083 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013084 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013085
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013086 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013087 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020013088 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
13089 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13090 else
13091 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013092 break;
13093 case PyUnicode_2BYTE_KIND:
13094 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13095 break;
13096 case PyUnicode_4BYTE_KIND:
13097 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13098 break;
13099 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070013100 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013101 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000013102
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013103 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013104 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013105
13106 return out;
13107}
13108
INADA Naoki3ae20562017-01-16 20:41:20 +090013109/*[clinic input]
13110str.partition as unicode_partition
Thomas Wouters477c8d52006-05-27 19:21:47 +000013111
INADA Naoki3ae20562017-01-16 20:41:20 +090013112 sep: object
13113 /
13114
13115Partition the string into three parts using the given separator.
13116
13117This will search for the separator in the string. If the separator is found,
13118returns a 3-tuple containing the part before the separator, the separator
13119itself, and the part after it.
13120
13121If the separator is not found, returns a 3-tuple containing the original string
13122and two empty strings.
13123[clinic start generated code]*/
13124
13125static PyObject *
13126unicode_partition(PyObject *self, PyObject *sep)
13127/*[clinic end generated code: output=e4ced7bd253ca3c4 input=f29b8d06c63e50be]*/
Thomas Wouters477c8d52006-05-27 19:21:47 +000013128{
INADA Naoki3ae20562017-01-16 20:41:20 +090013129 return PyUnicode_Partition(self, sep);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013130}
13131
INADA Naoki3ae20562017-01-16 20:41:20 +090013132/*[clinic input]
13133str.rpartition as unicode_rpartition = str.partition
Thomas Wouters477c8d52006-05-27 19:21:47 +000013134
INADA Naoki3ae20562017-01-16 20:41:20 +090013135Partition the string into three parts using the given separator.
13136
Serhiy Storchakaa2314282017-10-29 02:11:54 +030013137This will search for the separator in the string, starting at the end. If
INADA Naoki3ae20562017-01-16 20:41:20 +090013138the separator is found, returns a 3-tuple containing the part before the
13139separator, the separator itself, and the part after it.
13140
13141If the separator is not found, returns a 3-tuple containing two empty strings
13142and the original string.
13143[clinic start generated code]*/
13144
13145static PyObject *
13146unicode_rpartition(PyObject *self, PyObject *sep)
Serhiy Storchakaa2314282017-10-29 02:11:54 +030013147/*[clinic end generated code: output=1aa13cf1156572aa input=c4b7db3ef5cf336a]*/
Thomas Wouters477c8d52006-05-27 19:21:47 +000013148{
INADA Naoki3ae20562017-01-16 20:41:20 +090013149 return PyUnicode_RPartition(self, sep);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013150}
13151
Alexander Belopolsky40018472011-02-26 01:02:56 +000013152PyObject *
13153PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013154{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013155 if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
Benjamin Peterson14339b62009-01-31 16:36:08 +000013156 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013157
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013158 return rsplit(s, sep, maxsplit);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013159}
13160
INADA Naoki3ae20562017-01-16 20:41:20 +090013161/*[clinic input]
13162str.rsplit as unicode_rsplit = str.split
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013163
INADA Naoki3ae20562017-01-16 20:41:20 +090013164Return a list of the words in the string, using sep as the delimiter string.
13165
13166Splits are done starting at the end of the string and working to the front.
13167[clinic start generated code]*/
13168
13169static PyObject *
13170unicode_rsplit_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
13171/*[clinic end generated code: output=c2b815c63bcabffc input=12ad4bf57dd35f15]*/
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013172{
INADA Naoki3ae20562017-01-16 20:41:20 +090013173 if (sep == Py_None)
13174 return rsplit(self, NULL, maxsplit);
13175 if (PyUnicode_Check(sep))
13176 return rsplit(self, sep, maxsplit);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013177
Victor Stinner998b8062018-09-12 00:23:25 +020013178 PyErr_Format(PyExc_TypeError,
13179 "must be str or None, not %.100s",
13180 Py_TYPE(sep)->tp_name);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013181 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013182}
13183
INADA Naoki3ae20562017-01-16 20:41:20 +090013184/*[clinic input]
13185str.splitlines as unicode_splitlines
Guido van Rossumd57fd912000-03-10 22:53:23 +000013186
Serhiy Storchaka202fda52017-03-12 10:10:47 +020013187 keepends: bool(accept={int}) = False
INADA Naoki3ae20562017-01-16 20:41:20 +090013188
13189Return a list of the lines in the string, breaking at line boundaries.
13190
13191Line breaks are not included in the resulting list unless keepends is given and
13192true.
13193[clinic start generated code]*/
13194
13195static PyObject *
13196unicode_splitlines_impl(PyObject *self, int keepends)
Serhiy Storchaka202fda52017-03-12 10:10:47 +020013197/*[clinic end generated code: output=f664dcdad153ec40 input=b508e180459bdd8b]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013198{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013199 return PyUnicode_Splitlines(self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013200}
13201
13202static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000013203PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013204{
Victor Stinnerc4b49542011-12-11 22:44:26 +010013205 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013206}
13207
INADA Naoki3ae20562017-01-16 20:41:20 +090013208/*[clinic input]
13209str.swapcase as unicode_swapcase
Guido van Rossumd57fd912000-03-10 22:53:23 +000013210
INADA Naoki3ae20562017-01-16 20:41:20 +090013211Convert uppercase characters to lowercase and lowercase characters to uppercase.
13212[clinic start generated code]*/
13213
13214static PyObject *
13215unicode_swapcase_impl(PyObject *self)
13216/*[clinic end generated code: output=5d28966bf6d7b2af input=3f3ef96d5798a7bb]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013217{
Benjamin Petersoneea48462012-01-16 14:28:50 -050013218 if (PyUnicode_READY(self) == -1)
13219 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010013220 return case_operation(self, do_swapcase);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013221}
13222
Larry Hastings61272b72014-01-07 12:41:53 -080013223/*[clinic input]
Georg Brandlceee0772007-11-27 23:48:05 +000013224
Larry Hastings31826802013-10-19 00:09:25 -070013225@staticmethod
13226str.maketrans as unicode_maketrans
13227
13228 x: object
13229
13230 y: unicode=NULL
13231
13232 z: unicode=NULL
13233
13234 /
13235
13236Return a translation table usable for str.translate().
13237
13238If there is only one argument, it must be a dictionary mapping Unicode
13239ordinals (integers) or characters to Unicode ordinals, strings or None.
13240Character keys will be then converted to ordinals.
13241If there are two arguments, they must be strings of equal length, and
13242in the resulting dictionary, each character in x will be mapped to the
13243character at the same position in y. If there is a third argument, it
13244must be a string, whose characters will be mapped to None in the result.
Larry Hastings61272b72014-01-07 12:41:53 -080013245[clinic start generated code]*/
Larry Hastings31826802013-10-19 00:09:25 -070013246
Larry Hastings31826802013-10-19 00:09:25 -070013247static PyObject *
Larry Hastings5c661892014-01-24 06:17:25 -080013248unicode_maketrans_impl(PyObject *x, PyObject *y, PyObject *z)
Serhiy Storchaka1009bf12015-04-03 23:53:51 +030013249/*[clinic end generated code: output=a925c89452bd5881 input=7bfbf529a293c6c5]*/
Larry Hastings31826802013-10-19 00:09:25 -070013250{
Georg Brandlceee0772007-11-27 23:48:05 +000013251 PyObject *new = NULL, *key, *value;
13252 Py_ssize_t i = 0;
13253 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013254
Georg Brandlceee0772007-11-27 23:48:05 +000013255 new = PyDict_New();
13256 if (!new)
13257 return NULL;
13258 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013259 int x_kind, y_kind, z_kind;
13260 void *x_data, *y_data, *z_data;
13261
Georg Brandlceee0772007-11-27 23:48:05 +000013262 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000013263 if (!PyUnicode_Check(x)) {
13264 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
13265 "be a string if there is a second argument");
13266 goto err;
13267 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013268 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013269 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
13270 "arguments must have equal length");
13271 goto err;
13272 }
13273 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013274 x_kind = PyUnicode_KIND(x);
13275 y_kind = PyUnicode_KIND(y);
13276 x_data = PyUnicode_DATA(x);
13277 y_data = PyUnicode_DATA(y);
13278 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
13279 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060013280 if (!key)
Georg Brandlceee0772007-11-27 23:48:05 +000013281 goto err;
Benjamin Peterson822c7902011-12-20 13:32:50 -060013282 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060013283 if (!value) {
13284 Py_DECREF(key);
13285 goto err;
13286 }
Georg Brandlceee0772007-11-27 23:48:05 +000013287 res = PyDict_SetItem(new, key, value);
13288 Py_DECREF(key);
13289 Py_DECREF(value);
13290 if (res < 0)
13291 goto err;
13292 }
13293 /* create entries for deleting chars in z */
13294 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013295 z_kind = PyUnicode_KIND(z);
13296 z_data = PyUnicode_DATA(z);
Victor Stinnerc4f281e2011-10-11 22:11:42 +020013297 for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013298 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000013299 if (!key)
13300 goto err;
13301 res = PyDict_SetItem(new, key, Py_None);
13302 Py_DECREF(key);
13303 if (res < 0)
13304 goto err;
13305 }
13306 }
13307 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013308 int kind;
13309 void *data;
13310
Georg Brandlceee0772007-11-27 23:48:05 +000013311 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000013312 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013313 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
13314 "to maketrans it must be a dict");
13315 goto err;
13316 }
13317 /* copy entries into the new dict, converting string keys to int keys */
13318 while (PyDict_Next(x, &i, &key, &value)) {
13319 if (PyUnicode_Check(key)) {
13320 /* convert string keys to integer keys */
13321 PyObject *newkey;
Victor Stinnerc4f281e2011-10-11 22:11:42 +020013322 if (PyUnicode_GET_LENGTH(key) != 1) {
Georg Brandlceee0772007-11-27 23:48:05 +000013323 PyErr_SetString(PyExc_ValueError, "string keys in translate "
13324 "table must be of length 1");
13325 goto err;
13326 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013327 kind = PyUnicode_KIND(key);
13328 data = PyUnicode_DATA(key);
13329 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000013330 if (!newkey)
13331 goto err;
13332 res = PyDict_SetItem(new, newkey, value);
13333 Py_DECREF(newkey);
13334 if (res < 0)
13335 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000013336 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013337 /* just keep integer keys */
13338 if (PyDict_SetItem(new, key, value) < 0)
13339 goto err;
13340 } else {
13341 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
13342 "be strings or integers");
13343 goto err;
13344 }
13345 }
13346 }
13347 return new;
13348 err:
13349 Py_DECREF(new);
13350 return NULL;
13351}
13352
INADA Naoki3ae20562017-01-16 20:41:20 +090013353/*[clinic input]
13354str.translate as unicode_translate
Guido van Rossumd57fd912000-03-10 22:53:23 +000013355
INADA Naoki3ae20562017-01-16 20:41:20 +090013356 table: object
13357 Translation table, which must be a mapping of Unicode ordinals to
13358 Unicode ordinals, strings, or None.
13359 /
13360
13361Replace each character in the string using the given translation table.
13362
13363The table must implement lookup/indexing via __getitem__, for instance a
13364dictionary or list. If this operation raises LookupError, the character is
13365left untouched. Characters mapped to None are deleted.
13366[clinic start generated code]*/
13367
13368static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013369unicode_translate(PyObject *self, PyObject *table)
INADA Naoki3ae20562017-01-16 20:41:20 +090013370/*[clinic end generated code: output=3cb448ff2fd96bf3 input=6d38343db63d8eb0]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013371{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013372 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013373}
13374
INADA Naoki3ae20562017-01-16 20:41:20 +090013375/*[clinic input]
13376str.upper as unicode_upper
Guido van Rossumd57fd912000-03-10 22:53:23 +000013377
INADA Naoki3ae20562017-01-16 20:41:20 +090013378Return a copy of the string converted to uppercase.
13379[clinic start generated code]*/
13380
13381static PyObject *
13382unicode_upper_impl(PyObject *self)
13383/*[clinic end generated code: output=1b7ddd16bbcdc092 input=db3d55682dfe2e6c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013384{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050013385 if (PyUnicode_READY(self) == -1)
13386 return NULL;
13387 if (PyUnicode_IS_ASCII(self))
13388 return ascii_upper_or_lower(self, 0);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010013389 return case_operation(self, do_upper);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013390}
13391
INADA Naoki3ae20562017-01-16 20:41:20 +090013392/*[clinic input]
13393str.zfill as unicode_zfill
13394
13395 width: Py_ssize_t
13396 /
13397
13398Pad a numeric string with zeros on the left, to fill a field of the given width.
13399
13400The string is never truncated.
13401[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013402
13403static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090013404unicode_zfill_impl(PyObject *self, Py_ssize_t width)
INADA Naoki15f94592017-01-16 21:49:13 +090013405/*[clinic end generated code: output=e13fb6bdf8e3b9df input=c6b2f772c6f27799]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013406{
Martin v. Löwis18e16552006-02-15 17:27:45 +000013407 Py_ssize_t fill;
Victor Stinner9310abb2011-10-05 00:59:23 +020013408 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013409 int kind;
13410 void *data;
13411 Py_UCS4 chr;
13412
Benjamin Petersonbac79492012-01-14 13:34:47 -050013413 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010013414 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013415
Victor Stinnerc4b49542011-12-11 22:44:26 +010013416 if (PyUnicode_GET_LENGTH(self) >= width)
13417 return unicode_result_unchanged(self);
13418
13419 fill = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013420
13421 u = pad(self, fill, 0, '0');
13422
Walter Dörwald068325e2002-04-15 13:36:47 +000013423 if (u == NULL)
13424 return NULL;
13425
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013426 kind = PyUnicode_KIND(u);
13427 data = PyUnicode_DATA(u);
13428 chr = PyUnicode_READ(kind, data, fill);
13429
13430 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000013431 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013432 PyUnicode_WRITE(kind, data, 0, chr);
13433 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000013434 }
13435
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013436 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +010013437 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013438}
Guido van Rossumd57fd912000-03-10 22:53:23 +000013439
#if 0
/* Compiled out.  Would forward self to
   PyUnicode_TransformDecimalAndSpaceToASCII() and return its result. */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *converted = PyUnicode_TransformDecimalAndSpaceToASCII(self);

    return converted;
}
#endif
13447
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013448PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013449 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013450\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013451Return True if S starts with the specified prefix, False otherwise.\n\
13452With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013453With optional end, stop comparing S at that position.\n\
13454prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013455
13456static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013457unicode_startswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013458 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013459{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013460 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013461 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013462 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013463 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013464 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013465
Jesus Ceaac451502011-04-20 17:09:23 +020013466 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013467 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013468 if (PyTuple_Check(subobj)) {
13469 Py_ssize_t i;
13470 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013471 substring = PyTuple_GET_ITEM(subobj, i);
13472 if (!PyUnicode_Check(substring)) {
13473 PyErr_Format(PyExc_TypeError,
13474 "tuple for startswith must only contain str, "
Victor Stinner998b8062018-09-12 00:23:25 +020013475 "not %.100s",
13476 Py_TYPE(substring)->tp_name);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013477 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013478 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013479 result = tailmatch(self, substring, start, end, -1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013480 if (result == -1)
13481 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013482 if (result) {
13483 Py_RETURN_TRUE;
13484 }
13485 }
13486 /* nothing matched */
13487 Py_RETURN_FALSE;
13488 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013489 if (!PyUnicode_Check(subobj)) {
13490 PyErr_Format(PyExc_TypeError,
13491 "startswith first arg must be str or "
Victor Stinner998b8062018-09-12 00:23:25 +020013492 "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013493 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013494 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013495 result = tailmatch(self, subobj, start, end, -1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013496 if (result == -1)
13497 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013498 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013499}
13500
13501
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013502PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013503 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013504\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013505Return True if S ends with the specified suffix, False otherwise.\n\
13506With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013507With optional end, stop comparing S at that position.\n\
13508suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013509
13510static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013511unicode_endswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013512 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013513{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013514 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013515 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013516 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013517 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013518 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013519
Jesus Ceaac451502011-04-20 17:09:23 +020013520 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013521 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013522 if (PyTuple_Check(subobj)) {
13523 Py_ssize_t i;
13524 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013525 substring = PyTuple_GET_ITEM(subobj, i);
13526 if (!PyUnicode_Check(substring)) {
13527 PyErr_Format(PyExc_TypeError,
13528 "tuple for endswith must only contain str, "
Victor Stinner998b8062018-09-12 00:23:25 +020013529 "not %.100s",
13530 Py_TYPE(substring)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013531 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013532 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013533 result = tailmatch(self, substring, start, end, +1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013534 if (result == -1)
13535 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013536 if (result) {
13537 Py_RETURN_TRUE;
13538 }
13539 }
13540 Py_RETURN_FALSE;
13541 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013542 if (!PyUnicode_Check(subobj)) {
13543 PyErr_Format(PyExc_TypeError,
13544 "endswith first arg must be str or "
Victor Stinner998b8062018-09-12 00:23:25 +020013545 "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013546 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013547 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013548 result = tailmatch(self, subobj, start, end, +1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013549 if (result == -1)
13550 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013551 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013552}
13553
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070013554static inline void
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013555_PyUnicodeWriter_Update(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013556{
Victor Stinnereb36fda2015-10-03 01:55:51 +020013557 writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer);
13558 writer->data = PyUnicode_DATA(writer->buffer);
13559
13560 if (!writer->readonly) {
13561 writer->kind = PyUnicode_KIND(writer->buffer);
Victor Stinner8f674cc2013-04-17 23:02:17 +020013562 writer->size = PyUnicode_GET_LENGTH(writer->buffer);
Victor Stinnereb36fda2015-10-03 01:55:51 +020013563 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013564 else {
Victor Stinnereb36fda2015-10-03 01:55:51 +020013565 /* use a value smaller than PyUnicode_1BYTE_KIND() so
13566 _PyUnicodeWriter_PrepareKind() will copy the buffer. */
13567 writer->kind = PyUnicode_WCHAR_KIND;
13568 assert(writer->kind <= PyUnicode_1BYTE_KIND);
13569
Victor Stinner8f674cc2013-04-17 23:02:17 +020013570 /* Copy-on-write mode: set buffer size to 0 so
13571 * _PyUnicodeWriter_Prepare() will copy (and enlarge) the buffer on
13572 * next write. */
13573 writer->size = 0;
13574 }
Victor Stinner202fdca2012-05-07 12:47:02 +020013575}
13576
Victor Stinnerd3f08822012-05-29 12:57:52 +020013577void
Victor Stinner8f674cc2013-04-17 23:02:17 +020013578_PyUnicodeWriter_Init(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013579{
Victor Stinnerd3f08822012-05-29 12:57:52 +020013580 memset(writer, 0, sizeof(*writer));
Victor Stinnereb36fda2015-10-03 01:55:51 +020013581
13582 /* ASCII is the bare minimum */
Victor Stinner8f674cc2013-04-17 23:02:17 +020013583 writer->min_char = 127;
Victor Stinnereb36fda2015-10-03 01:55:51 +020013584
13585 /* use a value smaller than PyUnicode_1BYTE_KIND() so
13586 _PyUnicodeWriter_PrepareKind() will copy the buffer. */
13587 writer->kind = PyUnicode_WCHAR_KIND;
13588 assert(writer->kind <= PyUnicode_1BYTE_KIND);
Victor Stinner202fdca2012-05-07 12:47:02 +020013589}
13590
Inada Naoki770847a2019-06-24 12:30:24 +090013591// Initialize _PyUnicodeWriter with initial buffer
13592static inline void
13593_PyUnicodeWriter_InitWithBuffer(_PyUnicodeWriter *writer, PyObject *buffer)
13594{
13595 memset(writer, 0, sizeof(*writer));
13596 writer->buffer = buffer;
13597 _PyUnicodeWriter_Update(writer);
13598 writer->min_length = writer->size;
13599}
13600
/* Make sure the writer's buffer can hold `length` more characters after
   writer->pos, each no larger than `maxchar`.

   Three situations are handled:
     - no buffer yet: a new string is allocated;
     - buffer too short: it is either replaced by a new, wider string (when
       a larger maxchar is needed or the buffer is read-only) or resized;
     - buffer long enough but too narrow: it is replaced by a wider string
       of the same size.

   Return 0 on success.  Return -1 on failure; the overflow check below
   raises MemoryError explicitly, the other failure paths return as soon as
   an allocation helper returns NULL. */
int
_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
                                 Py_ssize_t length, Py_UCS4 maxchar)
{
    Py_ssize_t newlen;
    PyObject *newbuffer;

    assert(maxchar <= MAX_UNICODE);

    /* ensure that the _PyUnicodeWriter_Prepare macro was used */
    assert((maxchar > writer->maxchar && length >= 0)
           || length > 0);

    /* Reject requests whose end position would overflow Py_ssize_t. */
    if (length > PY_SSIZE_T_MAX - writer->pos) {
        PyErr_NoMemory();
        return -1;
    }
    newlen = writer->pos + length;

    /* Never allocate narrower than the writer's configured minimum. */
    maxchar = Py_MAX(maxchar, writer->min_char);

    if (writer->buffer == NULL) {
        /* First allocation. */
        assert(!writer->readonly);
        if (writer->overallocate
            && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
            /* overallocate to limit the number of realloc() */
            newlen += newlen / OVERALLOCATE_FACTOR;
        }
        if (newlen < writer->min_length)
            newlen = writer->min_length;

        writer->buffer = PyUnicode_New(newlen, maxchar);
        if (writer->buffer == NULL)
            return -1;
    }
    else if (newlen > writer->size) {
        /* The existing buffer is too short. */
        if (writer->overallocate
            && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
            /* overallocate to limit the number of realloc() */
            newlen += newlen / OVERALLOCATE_FACTOR;
        }
        if (newlen < writer->min_length)
            newlen = writer->min_length;

        if (maxchar > writer->maxchar || writer->readonly) {
            /* resize + widen */
            /* A read-only buffer may already be wider than the request;
               keep the wider of the two. */
            maxchar = Py_MAX(maxchar, writer->maxchar);
            newbuffer = PyUnicode_New(newlen, maxchar);
            if (newbuffer == NULL)
                return -1;
            /* Carry over the characters written so far. */
            _PyUnicode_FastCopyCharacters(newbuffer, 0,
                                          writer->buffer, 0, writer->pos);
            Py_DECREF(writer->buffer);
            /* The writer now has its own copy. */
            writer->readonly = 0;
        }
        else {
            /* Same width: resize the existing string. */
            newbuffer = resize_compact(writer->buffer, newlen);
            if (newbuffer == NULL)
                return -1;
        }
        writer->buffer = newbuffer;
    }
    else if (maxchar > writer->maxchar) {
        /* Long enough, but a wider representation is needed. */
        assert(!writer->readonly);
        newbuffer = PyUnicode_New(writer->size, maxchar);
        if (newbuffer == NULL)
            return -1;
        _PyUnicode_FastCopyCharacters(newbuffer, 0,
                                      writer->buffer, 0, writer->pos);
        Py_SETREF(writer->buffer, newbuffer);
    }
    /* Refresh the fields cached from the (possibly new) buffer. */
    _PyUnicodeWriter_Update(writer);
    return 0;

#undef OVERALLOCATE_FACTOR
}
13677
Victor Stinnerca9381e2015-09-22 00:58:32 +020013678int
13679_PyUnicodeWriter_PrepareKindInternal(_PyUnicodeWriter *writer,
13680 enum PyUnicode_Kind kind)
13681{
13682 Py_UCS4 maxchar;
13683
13684 /* ensure that the _PyUnicodeWriter_PrepareKind macro was used */
13685 assert(writer->kind < kind);
13686
13687 switch (kind)
13688 {
13689 case PyUnicode_1BYTE_KIND: maxchar = 0xff; break;
13690 case PyUnicode_2BYTE_KIND: maxchar = 0xffff; break;
13691 case PyUnicode_4BYTE_KIND: maxchar = 0x10ffff; break;
13692 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070013693 Py_UNREACHABLE();
Victor Stinnerca9381e2015-09-22 00:58:32 +020013694 }
13695
13696 return _PyUnicodeWriter_PrepareInternal(writer, 0, maxchar);
13697}
13698
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070013699static inline int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020013700_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch)
Victor Stinnera0dd0212013-04-11 22:09:04 +020013701{
Victor Stinner2740e462016-09-06 16:58:36 -070013702 assert(ch <= MAX_UNICODE);
Victor Stinnera0dd0212013-04-11 22:09:04 +020013703 if (_PyUnicodeWriter_Prepare(writer, 1, ch) < 0)
13704 return -1;
13705 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, ch);
13706 writer->pos++;
13707 return 0;
13708}
13709
13710int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020013711_PyUnicodeWriter_WriteChar(_PyUnicodeWriter *writer, Py_UCS4 ch)
13712{
13713 return _PyUnicodeWriter_WriteCharInline(writer, ch);
13714}
13715
13716int
Victor Stinnerd3f08822012-05-29 12:57:52 +020013717_PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str)
13718{
13719 Py_UCS4 maxchar;
13720 Py_ssize_t len;
13721
13722 if (PyUnicode_READY(str) == -1)
13723 return -1;
13724 len = PyUnicode_GET_LENGTH(str);
13725 if (len == 0)
13726 return 0;
13727 maxchar = PyUnicode_MAX_CHAR_VALUE(str);
13728 if (maxchar > writer->maxchar || len > writer->size - writer->pos) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013729 if (writer->buffer == NULL && !writer->overallocate) {
Victor Stinner1912b392015-03-26 09:37:23 +010013730 assert(_PyUnicode_CheckConsistency(str, 1));
Victor Stinner8f674cc2013-04-17 23:02:17 +020013731 writer->readonly = 1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013732 Py_INCREF(str);
13733 writer->buffer = str;
13734 _PyUnicodeWriter_Update(writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013735 writer->pos += len;
13736 return 0;
13737 }
13738 if (_PyUnicodeWriter_PrepareInternal(writer, len, maxchar) == -1)
13739 return -1;
13740 }
13741 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13742 str, 0, len);
13743 writer->pos += len;
13744 return 0;
13745}
13746
Victor Stinnere215d962012-10-06 23:03:36 +020013747int
Victor Stinnercfc4c132013-04-03 01:48:39 +020013748_PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter *writer, PyObject *str,
13749 Py_ssize_t start, Py_ssize_t end)
13750{
13751 Py_UCS4 maxchar;
13752 Py_ssize_t len;
13753
13754 if (PyUnicode_READY(str) == -1)
13755 return -1;
13756
13757 assert(0 <= start);
13758 assert(end <= PyUnicode_GET_LENGTH(str));
13759 assert(start <= end);
13760
13761 if (end == 0)
13762 return 0;
13763
13764 if (start == 0 && end == PyUnicode_GET_LENGTH(str))
13765 return _PyUnicodeWriter_WriteStr(writer, str);
13766
13767 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
13768 maxchar = _PyUnicode_FindMaxChar(str, start, end);
13769 else
13770 maxchar = writer->maxchar;
13771 len = end - start;
13772
13773 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) < 0)
13774 return -1;
13775
13776 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13777 str, start, len);
13778 writer->pos += len;
13779 return 0;
13780}
13781
13782int
Victor Stinner4a587072013-11-19 12:54:53 +010013783_PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter *writer,
13784 const char *ascii, Py_ssize_t len)
13785{
13786 if (len == -1)
13787 len = strlen(ascii);
13788
13789 assert(ucs1lib_find_max_char((Py_UCS1*)ascii, (Py_UCS1*)ascii + len) < 128);
13790
13791 if (writer->buffer == NULL && !writer->overallocate) {
13792 PyObject *str;
13793
13794 str = _PyUnicode_FromASCII(ascii, len);
13795 if (str == NULL)
13796 return -1;
13797
13798 writer->readonly = 1;
13799 writer->buffer = str;
13800 _PyUnicodeWriter_Update(writer);
13801 writer->pos += len;
13802 return 0;
13803 }
13804
13805 if (_PyUnicodeWriter_Prepare(writer, len, 127) == -1)
13806 return -1;
13807
13808 switch (writer->kind)
13809 {
13810 case PyUnicode_1BYTE_KIND:
13811 {
13812 const Py_UCS1 *str = (const Py_UCS1 *)ascii;
13813 Py_UCS1 *data = writer->data;
13814
Christian Heimesf051e432016-09-13 20:22:02 +020013815 memcpy(data + writer->pos, str, len);
Victor Stinner4a587072013-11-19 12:54:53 +010013816 break;
13817 }
13818 case PyUnicode_2BYTE_KIND:
13819 {
13820 _PyUnicode_CONVERT_BYTES(
13821 Py_UCS1, Py_UCS2,
13822 ascii, ascii + len,
13823 (Py_UCS2 *)writer->data + writer->pos);
13824 break;
13825 }
13826 case PyUnicode_4BYTE_KIND:
13827 {
13828 _PyUnicode_CONVERT_BYTES(
13829 Py_UCS1, Py_UCS4,
13830 ascii, ascii + len,
13831 (Py_UCS4 *)writer->data + writer->pos);
13832 break;
13833 }
13834 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070013835 Py_UNREACHABLE();
Victor Stinner4a587072013-11-19 12:54:53 +010013836 }
13837
13838 writer->pos += len;
13839 return 0;
13840}
13841
13842int
13843_PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter *writer,
13844 const char *str, Py_ssize_t len)
Victor Stinnere215d962012-10-06 23:03:36 +020013845{
13846 Py_UCS4 maxchar;
13847
13848 maxchar = ucs1lib_find_max_char((Py_UCS1*)str, (Py_UCS1*)str + len);
13849 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) == -1)
13850 return -1;
13851 unicode_write_cstr(writer->buffer, writer->pos, str, len);
13852 writer->pos += len;
13853 return 0;
13854}
13855
Victor Stinnerd3f08822012-05-29 12:57:52 +020013856PyObject *
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013857_PyUnicodeWriter_Finish(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013858{
Victor Stinner15a0bd32013-07-08 22:29:55 +020013859 PyObject *str;
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030013860
Victor Stinnerd3f08822012-05-29 12:57:52 +020013861 if (writer->pos == 0) {
Victor Stinner9e6b4d72013-07-09 00:37:24 +020013862 Py_CLEAR(writer->buffer);
Serhiy Storchaka678db842013-01-26 12:16:36 +020013863 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3f08822012-05-29 12:57:52 +020013864 }
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030013865
13866 str = writer->buffer;
13867 writer->buffer = NULL;
13868
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013869 if (writer->readonly) {
Victor Stinner9e6b4d72013-07-09 00:37:24 +020013870 assert(PyUnicode_GET_LENGTH(str) == writer->pos);
13871 return str;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013872 }
Victor Stinner6c2cdae2015-10-12 13:29:43 +020013873
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030013874 if (PyUnicode_GET_LENGTH(str) != writer->pos) {
13875 PyObject *str2;
13876 str2 = resize_compact(str, writer->pos);
13877 if (str2 == NULL) {
13878 Py_DECREF(str);
13879 return NULL;
Victor Stinner6c2cdae2015-10-12 13:29:43 +020013880 }
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030013881 str = str2;
Victor Stinner6c2cdae2015-10-12 13:29:43 +020013882 }
13883
Victor Stinner15a0bd32013-07-08 22:29:55 +020013884 assert(_PyUnicode_CheckConsistency(str, 1));
13885 return unicode_result_ready(str);
Victor Stinner202fdca2012-05-07 12:47:02 +020013886}
13887
Victor Stinnerd3f08822012-05-29 12:57:52 +020013888void
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013889_PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013890{
13891 Py_CLEAR(writer->buffer);
13892}
13893
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013894#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000013895
/* Docstring attached to the "format" entry of unicode_methods. */
PyDoc_STRVAR(format__doc__,
             "S.format(*args, **kwargs) -> str\n\
\n\
Return a formatted version of S, using substitutions from args and kwargs.\n\
The substitutions are identified by braces ('{' and '}').");

/* Docstring attached to the "format_map" entry of unicode_methods. */
PyDoc_STRVAR(format_map__doc__,
             "S.format_map(mapping) -> str\n\
\n\
Return a formatted version of S, using substitutions from mapping.\n\
The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000013907
INADA Naoki3ae20562017-01-16 20:41:20 +090013908/*[clinic input]
13909str.__format__ as unicode___format__
13910
13911 format_spec: unicode
13912 /
13913
13914Return a formatted version of the string as described by format_spec.
13915[clinic start generated code]*/
13916
Eric Smith4a7d76d2008-05-30 18:10:19 +000013917static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090013918unicode___format___impl(PyObject *self, PyObject *format_spec)
INADA Naoki15f94592017-01-16 21:49:13 +090013919/*[clinic end generated code: output=45fceaca6d2ba4c8 input=5e135645d167a214]*/
Eric Smith4a7d76d2008-05-30 18:10:19 +000013920{
Victor Stinnerd3f08822012-05-29 12:57:52 +020013921 _PyUnicodeWriter writer;
13922 int ret;
Eric Smith4a7d76d2008-05-30 18:10:19 +000013923
Victor Stinnerd3f08822012-05-29 12:57:52 +020013924 if (PyUnicode_READY(self) == -1)
13925 return NULL;
Victor Stinner8f674cc2013-04-17 23:02:17 +020013926 _PyUnicodeWriter_Init(&writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013927 ret = _PyUnicode_FormatAdvancedWriter(&writer,
13928 self, format_spec, 0,
13929 PyUnicode_GET_LENGTH(format_spec));
13930 if (ret == -1) {
13931 _PyUnicodeWriter_Dealloc(&writer);
13932 return NULL;
13933 }
13934 return _PyUnicodeWriter_Finish(&writer);
Eric Smith4a7d76d2008-05-30 18:10:19 +000013935}
13936
INADA Naoki3ae20562017-01-16 20:41:20 +090013937/*[clinic input]
13938str.__sizeof__ as unicode_sizeof
13939
13940Return the size of the string in memory, in bytes.
13941[clinic start generated code]*/
Eric Smith8c663262007-08-25 02:26:07 +000013942
13943static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090013944unicode_sizeof_impl(PyObject *self)
13945/*[clinic end generated code: output=6dbc2f5a408b6d4f input=6dd011c108e33fb0]*/
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013946{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013947 Py_ssize_t size;
13948
13949 /* If it's a compact object, account for base structure +
13950 character data. */
INADA Naoki3ae20562017-01-16 20:41:20 +090013951 if (PyUnicode_IS_COMPACT_ASCII(self))
13952 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(self) + 1;
13953 else if (PyUnicode_IS_COMPACT(self))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013954 size = sizeof(PyCompactUnicodeObject) +
INADA Naoki3ae20562017-01-16 20:41:20 +090013955 (PyUnicode_GET_LENGTH(self) + 1) * PyUnicode_KIND(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013956 else {
13957 /* If it is a two-block object, account for base object, and
13958 for character block if present. */
13959 size = sizeof(PyUnicodeObject);
INADA Naoki3ae20562017-01-16 20:41:20 +090013960 if (_PyUnicode_DATA_ANY(self))
13961 size += (PyUnicode_GET_LENGTH(self) + 1) *
13962 PyUnicode_KIND(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013963 }
13964 /* If the wstr pointer is present, account for it unless it is shared
Victor Stinnera3be6132011-10-03 02:16:37 +020013965 with the data pointer. Check if the data is not shared. */
INADA Naoki3ae20562017-01-16 20:41:20 +090013966 if (_PyUnicode_HAS_WSTR_MEMORY(self))
13967 size += (PyUnicode_WSTR_LENGTH(self) + 1) * sizeof(wchar_t);
13968 if (_PyUnicode_HAS_UTF8_MEMORY(self))
13969 size += PyUnicode_UTF8_LENGTH(self) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013970
13971 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013972}
13973
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013974static PyObject *
Siddhesh Poyarekar55edd0c2018-04-30 00:29:33 +053013975unicode_getnewargs(PyObject *v, PyObject *Py_UNUSED(ignored))
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013976{
Victor Stinnerbf6e5602011-12-12 01:53:47 +010013977 PyObject *copy = _PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013978 if (!copy)
13979 return NULL;
13980 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013981}
13982
/* Method table of the str type.  The UNICODE_*_METHODDEF entries are
   macros defined elsewhere (presumably generated by Argument Clinic);
   the remaining entries are spelled out by hand.  The table ends with a
   NULL sentinel entry. */
static PyMethodDef unicode_methods[] = {
    UNICODE_ENCODE_METHODDEF
    UNICODE_REPLACE_METHODDEF
    UNICODE_SPLIT_METHODDEF
    UNICODE_RSPLIT_METHODDEF
    UNICODE_JOIN_METHODDEF
    UNICODE_CAPITALIZE_METHODDEF
    UNICODE_CASEFOLD_METHODDEF
    UNICODE_TITLE_METHODDEF
    UNICODE_CENTER_METHODDEF
    {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
    UNICODE_EXPANDTABS_METHODDEF
    {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
    UNICODE_PARTITION_METHODDEF
    {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
    UNICODE_LJUST_METHODDEF
    UNICODE_LOWER_METHODDEF
    UNICODE_LSTRIP_METHODDEF
    {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
    {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
    UNICODE_RJUST_METHODDEF
    UNICODE_RSTRIP_METHODDEF
    UNICODE_RPARTITION_METHODDEF
    UNICODE_SPLITLINES_METHODDEF
    UNICODE_STRIP_METHODDEF
    UNICODE_SWAPCASE_METHODDEF
    UNICODE_TRANSLATE_METHODDEF
    UNICODE_UPPER_METHODDEF
    {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
    {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
    UNICODE_ISASCII_METHODDEF
    UNICODE_ISLOWER_METHODDEF
    UNICODE_ISUPPER_METHODDEF
    UNICODE_ISTITLE_METHODDEF
    UNICODE_ISSPACE_METHODDEF
    UNICODE_ISDECIMAL_METHODDEF
    UNICODE_ISDIGIT_METHODDEF
    UNICODE_ISNUMERIC_METHODDEF
    UNICODE_ISALPHA_METHODDEF
    UNICODE_ISALNUM_METHODDEF
    UNICODE_ISIDENTIFIER_METHODDEF
    UNICODE_ISPRINTABLE_METHODDEF
    UNICODE_ZFILL_METHODDEF
    /* "format" takes both positional and keyword arguments, hence the
       cast through void(*)(void) to PyCFunction. */
    {"format", (PyCFunction)(void(*)(void)) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
    {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
    UNICODE___FORMAT___METHODDEF
    UNICODE_MAKETRANS_METHODDEF
    UNICODE_SIZEOF_METHODDEF
#if 0
    /* These methods are just used for debugging the implementation. */
    {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
#endif

    {"__getnewargs__", unicode_getnewargs, METH_NOARGS},
    {NULL, NULL}
};
14039
Neil Schemenauerce30bc92002-11-18 16:10:18 +000014040static PyObject *
14041unicode_mod(PyObject *v, PyObject *w)
14042{
Brian Curtindfc80e32011-08-10 20:28:54 -050014043 if (!PyUnicode_Check(v))
14044 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000014045 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000014046}
14047
14048static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014049 0, /*nb_add*/
14050 0, /*nb_subtract*/
14051 0, /*nb_multiply*/
14052 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +000014053};
14054
Guido van Rossumd57fd912000-03-10 22:53:23 +000014055static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014056 (lenfunc) unicode_length, /* sq_length */
14057 PyUnicode_Concat, /* sq_concat */
14058 (ssizeargfunc) unicode_repeat, /* sq_repeat */
14059 (ssizeargfunc) unicode_getitem, /* sq_item */
14060 0, /* sq_slice */
14061 0, /* sq_ass_item */
14062 0, /* sq_ass_slice */
14063 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +000014064};
14065
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014066static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014067unicode_subscript(PyObject* self, PyObject* item)
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014068{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014069 if (PyUnicode_READY(self) == -1)
14070 return NULL;
14071
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000014072 if (PyIndex_Check(item)) {
14073 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014074 if (i == -1 && PyErr_Occurred())
14075 return NULL;
14076 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014077 i += PyUnicode_GET_LENGTH(self);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014078 return unicode_getitem(self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014079 } else if (PySlice_Check(item)) {
Zackery Spytz14514d92019-05-17 01:13:03 -060014080 Py_ssize_t start, stop, step, slicelength, i;
14081 size_t cur;
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014082 PyObject *result;
14083 void *src_data, *dest_data;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014084 int src_kind, dest_kind;
Victor Stinnerc80d6d22011-10-05 14:13:28 +020014085 Py_UCS4 ch, max_char, kind_limit;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014086
Serhiy Storchakab879fe82017-04-08 09:53:51 +030014087 if (PySlice_Unpack(item, &start, &stop, &step) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014088 return NULL;
14089 }
Serhiy Storchakab879fe82017-04-08 09:53:51 +030014090 slicelength = PySlice_AdjustIndices(PyUnicode_GET_LENGTH(self),
14091 &start, &stop, step);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014092
14093 if (slicelength <= 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020014094 _Py_RETURN_UNICODE_EMPTY();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014095 } else if (start == 0 && step == 1 &&
Victor Stinnerc4b49542011-12-11 22:44:26 +010014096 slicelength == PyUnicode_GET_LENGTH(self)) {
14097 return unicode_result_unchanged(self);
Thomas Woutersed03b412007-08-28 21:37:11 +000014098 } else if (step == 1) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014099 return PyUnicode_Substring(self,
Victor Stinner12bab6d2011-10-01 01:53:49 +020014100 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014101 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014102 /* General case */
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014103 src_kind = PyUnicode_KIND(self);
14104 src_data = PyUnicode_DATA(self);
Victor Stinner55c99112011-10-13 01:17:06 +020014105 if (!PyUnicode_IS_ASCII(self)) {
14106 kind_limit = kind_maxchar_limit(src_kind);
14107 max_char = 0;
14108 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
14109 ch = PyUnicode_READ(src_kind, src_data, cur);
14110 if (ch > max_char) {
14111 max_char = ch;
14112 if (max_char >= kind_limit)
14113 break;
14114 }
Victor Stinnerc80d6d22011-10-05 14:13:28 +020014115 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014116 }
Victor Stinner55c99112011-10-13 01:17:06 +020014117 else
14118 max_char = 127;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014119 result = PyUnicode_New(slicelength, max_char);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014120 if (result == NULL)
14121 return NULL;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014122 dest_kind = PyUnicode_KIND(result);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014123 dest_data = PyUnicode_DATA(result);
14124
14125 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014126 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
14127 PyUnicode_WRITE(dest_kind, dest_data, i, ch);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014128 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020014129 assert(_PyUnicode_CheckConsistency(result, 1));
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014130 return result;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014131 } else {
14132 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
14133 return NULL;
14134 }
14135}
14136
14137static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014138 (lenfunc)unicode_length, /* mp_length */
14139 (binaryfunc)unicode_subscript, /* mp_subscript */
14140 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014141};
14142
Guido van Rossumd57fd912000-03-10 22:53:23 +000014143
Guido van Rossumd57fd912000-03-10 22:53:23 +000014144/* Helpers for PyUnicode_Format() */
14145
/* State carried through the PyUnicode_Format() helpers. */
struct unicode_formatter_t {
    /* Argument source read by unicode_format_getnextarg(): a tuple indexed
       by argidx when arglen >= 0, or a single object returned as is when
       arglen < 0. */
    PyObject *args;
    int args_owned;             /* presumably: whether args must be released
                                   by the formatter -- TODO confirm */
    Py_ssize_t arglen, argidx;  /* argument count and index of the next
                                   argument to hand out */
    PyObject *dict;             /* presumably the mapping used for %(name)s
                                   lookups -- TODO confirm */

    /* Format string being processed.  NOTE(review): the exact meaning of
       fmtcnt/fmtpos is not visible in this part of the file. */
    enum PyUnicode_Kind fmtkind;
    Py_ssize_t fmtcnt, fmtpos;
    void *fmtdata;
    PyObject *fmtstr;

    /* Output accumulator. */
    _PyUnicodeWriter writer;
};
14159
/* Description of one conversion specifier of a format string. */
struct unicode_format_arg_t {
    Py_UCS4 ch;         /* conversion character; formatfloat() passes it on
                           as the format code */
    int flags;          /* bitmask of F_* flags (formatfloat() tests F_ALT) */
    Py_ssize_t width;   /* presumably the minimum field width
                           -- TODO confirm */
    int prec;           /* precision; a negative value makes formatfloat()
                           use 6 */
    int sign;           /* NOTE(review): meaning not visible in this part
                           of the file */
};
14167
Guido van Rossumd57fd912000-03-10 22:53:23 +000014168static PyObject *
Victor Stinnera47082312012-10-04 02:19:54 +020014169unicode_format_getnextarg(struct unicode_formatter_t *ctx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014170{
Victor Stinnera47082312012-10-04 02:19:54 +020014171 Py_ssize_t argidx = ctx->argidx;
14172
14173 if (argidx < ctx->arglen) {
14174 ctx->argidx++;
14175 if (ctx->arglen < 0)
14176 return ctx->args;
Benjamin Peterson29060642009-01-31 22:14:21 +000014177 else
Victor Stinnera47082312012-10-04 02:19:54 +020014178 return PyTuple_GetItem(ctx->args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014179 }
14180 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000014181 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000014182 return NULL;
14183}
14184
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014185/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000014186
Victor Stinnera47082312012-10-04 02:19:54 +020014187/* Format a float into the writer if the writer is not NULL, or into *p_output
14188 otherwise.
14189
14190 Return 0 on success, raise an exception and return -1 on error. */
Victor Stinnerd3f08822012-05-29 12:57:52 +020014191static int
Victor Stinnera47082312012-10-04 02:19:54 +020014192formatfloat(PyObject *v, struct unicode_format_arg_t *arg,
14193 PyObject **p_output,
14194 _PyUnicodeWriter *writer)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014195{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014196 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014197 double x;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014198 Py_ssize_t len;
Victor Stinnera47082312012-10-04 02:19:54 +020014199 int prec;
14200 int dtoa_flags;
Tim Petersced69f82003-09-16 20:30:58 +000014201
Guido van Rossumd57fd912000-03-10 22:53:23 +000014202 x = PyFloat_AsDouble(v);
14203 if (x == -1.0 && PyErr_Occurred())
Victor Stinnerd3f08822012-05-29 12:57:52 +020014204 return -1;
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014205
Victor Stinnera47082312012-10-04 02:19:54 +020014206 prec = arg->prec;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014207 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000014208 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000014209
Victor Stinnera47082312012-10-04 02:19:54 +020014210 if (arg->flags & F_ALT)
14211 dtoa_flags = Py_DTSF_ALT;
14212 else
14213 dtoa_flags = 0;
14214 p = PyOS_double_to_string(x, arg->ch, prec, dtoa_flags, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014215 if (p == NULL)
Victor Stinnerd3f08822012-05-29 12:57:52 +020014216 return -1;
14217 len = strlen(p);
14218 if (writer) {
Victor Stinner4a587072013-11-19 12:54:53 +010014219 if (_PyUnicodeWriter_WriteASCIIString(writer, p, len) < 0) {
Christian Heimesf4f99392012-09-10 11:48:41 +020014220 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020014221 return -1;
Christian Heimesf4f99392012-09-10 11:48:41 +020014222 }
Victor Stinnerd3f08822012-05-29 12:57:52 +020014223 }
14224 else
14225 *p_output = _PyUnicode_FromASCII(p, len);
Eric Smith0923d1d2009-04-16 20:16:10 +000014226 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020014227 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014228}
14229
/* _PyUnicode_FormatLong() emulates the format codes d, i, u, o, x and X,
 * and the F_ALT flag, for Python's long (unbounded) ints.  It's not used for
 * Python's regular ints.
 * Return value:  a new PyUnicodeObject*, or NULL if error.
 *     The output string is of the form
 *         "-"? ("0o" | "0x" | "0X")? digit+
 *     The base marker ("0o", "0x" or "0X") is present only for o, x and X
 *         conversions, and only when alt is nonzero (F_ALT).  The case of
 *         hex digits will be correct.
 *     There will be at least prec digits, zero-filled on the left if
 *         necessary to get that many.
 * val          object to be converted
 * alt          nonzero if the F_ALT flag is set (keep the base marker)
 * prec         minimum number of digits; 0-fill on left if needed
 * type         a character in [diuoxX]; i and u act the same as d
 *
 * CAUTION:  o, x and X conversions on regular ints can never
 * produce a '-' sign, but can for Python's unbounded ints.
 */
Ethan Furmanb95b5612015-01-23 20:05:18 -080014248PyObject *
14249_PyUnicode_FormatLong(PyObject *val, int alt, int prec, int type)
Tim Peters38fd5b62000-09-21 05:43:11 +000014250{
Victor Stinnerd0880d52012-04-27 23:40:13 +020014251 PyObject *result = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014252 char *buf;
Victor Stinnerd0880d52012-04-27 23:40:13 +020014253 Py_ssize_t i;
14254 int sign; /* 1 if '-', else 0 */
14255 int len; /* number of characters */
14256 Py_ssize_t llen;
14257 int numdigits; /* len == numnondigits + numdigits */
14258 int numnondigits = 0;
Tim Peters38fd5b62000-09-21 05:43:11 +000014259
Victor Stinnerd0880d52012-04-27 23:40:13 +020014260 /* Avoid exceeding SSIZE_T_MAX */
14261 if (prec > INT_MAX-3) {
14262 PyErr_SetString(PyExc_OverflowError,
14263 "precision too large");
Benjamin Peterson14339b62009-01-31 16:36:08 +000014264 return NULL;
Victor Stinnerd0880d52012-04-27 23:40:13 +020014265 }
14266
14267 assert(PyLong_Check(val));
14268
14269 switch (type) {
Victor Stinner621ef3d2012-10-02 00:33:47 +020014270 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070014271 Py_UNREACHABLE();
Victor Stinnerd0880d52012-04-27 23:40:13 +020014272 case 'd':
Victor Stinner621ef3d2012-10-02 00:33:47 +020014273 case 'i':
Victor Stinnerd0880d52012-04-27 23:40:13 +020014274 case 'u':
Ethan Furmanfb137212013-08-31 10:18:55 -070014275 /* int and int subclasses should print numerically when a numeric */
14276 /* format code is used (see issue18780) */
14277 result = PyNumber_ToBase(val, 10);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014278 break;
14279 case 'o':
14280 numnondigits = 2;
14281 result = PyNumber_ToBase(val, 8);
14282 break;
14283 case 'x':
14284 case 'X':
14285 numnondigits = 2;
14286 result = PyNumber_ToBase(val, 16);
14287 break;
Victor Stinnerd0880d52012-04-27 23:40:13 +020014288 }
14289 if (!result)
14290 return NULL;
14291
14292 assert(unicode_modifiable(result));
14293 assert(PyUnicode_IS_READY(result));
14294 assert(PyUnicode_IS_ASCII(result));
14295
14296 /* To modify the string in-place, there can only be one reference. */
14297 if (Py_REFCNT(result) != 1) {
Christian Heimesd47802e2013-06-29 21:33:36 +020014298 Py_DECREF(result);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014299 PyErr_BadInternalCall();
14300 return NULL;
14301 }
14302 buf = PyUnicode_DATA(result);
14303 llen = PyUnicode_GET_LENGTH(result);
14304 if (llen > INT_MAX) {
Christian Heimesd47802e2013-06-29 21:33:36 +020014305 Py_DECREF(result);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014306 PyErr_SetString(PyExc_ValueError,
Ethan Furmanb95b5612015-01-23 20:05:18 -080014307 "string too large in _PyUnicode_FormatLong");
Victor Stinnerd0880d52012-04-27 23:40:13 +020014308 return NULL;
14309 }
14310 len = (int)llen;
14311 sign = buf[0] == '-';
14312 numnondigits += sign;
14313 numdigits = len - numnondigits;
14314 assert(numdigits > 0);
14315
14316 /* Get rid of base marker unless F_ALT */
Ethan Furmanb95b5612015-01-23 20:05:18 -080014317 if (((alt) == 0 &&
Victor Stinnerd0880d52012-04-27 23:40:13 +020014318 (type == 'o' || type == 'x' || type == 'X'))) {
14319 assert(buf[sign] == '0');
14320 assert(buf[sign+1] == 'x' || buf[sign+1] == 'X' ||
14321 buf[sign+1] == 'o');
14322 numnondigits -= 2;
14323 buf += 2;
14324 len -= 2;
14325 if (sign)
14326 buf[0] = '-';
14327 assert(len == numnondigits + numdigits);
14328 assert(numdigits > 0);
14329 }
14330
14331 /* Fill with leading zeroes to meet minimum width. */
14332 if (prec > numdigits) {
14333 PyObject *r1 = PyBytes_FromStringAndSize(NULL,
14334 numnondigits + prec);
14335 char *b1;
14336 if (!r1) {
14337 Py_DECREF(result);
14338 return NULL;
14339 }
14340 b1 = PyBytes_AS_STRING(r1);
14341 for (i = 0; i < numnondigits; ++i)
14342 *b1++ = *buf++;
14343 for (i = 0; i < prec - numdigits; i++)
14344 *b1++ = '0';
14345 for (i = 0; i < numdigits; i++)
14346 *b1++ = *buf++;
14347 *b1 = '\0';
14348 Py_DECREF(result);
14349 result = r1;
14350 buf = PyBytes_AS_STRING(result);
14351 len = numnondigits + prec;
14352 }
14353
14354 /* Fix up case for hex conversions. */
14355 if (type == 'X') {
14356 /* Need to convert all lower case letters to upper case.
14357 and need to convert 0x to 0X (and -0x to -0X). */
14358 for (i = 0; i < len; i++)
14359 if (buf[i] >= 'a' && buf[i] <= 'x')
14360 buf[i] -= 'a'-'A';
14361 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014362 if (!PyUnicode_Check(result)
14363 || buf != PyUnicode_DATA(result)) {
Victor Stinnerd0880d52012-04-27 23:40:13 +020014364 PyObject *unicode;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014365 unicode = _PyUnicode_FromASCII(buf, len);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014366 Py_DECREF(result);
14367 result = unicode;
14368 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014369 else if (len != PyUnicode_GET_LENGTH(result)) {
14370 if (PyUnicode_Resize(&result, len) < 0)
14371 Py_CLEAR(result);
14372 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000014373 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000014374}
14375
Ethan Furmandf3ed242014-01-05 06:50:30 -080014376/* Format an integer or a float as an integer.
Victor Stinner621ef3d2012-10-02 00:33:47 +020014377 * Return 1 if the number has been formatted into the writer,
Victor Stinnera47082312012-10-04 02:19:54 +020014378 * 0 if the number has been formatted into *p_output
Victor Stinner621ef3d2012-10-02 00:33:47 +020014379 * -1 and raise an exception on error */
14380static int
Victor Stinnera47082312012-10-04 02:19:54 +020014381mainformatlong(PyObject *v,
14382 struct unicode_format_arg_t *arg,
14383 PyObject **p_output,
14384 _PyUnicodeWriter *writer)
Victor Stinner621ef3d2012-10-02 00:33:47 +020014385{
14386 PyObject *iobj, *res;
Victor Stinnera47082312012-10-04 02:19:54 +020014387 char type = (char)arg->ch;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014388
14389 if (!PyNumber_Check(v))
14390 goto wrongtype;
14391
Ethan Furman9ab74802014-03-21 06:38:46 -070014392 /* make sure number is a type of integer for o, x, and X */
Victor Stinner621ef3d2012-10-02 00:33:47 +020014393 if (!PyLong_Check(v)) {
Ethan Furmandf3ed242014-01-05 06:50:30 -080014394 if (type == 'o' || type == 'x' || type == 'X') {
14395 iobj = PyNumber_Index(v);
14396 if (iobj == NULL) {
Ethan Furman9ab74802014-03-21 06:38:46 -070014397 if (PyErr_ExceptionMatches(PyExc_TypeError))
14398 goto wrongtype;
Ethan Furman38d872e2014-03-19 08:38:52 -070014399 return -1;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014400 }
14401 }
14402 else {
14403 iobj = PyNumber_Long(v);
14404 if (iobj == NULL ) {
14405 if (PyErr_ExceptionMatches(PyExc_TypeError))
14406 goto wrongtype;
14407 return -1;
14408 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014409 }
14410 assert(PyLong_Check(iobj));
14411 }
14412 else {
14413 iobj = v;
14414 Py_INCREF(iobj);
14415 }
14416
14417 if (PyLong_CheckExact(v)
Victor Stinnera47082312012-10-04 02:19:54 +020014418 && arg->width == -1 && arg->prec == -1
14419 && !(arg->flags & (F_SIGN | F_BLANK))
14420 && type != 'X')
Victor Stinner621ef3d2012-10-02 00:33:47 +020014421 {
14422 /* Fast path */
Victor Stinnera47082312012-10-04 02:19:54 +020014423 int alternate = arg->flags & F_ALT;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014424 int base;
14425
Victor Stinnera47082312012-10-04 02:19:54 +020014426 switch(type)
Victor Stinner621ef3d2012-10-02 00:33:47 +020014427 {
14428 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070014429 Py_UNREACHABLE();
Victor Stinner621ef3d2012-10-02 00:33:47 +020014430 case 'd':
14431 case 'i':
14432 case 'u':
14433 base = 10;
14434 break;
14435 case 'o':
14436 base = 8;
14437 break;
14438 case 'x':
14439 case 'X':
14440 base = 16;
14441 break;
14442 }
14443
Victor Stinnerc89d28f2012-10-02 12:54:07 +020014444 if (_PyLong_FormatWriter(writer, v, base, alternate) == -1) {
14445 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014446 return -1;
Victor Stinnerc89d28f2012-10-02 12:54:07 +020014447 }
14448 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014449 return 1;
14450 }
14451
Ethan Furmanb95b5612015-01-23 20:05:18 -080014452 res = _PyUnicode_FormatLong(iobj, arg->flags & F_ALT, arg->prec, type);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014453 Py_DECREF(iobj);
14454 if (res == NULL)
14455 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014456 *p_output = res;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014457 return 0;
14458
14459wrongtype:
Ethan Furman9ab74802014-03-21 06:38:46 -070014460 switch(type)
14461 {
14462 case 'o':
14463 case 'x':
14464 case 'X':
14465 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020014466 "%%%c format: an integer is required, "
14467 "not %.200s",
14468 type, Py_TYPE(v)->tp_name);
Ethan Furman9ab74802014-03-21 06:38:46 -070014469 break;
14470 default:
14471 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020014472 "%%%c format: a number is required, "
14473 "not %.200s",
14474 type, Py_TYPE(v)->tp_name);
Ethan Furman9ab74802014-03-21 06:38:46 -070014475 break;
14476 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014477 return -1;
14478}
14479
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014480static Py_UCS4
14481formatchar(PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014482{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000014483 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000014484 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014485 if (PyUnicode_GET_LENGTH(v) == 1) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014486 return PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000014487 }
Benjamin Peterson29060642009-01-31 22:14:21 +000014488 goto onError;
14489 }
14490 else {
Ethan Furmandf3ed242014-01-05 06:50:30 -080014491 PyObject *iobj;
Benjamin Peterson29060642009-01-31 22:14:21 +000014492 long x;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014493 /* make sure number is a type of integer */
14494 if (!PyLong_Check(v)) {
14495 iobj = PyNumber_Index(v);
14496 if (iobj == NULL) {
Ethan Furman38d872e2014-03-19 08:38:52 -070014497 goto onError;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014498 }
Xiang Zhangea1cf872016-12-22 15:30:47 +080014499 x = PyLong_AsLong(iobj);
Ethan Furmandf3ed242014-01-05 06:50:30 -080014500 Py_DECREF(iobj);
14501 }
Xiang Zhangea1cf872016-12-22 15:30:47 +080014502 else {
14503 x = PyLong_AsLong(v);
14504 }
Benjamin Peterson29060642009-01-31 22:14:21 +000014505 if (x == -1 && PyErr_Occurred())
14506 goto onError;
14507
Victor Stinner8faf8212011-12-08 22:14:11 +010014508 if (x < 0 || x > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014509 PyErr_SetString(PyExc_OverflowError,
14510 "%c arg not in range(0x110000)");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014511 return (Py_UCS4) -1;
Benjamin Peterson29060642009-01-31 22:14:21 +000014512 }
14513
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014514 return (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014515 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000014516
Benjamin Peterson29060642009-01-31 22:14:21 +000014517 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000014518 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000014519 "%c requires int or char");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014520 return (Py_UCS4) -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014521}
14522
Victor Stinnera47082312012-10-04 02:19:54 +020014523/* Parse options of an argument: flags, width, precision.
14524 Handle also "%(name)" syntax.
14525
14526 Return 0 if the argument has been formatted into arg->str.
14527 Return 1 if the argument has been written into ctx->writer,
14528 Raise an exception and return -1 on error. */
14529static int
14530unicode_format_arg_parse(struct unicode_formatter_t *ctx,
14531 struct unicode_format_arg_t *arg)
14532{
14533#define FORMAT_READ(ctx) \
14534 PyUnicode_READ((ctx)->fmtkind, (ctx)->fmtdata, (ctx)->fmtpos)
14535
14536 PyObject *v;
14537
Victor Stinnera47082312012-10-04 02:19:54 +020014538 if (arg->ch == '(') {
14539 /* Get argument value from a dictionary. Example: "%(name)s". */
14540 Py_ssize_t keystart;
14541 Py_ssize_t keylen;
14542 PyObject *key;
14543 int pcount = 1;
14544
14545 if (ctx->dict == NULL) {
14546 PyErr_SetString(PyExc_TypeError,
14547 "format requires a mapping");
14548 return -1;
14549 }
14550 ++ctx->fmtpos;
14551 --ctx->fmtcnt;
14552 keystart = ctx->fmtpos;
14553 /* Skip over balanced parentheses */
14554 while (pcount > 0 && --ctx->fmtcnt >= 0) {
14555 arg->ch = FORMAT_READ(ctx);
14556 if (arg->ch == ')')
14557 --pcount;
14558 else if (arg->ch == '(')
14559 ++pcount;
14560 ctx->fmtpos++;
14561 }
14562 keylen = ctx->fmtpos - keystart - 1;
14563 if (ctx->fmtcnt < 0 || pcount > 0) {
14564 PyErr_SetString(PyExc_ValueError,
14565 "incomplete format key");
14566 return -1;
14567 }
14568 key = PyUnicode_Substring(ctx->fmtstr,
14569 keystart, keystart + keylen);
14570 if (key == NULL)
14571 return -1;
14572 if (ctx->args_owned) {
Victor Stinnera47082312012-10-04 02:19:54 +020014573 ctx->args_owned = 0;
Serhiy Storchaka191321d2015-12-27 15:41:34 +020014574 Py_DECREF(ctx->args);
Victor Stinnera47082312012-10-04 02:19:54 +020014575 }
14576 ctx->args = PyObject_GetItem(ctx->dict, key);
14577 Py_DECREF(key);
14578 if (ctx->args == NULL)
14579 return -1;
14580 ctx->args_owned = 1;
14581 ctx->arglen = -1;
14582 ctx->argidx = -2;
14583 }
14584
14585 /* Parse flags. Example: "%+i" => flags=F_SIGN. */
Victor Stinnera47082312012-10-04 02:19:54 +020014586 while (--ctx->fmtcnt >= 0) {
14587 arg->ch = FORMAT_READ(ctx);
14588 ctx->fmtpos++;
14589 switch (arg->ch) {
14590 case '-': arg->flags |= F_LJUST; continue;
14591 case '+': arg->flags |= F_SIGN; continue;
14592 case ' ': arg->flags |= F_BLANK; continue;
14593 case '#': arg->flags |= F_ALT; continue;
14594 case '0': arg->flags |= F_ZERO; continue;
14595 }
14596 break;
14597 }
14598
14599 /* Parse width. Example: "%10s" => width=10 */
Victor Stinnera47082312012-10-04 02:19:54 +020014600 if (arg->ch == '*') {
14601 v = unicode_format_getnextarg(ctx);
14602 if (v == NULL)
14603 return -1;
14604 if (!PyLong_Check(v)) {
14605 PyErr_SetString(PyExc_TypeError,
14606 "* wants int");
14607 return -1;
14608 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020014609 arg->width = PyLong_AsSsize_t(v);
Victor Stinnera47082312012-10-04 02:19:54 +020014610 if (arg->width == -1 && PyErr_Occurred())
14611 return -1;
14612 if (arg->width < 0) {
14613 arg->flags |= F_LJUST;
14614 arg->width = -arg->width;
14615 }
14616 if (--ctx->fmtcnt >= 0) {
14617 arg->ch = FORMAT_READ(ctx);
14618 ctx->fmtpos++;
14619 }
14620 }
14621 else if (arg->ch >= '0' && arg->ch <= '9') {
14622 arg->width = arg->ch - '0';
14623 while (--ctx->fmtcnt >= 0) {
14624 arg->ch = FORMAT_READ(ctx);
14625 ctx->fmtpos++;
14626 if (arg->ch < '0' || arg->ch > '9')
14627 break;
14628 /* Since arg->ch is unsigned, the RHS would end up as unsigned,
14629 mixing signed and unsigned comparison. Since arg->ch is between
14630 '0' and '9', casting to int is safe. */
14631 if (arg->width > (PY_SSIZE_T_MAX - ((int)arg->ch - '0')) / 10) {
14632 PyErr_SetString(PyExc_ValueError,
14633 "width too big");
14634 return -1;
14635 }
14636 arg->width = arg->width*10 + (arg->ch - '0');
14637 }
14638 }
14639
14640 /* Parse precision. Example: "%.3f" => prec=3 */
Victor Stinnera47082312012-10-04 02:19:54 +020014641 if (arg->ch == '.') {
14642 arg->prec = 0;
14643 if (--ctx->fmtcnt >= 0) {
14644 arg->ch = FORMAT_READ(ctx);
14645 ctx->fmtpos++;
14646 }
14647 if (arg->ch == '*') {
14648 v = unicode_format_getnextarg(ctx);
14649 if (v == NULL)
14650 return -1;
14651 if (!PyLong_Check(v)) {
14652 PyErr_SetString(PyExc_TypeError,
14653 "* wants int");
14654 return -1;
14655 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020014656 arg->prec = _PyLong_AsInt(v);
Victor Stinnera47082312012-10-04 02:19:54 +020014657 if (arg->prec == -1 && PyErr_Occurred())
14658 return -1;
14659 if (arg->prec < 0)
14660 arg->prec = 0;
14661 if (--ctx->fmtcnt >= 0) {
14662 arg->ch = FORMAT_READ(ctx);
14663 ctx->fmtpos++;
14664 }
14665 }
14666 else if (arg->ch >= '0' && arg->ch <= '9') {
14667 arg->prec = arg->ch - '0';
14668 while (--ctx->fmtcnt >= 0) {
14669 arg->ch = FORMAT_READ(ctx);
14670 ctx->fmtpos++;
14671 if (arg->ch < '0' || arg->ch > '9')
14672 break;
14673 if (arg->prec > (INT_MAX - ((int)arg->ch - '0')) / 10) {
14674 PyErr_SetString(PyExc_ValueError,
Victor Stinner3921e902012-10-06 23:05:00 +020014675 "precision too big");
Victor Stinnera47082312012-10-04 02:19:54 +020014676 return -1;
14677 }
14678 arg->prec = arg->prec*10 + (arg->ch - '0');
14679 }
14680 }
14681 }
14682
14683 /* Ignore "h", "l" and "L" format prefix (ex: "%hi" or "%ls") */
14684 if (ctx->fmtcnt >= 0) {
14685 if (arg->ch == 'h' || arg->ch == 'l' || arg->ch == 'L') {
14686 if (--ctx->fmtcnt >= 0) {
14687 arg->ch = FORMAT_READ(ctx);
14688 ctx->fmtpos++;
14689 }
14690 }
14691 }
14692 if (ctx->fmtcnt < 0) {
14693 PyErr_SetString(PyExc_ValueError,
14694 "incomplete format");
14695 return -1;
14696 }
14697 return 0;
14698
14699#undef FORMAT_READ
14700}
14701
14702/* Format one argument. Supported conversion specifiers:
14703
14704 - "s", "r", "a": any type
Ethan Furmandf3ed242014-01-05 06:50:30 -080014705 - "i", "d", "u": int or float
14706 - "o", "x", "X": int
Victor Stinnera47082312012-10-04 02:19:54 +020014707 - "e", "E", "f", "F", "g", "G": float
14708 - "c": int or str (1 character)
14709
Victor Stinner8dbd4212012-12-04 09:30:24 +010014710 When possible, the output is written directly into the Unicode writer
14711 (ctx->writer). A string is created when padding is required.
14712
Victor Stinnera47082312012-10-04 02:19:54 +020014713 Return 0 if the argument has been formatted into *p_str,
14714 1 if the argument has been written into ctx->writer,
Victor Stinner8dbd4212012-12-04 09:30:24 +010014715 -1 on error. */
Victor Stinnera47082312012-10-04 02:19:54 +020014716static int
14717unicode_format_arg_format(struct unicode_formatter_t *ctx,
14718 struct unicode_format_arg_t *arg,
14719 PyObject **p_str)
14720{
14721 PyObject *v;
14722 _PyUnicodeWriter *writer = &ctx->writer;
14723
14724 if (ctx->fmtcnt == 0)
14725 ctx->writer.overallocate = 0;
14726
Victor Stinnera47082312012-10-04 02:19:54 +020014727 v = unicode_format_getnextarg(ctx);
14728 if (v == NULL)
14729 return -1;
14730
Victor Stinnera47082312012-10-04 02:19:54 +020014731
14732 switch (arg->ch) {
Victor Stinnera47082312012-10-04 02:19:54 +020014733 case 's':
14734 case 'r':
14735 case 'a':
14736 if (PyLong_CheckExact(v) && arg->width == -1 && arg->prec == -1) {
14737 /* Fast path */
14738 if (_PyLong_FormatWriter(writer, v, 10, arg->flags & F_ALT) == -1)
14739 return -1;
14740 return 1;
14741 }
14742
14743 if (PyUnicode_CheckExact(v) && arg->ch == 's') {
14744 *p_str = v;
14745 Py_INCREF(*p_str);
14746 }
14747 else {
14748 if (arg->ch == 's')
14749 *p_str = PyObject_Str(v);
14750 else if (arg->ch == 'r')
14751 *p_str = PyObject_Repr(v);
14752 else
14753 *p_str = PyObject_ASCII(v);
14754 }
14755 break;
14756
14757 case 'i':
14758 case 'd':
14759 case 'u':
14760 case 'o':
14761 case 'x':
14762 case 'X':
14763 {
14764 int ret = mainformatlong(v, arg, p_str, writer);
14765 if (ret != 0)
14766 return ret;
14767 arg->sign = 1;
14768 break;
14769 }
14770
14771 case 'e':
14772 case 'E':
14773 case 'f':
14774 case 'F':
14775 case 'g':
14776 case 'G':
14777 if (arg->width == -1 && arg->prec == -1
14778 && !(arg->flags & (F_SIGN | F_BLANK)))
14779 {
14780 /* Fast path */
14781 if (formatfloat(v, arg, NULL, writer) == -1)
14782 return -1;
14783 return 1;
14784 }
14785
14786 arg->sign = 1;
14787 if (formatfloat(v, arg, p_str, NULL) == -1)
14788 return -1;
14789 break;
14790
14791 case 'c':
14792 {
14793 Py_UCS4 ch = formatchar(v);
14794 if (ch == (Py_UCS4) -1)
14795 return -1;
14796 if (arg->width == -1 && arg->prec == -1) {
14797 /* Fast path */
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020014798 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0)
Victor Stinnera47082312012-10-04 02:19:54 +020014799 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014800 return 1;
14801 }
14802 *p_str = PyUnicode_FromOrdinal(ch);
14803 break;
14804 }
14805
14806 default:
14807 PyErr_Format(PyExc_ValueError,
14808 "unsupported format character '%c' (0x%x) "
Victor Stinnera33bce02014-07-04 22:47:46 +020014809 "at index %zd",
Victor Stinnera47082312012-10-04 02:19:54 +020014810 (31<=arg->ch && arg->ch<=126) ? (char)arg->ch : '?',
14811 (int)arg->ch,
14812 ctx->fmtpos - 1);
14813 return -1;
14814 }
14815 if (*p_str == NULL)
14816 return -1;
14817 assert (PyUnicode_Check(*p_str));
14818 return 0;
14819}
14820
/* Write the already formatted string `str` into ctx->writer, applying the
   sign, width, precision (truncation for "s", "r" and "a"), fill character
   and alternate-form prefix described by `arg`.

   arg->width and arg->sign are modified while the field is laid out.

   Return 0 on success, -1 on error. */
static int
unicode_format_arg_output(struct unicode_formatter_t *ctx,
                          struct unicode_format_arg_t *arg,
                          PyObject *str)
{
    Py_ssize_t len;
    enum PyUnicode_Kind kind;
    void *pbuf;
    Py_ssize_t pindex;
    Py_UCS4 signchar;
    Py_ssize_t buflen;
    Py_UCS4 maxchar;
    Py_ssize_t sublen;
    _PyUnicodeWriter *writer = &ctx->writer;
    Py_UCS4 fill;

    /* Pad with spaces by default; the "0" flag only takes effect when
       arg->sign is set. */
    fill = ' ';
    if (arg->sign && arg->flags & F_ZERO)
        fill = '0';

    if (PyUnicode_READY(str) == -1)
        return -1;

    len = PyUnicode_GET_LENGTH(str);
    if ((arg->width == -1 || arg->width <= len)
        && (arg->prec == -1 || arg->prec >= len)
        && !(arg->flags & (F_SIGN | F_BLANK)))
    {
        /* Fast path: no padding, no truncation and no forced sign, so the
           string is copied unchanged. */
        if (_PyUnicodeWriter_WriteStr(writer, str) == -1)
            return -1;
        return 0;
    }

    /* Truncate the string for "s", "r" and "a" formats
       if the precision is set */
    if (arg->ch == 's' || arg->ch == 'r' || arg->ch == 'a') {
        if (arg->prec >= 0 && len > arg->prec)
            len = arg->prec;
    }

    /* Adjust sign and width */
    kind = PyUnicode_KIND(str);
    pbuf = PyUnicode_DATA(str);
    pindex = 0;
    signchar = '\0';
    if (arg->sign) {
        Py_UCS4 ch = PyUnicode_READ(kind, pbuf, pindex);
        if (ch == '-' || ch == '+') {
            /* Detach the explicit sign from the text: it is written
               separately so that padding can go between sign and digits. */
            signchar = ch;
            len--;
            pindex++;
        }
        else if (arg->flags & F_SIGN)
            signchar = '+';
        else if (arg->flags & F_BLANK)
            signchar = ' ';
        else
            /* No sign character to write at all. */
            arg->sign = 0;
    }
    /* From here on the width is never smaller than the text. */
    if (arg->width < len)
        arg->width = len;

    /* Prepare the writer */
    maxchar = writer->maxchar;
    /* The fill character only counts towards maxchar if left padding is
       actually going to be written. */
    if (!(arg->flags & F_LJUST)) {
        if (arg->sign) {
            if ((arg->width-1) > len)
                maxchar = Py_MAX(maxchar, fill);
        }
        else {
            if (arg->width > len)
                maxchar = Py_MAX(maxchar, fill);
        }
    }
    if (PyUnicode_MAX_CHAR_VALUE(str) > maxchar) {
        /* Only the part of the string that will be copied is scanned. */
        Py_UCS4 strmaxchar = _PyUnicode_FindMaxChar(str, 0, pindex+len);
        maxchar = Py_MAX(maxchar, strmaxchar);
    }

    buflen = arg->width;
    /* len excludes the sign character: reserve one more slot for it when
       the text alone already fills the width. */
    if (arg->sign && len == arg->width)
        buflen++;
    if (_PyUnicodeWriter_Prepare(writer, buflen, maxchar) == -1)
        return -1;

    /* Write the sign if needed */
    if (arg->sign) {
        /* With a non-space fill the sign goes before the padding; with
           space fill it is written after the padding, further below. */
        if (fill != ' ') {
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
            writer->pos += 1;
        }
        /* The sign uses one column of the field width. */
        if (arg->width > len)
            arg->width--;
    }

    /* Write the numeric prefix for "x", "X" and "o" formats
       if the alternate form is used.
       For example, write "0x" for the "%#x" format. */
    if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
        assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
        assert(PyUnicode_READ(kind, pbuf, pindex + 1) == arg->ch);
        if (fill != ' ') {
            /* Non-space fill: the prefix goes before the padding and is
               skipped in the source string. */
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
            writer->pos += 2;
            pindex += 2;
        }
        /* In both cases the two prefix characters no longer count in the
           remaining width and text length. */
        arg->width -= 2;
        if (arg->width < 0)
            arg->width = 0;
        len -= 2;
    }

    /* Pad left with the fill character if needed */
    if (arg->width > len && !(arg->flags & F_LJUST)) {
        sublen = arg->width - len;
        unicode_fill(writer->kind, writer->data, fill, writer->pos, sublen);
        writer->pos += sublen;
        /* Nothing is left for right padding. */
        arg->width = len;
    }

    /* If padding with spaces: write sign if needed and/or numeric prefix if
       the alternate form is used */
    if (fill == ' ') {
        if (arg->sign) {
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
            writer->pos += 1;
        }
        if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
            assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
            assert(PyUnicode_READ(kind, pbuf, pindex+1) == arg->ch);
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
            writer->pos += 2;
            pindex += 2;
        }
    }

    /* Write characters */
    if (len) {
        _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
                                      str, pindex, len);
        writer->pos += len;
    }

    /* Pad right with the fill character if needed */
    if (arg->width > len) {
        sublen = arg->width - len;
        /* Right padding always uses spaces, whatever `fill` is. */
        unicode_fill(writer->kind, writer->data, ' ', writer->pos, sublen);
        writer->pos += sublen;
    }
    return 0;
}
14975
14976/* Helper of PyUnicode_Format(): format one arg.
14977 Return 0 on success, raise an exception and return -1 on error. */
14978static int
14979unicode_format_arg(struct unicode_formatter_t *ctx)
14980{
14981 struct unicode_format_arg_t arg;
14982 PyObject *str;
14983 int ret;
14984
Victor Stinner8dbd4212012-12-04 09:30:24 +010014985 arg.ch = PyUnicode_READ(ctx->fmtkind, ctx->fmtdata, ctx->fmtpos);
Serhiy Storchaka9f8ad3f2017-03-08 05:51:19 +020014986 if (arg.ch == '%') {
14987 ctx->fmtpos++;
14988 ctx->fmtcnt--;
14989 if (_PyUnicodeWriter_WriteCharInline(&ctx->writer, '%') < 0)
14990 return -1;
14991 return 0;
14992 }
Victor Stinner8dbd4212012-12-04 09:30:24 +010014993 arg.flags = 0;
14994 arg.width = -1;
14995 arg.prec = -1;
14996 arg.sign = 0;
14997 str = NULL;
14998
Victor Stinnera47082312012-10-04 02:19:54 +020014999 ret = unicode_format_arg_parse(ctx, &arg);
15000 if (ret == -1)
15001 return -1;
15002
15003 ret = unicode_format_arg_format(ctx, &arg, &str);
15004 if (ret == -1)
15005 return -1;
15006
15007 if (ret != 1) {
15008 ret = unicode_format_arg_output(ctx, &arg, str);
15009 Py_DECREF(str);
15010 if (ret == -1)
15011 return -1;
15012 }
15013
Serhiy Storchaka9f8ad3f2017-03-08 05:51:19 +020015014 if (ctx->dict && (ctx->argidx < ctx->arglen)) {
Victor Stinnera47082312012-10-04 02:19:54 +020015015 PyErr_SetString(PyExc_TypeError,
15016 "not all arguments converted during string formatting");
15017 return -1;
15018 }
15019 return 0;
15020}
15021
Alexander Belopolsky40018472011-02-26 01:02:56 +000015022PyObject *
15023PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000015024{
Victor Stinnera47082312012-10-04 02:19:54 +020015025 struct unicode_formatter_t ctx;
Tim Petersced69f82003-09-16 20:30:58 +000015026
Guido van Rossumd57fd912000-03-10 22:53:23 +000015027 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000015028 PyErr_BadInternalCall();
15029 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015030 }
Victor Stinnera47082312012-10-04 02:19:54 +020015031
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030015032 if (ensure_unicode(format) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000015033 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030015034
15035 ctx.fmtstr = format;
Victor Stinnera47082312012-10-04 02:19:54 +020015036 ctx.fmtdata = PyUnicode_DATA(ctx.fmtstr);
15037 ctx.fmtkind = PyUnicode_KIND(ctx.fmtstr);
15038 ctx.fmtcnt = PyUnicode_GET_LENGTH(ctx.fmtstr);
15039 ctx.fmtpos = 0;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020015040
Victor Stinner8f674cc2013-04-17 23:02:17 +020015041 _PyUnicodeWriter_Init(&ctx.writer);
15042 ctx.writer.min_length = ctx.fmtcnt + 100;
15043 ctx.writer.overallocate = 1;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020015044
Guido van Rossumd57fd912000-03-10 22:53:23 +000015045 if (PyTuple_Check(args)) {
Victor Stinnera47082312012-10-04 02:19:54 +020015046 ctx.arglen = PyTuple_Size(args);
15047 ctx.argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015048 }
15049 else {
Victor Stinnera47082312012-10-04 02:19:54 +020015050 ctx.arglen = -1;
15051 ctx.argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015052 }
Victor Stinnera47082312012-10-04 02:19:54 +020015053 ctx.args_owned = 0;
Benjamin Peterson28a6cfa2012-08-28 17:55:35 -040015054 if (PyMapping_Check(args) && !PyTuple_Check(args) && !PyUnicode_Check(args))
Victor Stinnera47082312012-10-04 02:19:54 +020015055 ctx.dict = args;
15056 else
15057 ctx.dict = NULL;
15058 ctx.args = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015059
Victor Stinnera47082312012-10-04 02:19:54 +020015060 while (--ctx.fmtcnt >= 0) {
15061 if (PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
Victor Stinnercfc4c132013-04-03 01:48:39 +020015062 Py_ssize_t nonfmtpos;
Victor Stinnera47082312012-10-04 02:19:54 +020015063
15064 nonfmtpos = ctx.fmtpos++;
15065 while (ctx.fmtcnt >= 0 &&
15066 PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
15067 ctx.fmtpos++;
15068 ctx.fmtcnt--;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015069 }
Victor Stinnera47082312012-10-04 02:19:54 +020015070 if (ctx.fmtcnt < 0) {
15071 ctx.fmtpos--;
15072 ctx.writer.overallocate = 0;
Victor Stinnera0494432012-10-03 23:03:46 +020015073 }
Victor Stinneree4544c2012-05-09 22:24:08 +020015074
Victor Stinnercfc4c132013-04-03 01:48:39 +020015075 if (_PyUnicodeWriter_WriteSubstring(&ctx.writer, ctx.fmtstr,
15076 nonfmtpos, ctx.fmtpos) < 0)
15077 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015078 }
15079 else {
Victor Stinnera47082312012-10-04 02:19:54 +020015080 ctx.fmtpos++;
15081 if (unicode_format_arg(&ctx) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000015082 goto onError;
Victor Stinnera47082312012-10-04 02:19:54 +020015083 }
15084 }
Victor Stinneraff3cc62012-04-30 05:19:21 +020015085
Victor Stinnera47082312012-10-04 02:19:54 +020015086 if (ctx.argidx < ctx.arglen && !ctx.dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000015087 PyErr_SetString(PyExc_TypeError,
15088 "not all arguments converted during string formatting");
15089 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015090 }
15091
Victor Stinnera47082312012-10-04 02:19:54 +020015092 if (ctx.args_owned) {
15093 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000015094 }
Victor Stinnera47082312012-10-04 02:19:54 +020015095 return _PyUnicodeWriter_Finish(&ctx.writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +000015096
Benjamin Peterson29060642009-01-31 22:14:21 +000015097 onError:
Victor Stinnera47082312012-10-04 02:19:54 +020015098 _PyUnicodeWriter_Dealloc(&ctx.writer);
15099 if (ctx.args_owned) {
15100 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000015101 }
15102 return NULL;
15103}
15104
Jeremy Hylton938ace62002-07-17 16:30:39 +000015105static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000015106unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
15107
Tim Peters6d6c1a32001-08-02 04:15:00 +000015108static PyObject *
15109unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
15110{
Benjamin Peterson29060642009-01-31 22:14:21 +000015111 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015112 static char *kwlist[] = {"object", "encoding", "errors", 0};
15113 char *encoding = NULL;
15114 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000015115
Benjamin Peterson14339b62009-01-31 16:36:08 +000015116 if (type != &PyUnicode_Type)
15117 return unicode_subtype_new(type, args, kwds);
15118 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000015119 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000015120 return NULL;
15121 if (x == NULL)
Serhiy Storchaka678db842013-01-26 12:16:36 +020015122 _Py_RETURN_UNICODE_EMPTY();
Benjamin Peterson14339b62009-01-31 16:36:08 +000015123 if (encoding == NULL && errors == NULL)
15124 return PyObject_Str(x);
15125 else
Benjamin Peterson29060642009-01-31 22:14:21 +000015126 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000015127}
15128
Guido van Rossume023fe02001-08-30 03:12:59 +000015129static PyObject *
15130unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
15131{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015132 PyObject *unicode, *self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015133 Py_ssize_t length, char_size;
15134 int share_wstr, share_utf8;
15135 unsigned int kind;
15136 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000015137
Benjamin Peterson14339b62009-01-31 16:36:08 +000015138 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015139
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015140 unicode = unicode_new(&PyUnicode_Type, args, kwds);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015141 if (unicode == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000015142 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020015143 assert(_PyUnicode_CHECK(unicode));
Benjamin Petersonbac79492012-01-14 13:34:47 -050015144 if (PyUnicode_READY(unicode) == -1) {
Benjamin Peterson22a29702012-01-02 09:00:30 -060015145 Py_DECREF(unicode);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015146 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060015147 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015148
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015149 self = type->tp_alloc(type, 0);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015150 if (self == NULL) {
15151 Py_DECREF(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015152 return NULL;
15153 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015154 kind = PyUnicode_KIND(unicode);
15155 length = PyUnicode_GET_LENGTH(unicode);
15156
15157 _PyUnicode_LENGTH(self) = length;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020015158#ifdef Py_DEBUG
15159 _PyUnicode_HASH(self) = -1;
15160#else
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015161 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020015162#endif
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015163 _PyUnicode_STATE(self).interned = 0;
15164 _PyUnicode_STATE(self).kind = kind;
15165 _PyUnicode_STATE(self).compact = 0;
Victor Stinner3cf46372011-10-03 14:42:15 +020015166 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015167 _PyUnicode_STATE(self).ready = 1;
15168 _PyUnicode_WSTR(self) = NULL;
15169 _PyUnicode_UTF8_LENGTH(self) = 0;
15170 _PyUnicode_UTF8(self) = NULL;
15171 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020015172 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015173
15174 share_utf8 = 0;
15175 share_wstr = 0;
15176 if (kind == PyUnicode_1BYTE_KIND) {
15177 char_size = 1;
15178 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
15179 share_utf8 = 1;
15180 }
15181 else if (kind == PyUnicode_2BYTE_KIND) {
15182 char_size = 2;
15183 if (sizeof(wchar_t) == 2)
15184 share_wstr = 1;
15185 }
15186 else {
15187 assert(kind == PyUnicode_4BYTE_KIND);
15188 char_size = 4;
15189 if (sizeof(wchar_t) == 4)
15190 share_wstr = 1;
15191 }
15192
15193 /* Ensure we won't overflow the length. */
15194 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
15195 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015196 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015197 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015198 data = PyObject_MALLOC((length + 1) * char_size);
15199 if (data == NULL) {
15200 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015201 goto onError;
15202 }
15203
Victor Stinnerc3c74152011-10-02 20:39:55 +020015204 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015205 if (share_utf8) {
15206 _PyUnicode_UTF8_LENGTH(self) = length;
15207 _PyUnicode_UTF8(self) = data;
15208 }
15209 if (share_wstr) {
15210 _PyUnicode_WSTR_LENGTH(self) = length;
15211 _PyUnicode_WSTR(self) = (wchar_t *)data;
15212 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015213
Christian Heimesf051e432016-09-13 20:22:02 +020015214 memcpy(data, PyUnicode_DATA(unicode),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020015215 kind * (length + 1));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020015216 assert(_PyUnicode_CheckConsistency(self, 1));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020015217#ifdef Py_DEBUG
15218 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
15219#endif
Victor Stinnerdd18d3a2011-10-22 11:08:10 +020015220 Py_DECREF(unicode);
Victor Stinner7931d9a2011-11-04 00:22:48 +010015221 return self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015222
15223onError:
15224 Py_DECREF(unicode);
15225 Py_DECREF(self);
15226 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000015227}
15228
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000015229PyDoc_STRVAR(unicode_doc,
Chris Jerdonek83fe2e12012-10-07 14:48:36 -070015230"str(object='') -> str\n\
15231str(bytes_or_buffer[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +000015232\n\
Nick Coghlan573b1fd2012-08-16 14:13:07 +100015233Create a new string object from the given object. If encoding or\n\
15234errors is specified, then the object must expose a data buffer\n\
15235that will be decoded using the given encoding and error handler.\n\
15236Otherwise, returns the result of object.__str__() (if defined)\n\
15237or repr(object).\n\
15238encoding defaults to sys.getdefaultencoding().\n\
15239errors defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000015240
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015241static PyObject *unicode_iter(PyObject *seq);
15242
Guido van Rossumd57fd912000-03-10 22:53:23 +000015243PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +000015244 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Bupfc93bd42018-06-19 03:59:55 -050015245 "str", /* tp_name */
15246 sizeof(PyUnicodeObject), /* tp_basicsize */
15247 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +000015248 /* Slots */
Bupfc93bd42018-06-19 03:59:55 -050015249 (destructor)unicode_dealloc, /* tp_dealloc */
Jeroen Demeyer530f5062019-05-31 04:13:39 +020015250 0, /* tp_vectorcall_offset */
Bupfc93bd42018-06-19 03:59:55 -050015251 0, /* tp_getattr */
15252 0, /* tp_setattr */
Jeroen Demeyer530f5062019-05-31 04:13:39 +020015253 0, /* tp_as_async */
Bupfc93bd42018-06-19 03:59:55 -050015254 unicode_repr, /* tp_repr */
15255 &unicode_as_number, /* tp_as_number */
15256 &unicode_as_sequence, /* tp_as_sequence */
15257 &unicode_as_mapping, /* tp_as_mapping */
15258 (hashfunc) unicode_hash, /* tp_hash*/
15259 0, /* tp_call*/
15260 (reprfunc) unicode_str, /* tp_str */
15261 PyObject_GenericGetAttr, /* tp_getattro */
15262 0, /* tp_setattro */
15263 0, /* tp_as_buffer */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015264 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Bupfc93bd42018-06-19 03:59:55 -050015265 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
15266 unicode_doc, /* tp_doc */
15267 0, /* tp_traverse */
15268 0, /* tp_clear */
15269 PyUnicode_RichCompare, /* tp_richcompare */
15270 0, /* tp_weaklistoffset */
15271 unicode_iter, /* tp_iter */
15272 0, /* tp_iternext */
15273 unicode_methods, /* tp_methods */
15274 0, /* tp_members */
15275 0, /* tp_getset */
15276 &PyBaseObject_Type, /* tp_base */
15277 0, /* tp_dict */
15278 0, /* tp_descr_get */
15279 0, /* tp_descr_set */
15280 0, /* tp_dictoffset */
15281 0, /* tp_init */
15282 0, /* tp_alloc */
15283 unicode_new, /* tp_new */
15284 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +000015285};
15286
15287/* Initialize the Unicode implementation */
15288
Victor Stinner331a6a52019-05-27 16:39:22 +020015289PyStatus
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015290_PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000015291{
Thomas Wouters477c8d52006-05-27 19:21:47 +000015292 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015293 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000015294 0x000A, /* LINE FEED */
15295 0x000D, /* CARRIAGE RETURN */
15296 0x001C, /* FILE SEPARATOR */
15297 0x001D, /* GROUP SEPARATOR */
15298 0x001E, /* RECORD SEPARATOR */
15299 0x0085, /* NEXT LINE */
15300 0x2028, /* LINE SEPARATOR */
15301 0x2029, /* PARAGRAPH SEPARATOR */
15302 };
15303
Fred Drakee4315f52000-05-09 19:53:39 +000015304 /* Init the implementation */
Serhiy Storchaka678db842013-01-26 12:16:36 +020015305 _Py_INCREF_UNICODE_EMPTY();
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015306 if (!unicode_empty) {
Victor Stinner331a6a52019-05-27 16:39:22 +020015307 return _PyStatus_ERR("Can't create empty string");
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015308 }
Serhiy Storchaka678db842013-01-26 12:16:36 +020015309 Py_DECREF(unicode_empty);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000015310
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015311 if (PyType_Ready(&PyUnicode_Type) < 0) {
Victor Stinner331a6a52019-05-27 16:39:22 +020015312 return _PyStatus_ERR("Can't initialize unicode type");
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015313 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000015314
15315 /* initialize the linebreak bloom filter */
15316 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015317 PyUnicode_2BYTE_KIND, linebreak,
Victor Stinner63941882011-09-29 00:42:28 +020015318 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000015319
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015320 if (PyType_Ready(&EncodingMapType) < 0) {
Victor Stinner331a6a52019-05-27 16:39:22 +020015321 return _PyStatus_ERR("Can't initialize encoding map type");
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015322 }
15323 if (PyType_Ready(&PyFieldNameIter_Type) < 0) {
Victor Stinner331a6a52019-05-27 16:39:22 +020015324 return _PyStatus_ERR("Can't initialize field name iterator type");
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015325 }
15326 if (PyType_Ready(&PyFormatterIter_Type) < 0) {
Victor Stinner331a6a52019-05-27 16:39:22 +020015327 return _PyStatus_ERR("Can't initialize formatter iter type");
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015328 }
Victor Stinner331a6a52019-05-27 16:39:22 +020015329 return _PyStatus_OK();
Guido van Rossumd57fd912000-03-10 22:53:23 +000015330}
15331
15332/* Finalize the Unicode implementation */
15333
/* Does nothing and always returns 0. */
int
PyUnicode_ClearFreeList(void)
{
    const int freed = 0;
    return freed;
}
15339
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000015340
Walter Dörwald16807132007-05-25 13:52:07 +000015341void
15342PyUnicode_InternInPlace(PyObject **p)
15343{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020015344 PyObject *s = *p;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015345 PyObject *t;
Victor Stinner4fae54c2011-10-03 02:01:52 +020015346#ifdef Py_DEBUG
15347 assert(s != NULL);
15348 assert(_PyUnicode_CHECK(s));
15349#else
Benjamin Peterson14339b62009-01-31 16:36:08 +000015350 if (s == NULL || !PyUnicode_Check(s))
Victor Stinner4fae54c2011-10-03 02:01:52 +020015351 return;
15352#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +000015353 /* If it's a subclass, we don't really know what putting
15354 it in the interned dict might do. */
15355 if (!PyUnicode_CheckExact(s))
15356 return;
15357 if (PyUnicode_CHECK_INTERNED(s))
15358 return;
15359 if (interned == NULL) {
15360 interned = PyDict_New();
15361 if (interned == NULL) {
15362 PyErr_Clear(); /* Don't leave an exception */
15363 return;
15364 }
15365 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000015366 Py_ALLOW_RECURSION
Berker Peksagced8d4c2016-07-25 04:40:39 +030015367 t = PyDict_SetDefault(interned, s, s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015368 Py_END_ALLOW_RECURSION
Berker Peksagced8d4c2016-07-25 04:40:39 +030015369 if (t == NULL) {
15370 PyErr_Clear();
15371 return;
15372 }
15373 if (t != s) {
Victor Stinnerf0335102013-04-14 19:13:03 +020015374 Py_INCREF(t);
Serhiy Storchaka57a01d32016-04-10 18:05:40 +030015375 Py_SETREF(*p, t);
Victor Stinnerf0335102013-04-14 19:13:03 +020015376 return;
15377 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000015378 /* The two references in interned are not counted by refcnt.
15379 The deallocator will take care of this */
15380 Py_REFCNT(s) -= 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015381 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000015382}
15383
15384void
15385PyUnicode_InternImmortal(PyObject **p)
15386{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015387 PyUnicode_InternInPlace(p);
15388 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Victor Stinneraf9e4b82011-10-23 20:07:00 +020015389 _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015390 Py_INCREF(*p);
15391 }
Walter Dörwald16807132007-05-25 13:52:07 +000015392}
15393
15394PyObject *
15395PyUnicode_InternFromString(const char *cp)
15396{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015397 PyObject *s = PyUnicode_FromString(cp);
15398 if (s == NULL)
15399 return NULL;
15400 PyUnicode_InternInPlace(&s);
15401 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000015402}
15403
Victor Stinnerfecc4f22019-03-19 14:20:29 +010015404
#if defined(WITH_VALGRIND) || defined(__INSURE__)
/* Give every interned string back the references that interning took from
   its reference count, mark it as not interned, then clear and release the
   interned dict.  Only compiled for leak-detector builds. */
static void
unicode_release_interned(void)
{
    PyObject *keys;
    Py_ssize_t count;
    Py_ssize_t immortal_size = 0, mortal_size = 0;

    if (interned == NULL || !PyDict_Check(interned)) {
        return;
    }
    keys = PyDict_Keys(interned);
    if (keys == NULL || !PyList_Check(keys)) {
        PyErr_Clear();
        return;
    }

    /* Since unicode_release_interned() is intended to help a leak
       detector, interned unicode strings are not forcibly deallocated;
       rather, we give them their stolen references back, and then clear
       and DECREF the interned dict. */

    count = PyList_GET_SIZE(keys);
#ifdef INTERNED_STATS
    fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
            count);
#endif
    for (Py_ssize_t idx = 0; idx < count; idx++) {
        PyObject *item = PyList_GET_ITEM(keys, idx);
        if (PyUnicode_READY(item) == -1) {
            Py_UNREACHABLE();
        }
        switch (PyUnicode_CHECK_INTERNED(item)) {
        case SSTATE_NOT_INTERNED:
            /* XXX Shouldn't happen */
            break;
        case SSTATE_INTERNED_IMMORTAL:
            Py_REFCNT(item) += 1;
            immortal_size += PyUnicode_GET_LENGTH(item);
            break;
        case SSTATE_INTERNED_MORTAL:
            Py_REFCNT(item) += 2;
            mortal_size += PyUnicode_GET_LENGTH(item);
            break;
        default:
            Py_FatalError("Inconsistent interned string state.");
        }
        _PyUnicode_STATE(item).interned = SSTATE_NOT_INTERNED;
    }
#ifdef INTERNED_STATS
    fprintf(stderr, "total size of all interned strings: "
            "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
            "mortal/immortal\n", mortal_size, immortal_size);
#endif
    Py_DECREF(keys);
    PyDict_Clear(interned);
    Py_CLEAR(interned);
}
#endif
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015464
15465
15466/********************* Unicode Iterator **************************/
15467
15468typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015469 PyObject_HEAD
15470 Py_ssize_t it_index;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015471 PyObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015472} unicodeiterobject;
15473
15474static void
15475unicodeiter_dealloc(unicodeiterobject *it)
15476{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015477 _PyObject_GC_UNTRACK(it);
15478 Py_XDECREF(it->it_seq);
15479 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015480}
15481
15482static int
15483unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
15484{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015485 Py_VISIT(it->it_seq);
15486 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015487}
15488
15489static PyObject *
15490unicodeiter_next(unicodeiterobject *it)
15491{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015492 PyObject *seq, *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015493
Benjamin Peterson14339b62009-01-31 16:36:08 +000015494 assert(it != NULL);
15495 seq = it->it_seq;
15496 if (seq == NULL)
15497 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020015498 assert(_PyUnicode_CHECK(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015499
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015500 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
15501 int kind = PyUnicode_KIND(seq);
15502 void *data = PyUnicode_DATA(seq);
15503 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
15504 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015505 if (item != NULL)
15506 ++it->it_index;
15507 return item;
15508 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015509
Benjamin Peterson14339b62009-01-31 16:36:08 +000015510 it->it_seq = NULL;
Serhiy Storchakafbb1c5e2016-03-30 20:40:02 +030015511 Py_DECREF(seq);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015512 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015513}
15514
15515static PyObject *
Siddhesh Poyarekar55edd0c2018-04-30 00:29:33 +053015516unicodeiter_len(unicodeiterobject *it, PyObject *Py_UNUSED(ignored))
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015517{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015518 Py_ssize_t len = 0;
15519 if (it->it_seq)
Victor Stinnerc4f281e2011-10-11 22:11:42 +020015520 len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015521 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015522}
15523
15524PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
15525
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015526static PyObject *
Siddhesh Poyarekar55edd0c2018-04-30 00:29:33 +053015527unicodeiter_reduce(unicodeiterobject *it, PyObject *Py_UNUSED(ignored))
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015528{
Serhiy Storchakabb86bf42018-12-11 08:28:18 +020015529 _Py_IDENTIFIER(iter);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015530 if (it->it_seq != NULL) {
Serhiy Storchakabb86bf42018-12-11 08:28:18 +020015531 return Py_BuildValue("N(O)n", _PyEval_GetBuiltinId(&PyId_iter),
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015532 it->it_seq, it->it_index);
15533 } else {
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +020015534 PyObject *u = (PyObject *)_PyUnicode_New(0);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015535 if (u == NULL)
15536 return NULL;
Serhiy Storchakabb86bf42018-12-11 08:28:18 +020015537 return Py_BuildValue("N(N)", _PyEval_GetBuiltinId(&PyId_iter), u);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015538 }
15539}
15540
15541PyDoc_STRVAR(reduce_doc, "Return state information for pickling.");
15542
15543static PyObject *
15544unicodeiter_setstate(unicodeiterobject *it, PyObject *state)
15545{
15546 Py_ssize_t index = PyLong_AsSsize_t(state);
15547 if (index == -1 && PyErr_Occurred())
15548 return NULL;
Kristján Valur Jónsson25dded02014-03-05 13:47:57 +000015549 if (it->it_seq != NULL) {
15550 if (index < 0)
15551 index = 0;
15552 else if (index > PyUnicode_GET_LENGTH(it->it_seq))
15553 index = PyUnicode_GET_LENGTH(it->it_seq); /* iterator truncated */
15554 it->it_index = index;
15555 }
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015556 Py_RETURN_NONE;
15557}
15558
15559PyDoc_STRVAR(setstate_doc, "Set state information for unpickling.");
15560
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015561static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015562 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000015563 length_hint_doc},
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015564 {"__reduce__", (PyCFunction)unicodeiter_reduce, METH_NOARGS,
15565 reduce_doc},
15566 {"__setstate__", (PyCFunction)unicodeiter_setstate, METH_O,
15567 setstate_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000015568 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015569};
15570
15571PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015572 PyVarObject_HEAD_INIT(&PyType_Type, 0)
15573 "str_iterator", /* tp_name */
15574 sizeof(unicodeiterobject), /* tp_basicsize */
15575 0, /* tp_itemsize */
15576 /* methods */
15577 (destructor)unicodeiter_dealloc, /* tp_dealloc */
Jeroen Demeyer530f5062019-05-31 04:13:39 +020015578 0, /* tp_vectorcall_offset */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015579 0, /* tp_getattr */
15580 0, /* tp_setattr */
Jeroen Demeyer530f5062019-05-31 04:13:39 +020015581 0, /* tp_as_async */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015582 0, /* tp_repr */
15583 0, /* tp_as_number */
15584 0, /* tp_as_sequence */
15585 0, /* tp_as_mapping */
15586 0, /* tp_hash */
15587 0, /* tp_call */
15588 0, /* tp_str */
15589 PyObject_GenericGetAttr, /* tp_getattro */
15590 0, /* tp_setattro */
15591 0, /* tp_as_buffer */
15592 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
15593 0, /* tp_doc */
15594 (traverseproc)unicodeiter_traverse, /* tp_traverse */
15595 0, /* tp_clear */
15596 0, /* tp_richcompare */
15597 0, /* tp_weaklistoffset */
15598 PyObject_SelfIter, /* tp_iter */
15599 (iternextfunc)unicodeiter_next, /* tp_iternext */
15600 unicodeiter_methods, /* tp_methods */
15601 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015602};
15603
15604static PyObject *
15605unicode_iter(PyObject *seq)
15606{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015607 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015608
Benjamin Peterson14339b62009-01-31 16:36:08 +000015609 if (!PyUnicode_Check(seq)) {
15610 PyErr_BadInternalCall();
15611 return NULL;
15612 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015613 if (PyUnicode_READY(seq) == -1)
15614 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015615 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
15616 if (it == NULL)
15617 return NULL;
15618 it->it_index = 0;
15619 Py_INCREF(seq);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015620 it->it_seq = seq;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015621 _PyObject_GC_TRACK(it);
15622 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015623}
15624
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015625
/* Return the number of Py_UNICODE characters before the terminating null
   character of u (thin wrapper around wcslen()). */
size_t
Py_UNICODE_strlen(const Py_UNICODE *u)
{
    size_t length = wcslen(u);
    return length;
}
15631
/* Copy s2, including its terminating null character, to s1.
   Returns s1. */
Py_UNICODE*
Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
{
    Py_UNICODE *dst = s1;

    do {
        *dst = *s2++;
    } while (*dst++ != 0);
    return s1;
}
15639
/* Copy at most n characters from s2 to s1 and return s1.

   Copying stops after the terminating null character of s2 has been
   copied, or after n characters, whichever comes first.  As with
   strncpy(), s1 is NOT null-terminated when s2 holds n or more characters.
   Unlike strncpy(), the remainder of s1 is not padded with null
   characters.

   The previous implementation stored a character before testing n, so it
   wrote up to n + 1 characters (one character even for n == 0) and could
   overflow a destination buffer of exactly n elements. */
Py_UNICODE*
Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
{
    for (size_t i = 0; i < n; i++) {
        Py_UNICODE ch = s2[i];
        s1[i] = ch;
        if (ch == 0) {
            break;
        }
    }
    return s1;
}
15649
/* Append s2, including its terminating null character, to the end of the
   null-terminated string s1.  Returns s1. */
Py_UNICODE*
Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
{
    Py_UNICODE *dst = s1 + wcslen(s1);

    do {
        *dst = *s2++;
    } while (*dst++ != 0);
    return s1;
}
15658
/* Compare two null-terminated Py_UNICODE strings.  Returns -1, 0 or +1
   when s1 is respectively less than, equal to or greater than s2; a
   string that is a proper prefix of the other compares as less. */
int
Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
{
    for (;; s1++, s2++) {
        Py_UNICODE c1 = *s1;
        Py_UNICODE c2 = *s2;

        if (c1 == 0 || c2 == 0) {
            /* At least one string ended: the longer one is greater. */
            if (c1) {
                return 1;
            }
            if (c2) {
                return -1;
            }
            return 0;
        }
        if (c1 != c2) {
            return (c1 < c2) ? -1 : +1;
        }
    }
}
15672
/* Compare at most n characters of two null-terminated Py_UNICODE strings.
   Returns -1 or +1 at the first differing character, and 0 if the strings
   are equal over the first n characters or up to their common
   terminator. */
int
Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
{
    for (size_t i = 0; i < n; i++) {
        Py_UNICODE c1 = s1[i];
        Py_UNICODE c2 = s2[i];

        if (c1 != c2) {
            return (c1 < c2) ? -1 : +1;
        }
        if (c1 == '\0') {
            return 0;
        }
    }
    return 0;
}
15689
/* Return a pointer to the first occurrence of c in the null-terminated
   string s, or NULL if c does not occur.  The terminating null character
   itself is never matched, so searching for 0 returns NULL. */
Py_UNICODE*
Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
{
    while (*s != 0) {
        if (*s == c) {
            return (Py_UNICODE*)s;
        }
        s++;
    }
    return NULL;
}
15699
/* Return a pointer to the last occurrence of c in the null-terminated
   string s, or NULL if c does not occur.  The terminating null character
   itself is never matched. */
Py_UNICODE*
Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
{
    size_t pos = wcslen(s);

    /* Scan backwards from the last character to the first. */
    while (pos > 0) {
        pos--;
        if (s[pos] == c) {
            return (Py_UNICODE*)(s + pos);
        }
    }
    return NULL;
}
Victor Stinner331ea922010-08-10 16:37:20 +000015712
Victor Stinner71133ff2010-09-01 23:43:53 +000015713Py_UNICODE*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015714PyUnicode_AsUnicodeCopy(PyObject *unicode)
Victor Stinner71133ff2010-09-01 23:43:53 +000015715{
Victor Stinner577db2c2011-10-11 22:12:48 +020015716 Py_UNICODE *u, *copy;
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015717 Py_ssize_t len, size;
Victor Stinner71133ff2010-09-01 23:43:53 +000015718
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015719 if (!PyUnicode_Check(unicode)) {
15720 PyErr_BadArgument();
15721 return NULL;
15722 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015723 u = PyUnicode_AsUnicodeAndSize(unicode, &len);
Victor Stinner577db2c2011-10-11 22:12:48 +020015724 if (u == NULL)
15725 return NULL;
Victor Stinner71133ff2010-09-01 23:43:53 +000015726 /* Ensure we won't overflow the size. */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -070015727 if (len > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
Victor Stinner71133ff2010-09-01 23:43:53 +000015728 PyErr_NoMemory();
15729 return NULL;
15730 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015731 size = len + 1; /* copy the null character */
Victor Stinner71133ff2010-09-01 23:43:53 +000015732 size *= sizeof(Py_UNICODE);
15733 copy = PyMem_Malloc(size);
15734 if (copy == NULL) {
15735 PyErr_NoMemory();
15736 return NULL;
15737 }
Victor Stinner577db2c2011-10-11 22:12:48 +020015738 memcpy(copy, u, size);
Victor Stinner71133ff2010-09-01 23:43:53 +000015739 return copy;
15740}
Martin v. Löwis5b222132007-06-10 09:51:05 +000015741
Victor Stinnerfecc4f22019-03-19 14:20:29 +010015742
Victor Stinner709d23d2019-05-02 14:56:30 -040015743static int
15744encode_wstr_utf8(wchar_t *wstr, char **str, const char *name)
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015745{
Victor Stinner709d23d2019-05-02 14:56:30 -040015746 int res;
15747 res = _Py_EncodeUTF8Ex(wstr, str, NULL, NULL, 1, _Py_ERROR_STRICT);
15748 if (res == -2) {
15749 PyErr_Format(PyExc_RuntimeWarning, "cannot decode %s", name);
15750 return -1;
15751 }
15752 if (res < 0) {
15753 PyErr_NoMemory();
15754 return -1;
15755 }
15756 return 0;
15757}
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015758
Victor Stinner709d23d2019-05-02 14:56:30 -040015759
15760static int
15761config_get_codec_name(wchar_t **config_encoding)
15762{
15763 char *encoding;
15764 if (encode_wstr_utf8(*config_encoding, &encoding, "stdio_encoding") < 0) {
15765 return -1;
15766 }
15767
15768 PyObject *name_obj = NULL;
15769 PyObject *codec = _PyCodec_Lookup(encoding);
15770 PyMem_RawFree(encoding);
15771
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015772 if (!codec)
15773 goto error;
15774
15775 name_obj = PyObject_GetAttrString(codec, "name");
15776 Py_CLEAR(codec);
15777 if (!name_obj) {
15778 goto error;
15779 }
15780
Victor Stinner709d23d2019-05-02 14:56:30 -040015781 wchar_t *wname = PyUnicode_AsWideCharString(name_obj, NULL);
15782 Py_DECREF(name_obj);
15783 if (wname == NULL) {
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015784 goto error;
15785 }
15786
Victor Stinner709d23d2019-05-02 14:56:30 -040015787 wchar_t *raw_wname = _PyMem_RawWcsdup(wname);
15788 if (raw_wname == NULL) {
15789 PyMem_Free(wname);
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015790 PyErr_NoMemory();
Victor Stinner709d23d2019-05-02 14:56:30 -040015791 goto error;
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015792 }
Victor Stinner709d23d2019-05-02 14:56:30 -040015793
15794 PyMem_RawFree(*config_encoding);
15795 *config_encoding = raw_wname;
15796
15797 PyMem_Free(wname);
15798 return 0;
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015799
15800error:
15801 Py_XDECREF(codec);
15802 Py_XDECREF(name_obj);
Victor Stinner709d23d2019-05-02 14:56:30 -040015803 return -1;
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015804}
15805
15806
Victor Stinner331a6a52019-05-27 16:39:22 +020015807static PyStatus
Victor Stinnerfcdb0272019-09-23 14:45:47 +020015808init_stdio_encoding(PyThreadState *tstate)
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015809{
Victor Stinner709d23d2019-05-02 14:56:30 -040015810 /* Update the stdio encoding to the normalized Python codec name. */
Victor Stinnerfcdb0272019-09-23 14:45:47 +020015811 PyConfig *config = &tstate->interp->config;
Victor Stinner709d23d2019-05-02 14:56:30 -040015812 if (config_get_codec_name(&config->stdio_encoding) < 0) {
Victor Stinner331a6a52019-05-27 16:39:22 +020015813 return _PyStatus_ERR("failed to get the Python codec name "
Victor Stinnerfcdb0272019-09-23 14:45:47 +020015814 "of the stdio encoding");
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015815 }
Victor Stinner331a6a52019-05-27 16:39:22 +020015816 return _PyStatus_OK();
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015817}
15818
15819
Victor Stinner709d23d2019-05-02 14:56:30 -040015820static int
15821init_fs_codec(PyInterpreterState *interp)
15822{
Victor Stinner331a6a52019-05-27 16:39:22 +020015823 PyConfig *config = &interp->config;
Victor Stinner709d23d2019-05-02 14:56:30 -040015824
15825 _Py_error_handler error_handler;
15826 error_handler = get_error_handler_wide(config->filesystem_errors);
15827 if (error_handler == _Py_ERROR_UNKNOWN) {
15828 PyErr_SetString(PyExc_RuntimeError, "unknow filesystem error handler");
15829 return -1;
15830 }
15831
15832 char *encoding, *errors;
15833 if (encode_wstr_utf8(config->filesystem_encoding,
15834 &encoding,
15835 "filesystem_encoding") < 0) {
15836 return -1;
15837 }
15838
15839 if (encode_wstr_utf8(config->filesystem_errors,
15840 &errors,
15841 "filesystem_errors") < 0) {
15842 PyMem_RawFree(encoding);
15843 return -1;
15844 }
15845
15846 PyMem_RawFree(interp->fs_codec.encoding);
15847 interp->fs_codec.encoding = encoding;
15848 PyMem_RawFree(interp->fs_codec.errors);
15849 interp->fs_codec.errors = errors;
15850 interp->fs_codec.error_handler = error_handler;
15851
15852 /* At this point, PyUnicode_EncodeFSDefault() and
15853 PyUnicode_DecodeFSDefault() can now use the Python codec rather than
15854 the C implementation of the filesystem encoding. */
15855
15856 /* Set Py_FileSystemDefaultEncoding and Py_FileSystemDefaultEncodeErrors
15857 global configuration variables. */
15858 if (_Py_SetFileSystemEncoding(interp->fs_codec.encoding,
15859 interp->fs_codec.errors) < 0) {
15860 PyErr_NoMemory();
15861 return -1;
15862 }
15863 return 0;
15864}
15865
15866
Victor Stinner331a6a52019-05-27 16:39:22 +020015867static PyStatus
Victor Stinnerfcdb0272019-09-23 14:45:47 +020015868init_fs_encoding(PyThreadState *tstate)
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015869{
Victor Stinnerfcdb0272019-09-23 14:45:47 +020015870 PyInterpreterState *interp = tstate->interp;
15871
Victor Stinner709d23d2019-05-02 14:56:30 -040015872 /* Update the filesystem encoding to the normalized Python codec name.
15873 For example, replace "ANSI_X3.4-1968" (locale encoding) with "ascii"
15874 (Python codec name). */
Victor Stinner331a6a52019-05-27 16:39:22 +020015875 PyConfig *config = &interp->config;
Victor Stinner709d23d2019-05-02 14:56:30 -040015876 if (config_get_codec_name(&config->filesystem_encoding) < 0) {
Victor Stinnerfcdb0272019-09-23 14:45:47 +020015877 _Py_DumpPathConfig(tstate);
Victor Stinner331a6a52019-05-27 16:39:22 +020015878 return _PyStatus_ERR("failed to get the Python codec "
Victor Stinnerfcdb0272019-09-23 14:45:47 +020015879 "of the filesystem encoding");
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015880 }
15881
Victor Stinner709d23d2019-05-02 14:56:30 -040015882 if (init_fs_codec(interp) < 0) {
Victor Stinner331a6a52019-05-27 16:39:22 +020015883 return _PyStatus_ERR("cannot initialize filesystem codec");
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015884 }
Victor Stinner331a6a52019-05-27 16:39:22 +020015885 return _PyStatus_OK();
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015886}
15887
15888
Victor Stinner331a6a52019-05-27 16:39:22 +020015889PyStatus
Victor Stinnerb45d2592019-06-20 00:05:23 +020015890_PyUnicode_InitEncodings(PyThreadState *tstate)
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015891{
Victor Stinnerfcdb0272019-09-23 14:45:47 +020015892 PyStatus status = init_fs_encoding(tstate);
Victor Stinner331a6a52019-05-27 16:39:22 +020015893 if (_PyStatus_EXCEPTION(status)) {
15894 return status;
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015895 }
15896
Victor Stinnerfcdb0272019-09-23 14:45:47 +020015897 return init_stdio_encoding(tstate);
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015898}
15899
15900
#ifdef MS_WINDOWS
/* Switch the filesystem encoding of the current interpreter to
   mbcs/replace (PEP 529) and re-initialize the filesystem codec.

   Return the result of init_fs_codec() on success of the allocations.
   Return -1 with MemoryError set if a string cannot be duplicated; the
   configuration is left untouched in that case. */
int
_PyUnicode_EnableLegacyWindowsFSEncoding(void)
{
    PyInterpreterState *interp = _PyInterpreterState_GET_UNSAFE();
    PyConfig *config = &interp->config;

    /* Duplicate both names before touching the configuration. */
    wchar_t *mbcs_name = _PyMem_RawWcsdup(L"mbcs");
    wchar_t *replace_name = _PyMem_RawWcsdup(L"replace");

    if (mbcs_name != NULL && replace_name != NULL) {
        /* The configuration takes ownership of the new strings. */
        PyMem_RawFree(config->filesystem_encoding);
        config->filesystem_encoding = mbcs_name;
        PyMem_RawFree(config->filesystem_errors);
        config->filesystem_errors = replace_name;

        return init_fs_codec(interp);
    }

    /* At least one duplication failed: release whichever one succeeded. */
    PyMem_RawFree(mbcs_name);
    PyMem_RawFree(replace_name);
    PyErr_NoMemory();
    return -1;
}
#endif
15926
15927
Victor Stinnerfecc4f22019-03-19 14:20:29 +010015928void
15929_PyUnicode_Fini(void)
15930{
15931#if defined(WITH_VALGRIND) || defined(__INSURE__)
15932 /* Insure++ is a memory analysis tool that aids in discovering
15933 * memory leaks and other memory problems. On Python exit, the
15934 * interned string dictionaries are flagged as being in use at exit
15935 * (which it is). Under normal circumstances, this is fine because
15936 * the memory will be automatically reclaimed by the system. Under
15937 * memory debugging, it's a huge source of useless noise, so we
15938 * trade off slower shutdown for less distraction in the memory
15939 * reports. -baw
15940 */
15941 unicode_release_interned();
15942#endif /* __INSURE__ */
15943
15944 Py_CLEAR(unicode_empty);
15945
15946 for (Py_ssize_t i = 0; i < 256; i++) {
15947 Py_CLEAR(unicode_latin1[i]);
15948 }
15949 _PyUnicode_ClearStaticStrings();
15950 (void)PyUnicode_ClearFreeList();
Victor Stinner709d23d2019-05-02 14:56:30 -040015951
15952 PyInterpreterState *interp = _PyInterpreterState_GET_UNSAFE();
15953 PyMem_RawFree(interp->fs_codec.encoding);
15954 interp->fs_codec.encoding = NULL;
15955 PyMem_RawFree(interp->fs_codec.errors);
15956 interp->fs_codec.errors = NULL;
Victor Stinnerfecc4f22019-03-19 14:20:29 +010015957}
15958
15959
Georg Brandl66c221e2010-10-14 07:04:07 +000015960/* A _string module, to export formatter_parser and formatter_field_name_split
15961 to the string.Formatter class implemented in Python. */
15962
15963static PyMethodDef _string_methods[] = {
15964 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
15965 METH_O, PyDoc_STR("split the argument as a field name")},
15966 {"formatter_parser", (PyCFunction) formatter_parser,
15967 METH_O, PyDoc_STR("parse the argument as a format string")},
15968 {NULL, NULL}
15969};
15970
15971static struct PyModuleDef _string_module = {
15972 PyModuleDef_HEAD_INIT,
15973 "_string",
15974 PyDoc_STR("string helper module"),
15975 0,
15976 _string_methods,
15977 NULL,
15978 NULL,
15979 NULL,
15980 NULL
15981};
15982
15983PyMODINIT_FUNC
15984PyInit__string(void)
15985{
15986 return PyModule_Create(&_string_module);
15987}
15988
15989
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000015990#ifdef __cplusplus
15991}
15992#endif